From bc9f4edf47d2cbed3b1ba7a61d1497dded91ed22 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 11 Jun 2025 16:44:09 +0100 Subject: [PATCH 0001/1322] [LTO] Fix used before intialised warning (#143705) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For whatever reason I can't reproduce this locally but I can on Compiler Explorer (https://godbolt.org/z/nfv4b83q6) and on our flang gcc bot (https://lab.llvm.org/buildbot/#/builders/130/builds/13683/steps/5/logs/stdio). In file included from ../llvm-project/llvm/include/llvm/LTO/LTO.h:33, from ../llvm-project/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:29: ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In constructor ‘llvm::FunctionImporter::ImportListsTy::ImportListsTy()’: ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:275:33: warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is used uninitialized [-Wuninitialized] 275 | ImportListsTy() : EmptyList(ImportIDs) {} | ^~~~~~~~~ ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In constructor ‘llvm::FunctionImporter::ImportListsTy::ImportListsTy(size_t)’: ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:276:44: warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is used uninitialized [-Wuninitialized] 276 | ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size) {} | ^~~~~~~~~ ImportIDs was being used during construction of EmptyList, before ImportIDs itself had been constructed. 
--- llvm/include/llvm/Transforms/IPO/FunctionImport.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 65228bb65ba8..e6ae9ee831d5 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -272,8 +272,9 @@ public: // A map from destination modules to lists of imports. class ImportListsTy { public: - ImportListsTy() : EmptyList(ImportIDs) {} - ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size) {} + ImportListsTy() : ImportIDs(), EmptyList(ImportIDs) {} + ImportListsTy(size_t Size) + : ImportIDs(), EmptyList(ImportIDs), ListsImpl(Size) {} ImportMapTy &operator[](StringRef DestMod) { return ListsImpl.try_emplace(DestMod, ImportIDs).first->second; @@ -293,9 +294,9 @@ public: const_iterator end() const { return ListsImpl.end(); } private: + ImportIDTable ImportIDs; ImportMapTy EmptyList; DenseMap ListsImpl; - ImportIDTable ImportIDs; }; /// The set contains an entry for every global value that the module exports. From 91be47dccfa3480c152916838404d49107fde45c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 08:53:54 -0700 Subject: [PATCH 0002/1322] [flang] Fix warnings This patch fixes: flang/lib/Lower/OpenMP/OpenMP.cpp:3904:9: error: unused variable 'action0' [-Werror,-Wunused-variable] flang/lib/Lower/OpenMP/OpenMP.cpp:3905:9: error: unused variable 'action1' [-Werror,-Wunused-variable] --- flang/lib/Lower/OpenMP/OpenMP.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 3f3b85696db3..c13fa471978d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3911,6 +3911,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, // Capturing operation. 
assert(action0 != analysis.None && action1 != analysis.None && "Expexcing two actions"); + (void)action0; + (void)action1; captureOp = builder.create(loc, hint, memOrder); // Set the non-atomic insertion point to before the atomic.capture. From 2ab83e9f68f0c7b1a7199455d7ce05430d93fa44 Mon Sep 17 00:00:00 2001 From: Tony Varghese Date: Wed, 11 Jun 2025 21:28:26 +0530 Subject: [PATCH 0003/1322] [NFC][PowerPC] Rename xxevalPattern to adhere to naming convention. (#143675) Rename class `xxevalPattern` to adhere to naming convention listed in the coding guideline and used for all other classes in the td file. --- llvm/lib/Target/PowerPC/PPCInstrP10.td | 62 +++++++++++++------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index a7f758745efe..d295f35fb1dd 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2159,7 +2159,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; } -class xxevalPattern imm> : +class XXEvalPattern imm> : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} let Predicates = [PrefixInstrs, HasP10Vector] in { @@ -2192,83 +2192,83 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // Anonymous patterns for XXEVAL // AND // and(A, B, C) - def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; + def : XXEvalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; // and(A, xor(B, C)) - def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; + def : XXEvalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; // and(A, or(B, C)) - def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; + def : XXEvalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; // and(A, nor(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; + def : XXEvalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, 
v4i32:$vC))), 8>; // and(A, eqv(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; + def : XXEvalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; // and(A, nand(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; + def : XXEvalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; // NAND // nand(A, B, C) - def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), !sub(255, 1)>; // nand(A, xor(B, C)) - def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), !sub(255, 6)>; // nand(A, or(B, C)) - def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), !sub(255, 7)>; // nand(A, nor(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), !sub(255, 8)>; // nand(A, eqv(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), !sub(255, 9)>; // nand(A, nand(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), !sub(255, 14)>; // EQV // (eqv A, B, C) - def : xxevalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)))), 150>; // (eqv A, (and B, C)) - def : xxevalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>; + def : XXEvalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>; // (eqv A, (or B, C)) - def : xxevalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>; + def : 
XXEvalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>; // NOR // (nor A, B, C) - def : xxevalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>; + def : XXEvalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>; // (nor A, (and B, C)) - def : xxevalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>; + def : XXEvalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>; // (nor A, (eqv B, C)) - def : xxevalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>; + def : XXEvalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>; // (nor A, (nand B, C)) - def : xxevalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>; + def : XXEvalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>; // (nor A, (nor B, C)) - def : xxevalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>; + def : XXEvalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>; // (nor A, (xor B, C)) - def : xxevalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>; + def : XXEvalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>; // OR // (or A, B, C) - def : xxevalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>; + def : XXEvalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>; // (or A, (and B, C)) - def : xxevalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>; + def : XXEvalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>; // (or A, (eqv B, C)) - def : xxevalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>; + def : XXEvalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>; // (or A, (nand B, C)) - def : xxevalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>; + def : XXEvalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>; // (or A, (nor B, C)) - def : xxevalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>; + def : XXEvalPattern<(or v4i32:$vA, (vnot (or 
v4i32:$vB, v4i32:$vC))), 143>; // (or A, (xor B, C)) - def : xxevalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>; + def : XXEvalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>; // XOR // (xor A, B, C) - def : xxevalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>; + def : XXEvalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>; // (xor A, (and B, C)) - def : xxevalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>; + def : XXEvalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>; // (xor A, (or B, C)) - def : xxevalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; // Anonymous patterns to select prefixed VSX loads and stores. // Load / Store f128 From 38fb0117ab10c4541e58697a4b56de2a646cf3f4 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Wed, 11 Jun 2025 12:13:36 -0400 Subject: [PATCH 0004/1322] [libc++] Make forward_list constexpr as part of P3372R3 (#129435) Fixes #128658 --- libcxx/docs/FeatureTestMacroTable.rst | 2 + libcxx/include/__memory/allocation_guard.h | 20 +- libcxx/include/__memory/pointer_traits.h | 16 +- libcxx/include/forward_list | 469 ++++++++++-------- libcxx/include/version | 2 + .../forwardlist/compare.three_way.pass.cpp | 7 +- .../sequences/forwardlist/empty.pass.cpp | 13 +- .../forwardlist.access/front.pass.cpp | 16 +- .../forwardlist.cons/alloc.compile.fail.cpp | 13 +- .../forwardlist.cons/alloc.pass.cpp | 13 +- .../forwardlist.cons/assign_copy.pass.cpp | 13 +- .../forwardlist.cons/assign_init.pass.cpp | 13 +- .../forwardlist.cons/assign_move.pass.cpp | 13 +- .../forwardlist.cons/assign_op_init.pass.cpp | 13 +- .../forwardlist.cons/assign_range.pass.cpp | 13 +- .../assign_size_value.pass.cpp | 13 +- .../forwardlist.cons/copy.pass.cpp | 13 +- .../forwardlist.cons/copy_alloc.pass.cpp | 13 +- .../forwardlist.cons/default.pass.cpp | 13 +- .../forwardlist.cons/from_range.pass.cpp | 19 +- 
.../forwardlist.cons/init.pass.cpp | 13 +- .../forwardlist.cons/init_alloc.pass.cpp | 13 +- .../forwardlist.cons/move.pass.cpp | 13 +- .../forwardlist.cons/move_alloc.pass.cpp | 13 +- .../forwardlist.cons/range.pass.cpp | 13 +- .../forwardlist.cons/range_alloc.pass.cpp | 13 +- .../forwardlist.cons/size.pass.cpp | 4 +- .../forwardlist.cons/size_value.pass.cpp | 13 +- .../size_value_alloc.pass.cpp | 13 +- .../forwardlist.erasure/erase.pass.cpp | 18 +- .../forwardlist.erasure/erase_if.pass.cpp | 18 +- .../forwardlist.iter/before_begin.pass.cpp | 17 +- .../forwardlist.iter/iterators.pass.cpp | 27 +- .../assign_range.pass.cpp | 19 +- .../forwardlist.modifiers/clear.pass.cpp | 13 +- .../emplace_after.pass.cpp | 13 +- .../emplace_front.pass.cpp | 13 +- .../erase_after_many.pass.cpp | 13 +- .../erase_after_one.pass.cpp | 13 +- .../insert_after_const.pass.cpp | 13 +- .../insert_after_init.pass.cpp | 13 +- .../insert_after_range.pass.cpp | 13 +- .../insert_after_rv.pass.cpp | 13 +- .../insert_after_size_value.pass.cpp | 13 +- .../insert_range_after.pass.cpp | 23 +- .../forwardlist.modifiers/pop_front.pass.cpp | 13 +- .../prepend_range.pass.cpp | 19 +- .../push_front_const.pass.cpp | 13 +- .../push_front_exception_safety.pass.cpp | 2 +- .../push_front_rv.pass.cpp | 13 +- .../resize_size.pass.cpp | 17 +- .../resize_size_value.pass.cpp | 15 +- .../forwardlist.ops/merge_lvalue.pass.cpp | 17 +- .../merge_lvalue_pred.pass.cpp | 17 +- .../forwardlist.ops/merge_rvalue.pass.cpp | 17 +- .../merge_rvalue_pred.pass.cpp | 17 +- .../forwardlist.ops/remove.pass.cpp | 27 +- .../forwardlist.ops/remove_if.pass.cpp | 25 +- .../forwardlist.ops/reverse.pass.cpp | 19 +- .../splice_after_flist.pass.cpp | 23 +- .../forwardlist.ops/splice_after_one.pass.cpp | 25 +- .../splice_after_range.pass.cpp | 27 +- .../forwardlist.ops/unique.pass.cpp | 15 +- .../forwardlist.ops/unique_pred.pass.cpp | 25 +- .../forwardlist.spec/equal.pass.cpp | 17 +- .../forwardlist.spec/member_swap.pass.cpp | 13 +- 
.../forwardlist.spec/non_member_swap.pass.cpp | 13 +- .../forwardlist.spec/relational.pass.cpp | 21 +- .../swap_noexcept.compile.pass.cpp | 4 +- .../forwardlist/get_allocator.pass.cpp | 13 +- .../sequences/forwardlist/incomplete.pass.cpp | 17 +- .../sequences/forwardlist/max_size.pass.cpp | 13 +- .../forward_list.version.compile.pass.cpp | 27 + .../version.version.compile.pass.cpp | 27 + libcxx/test/support/counting_predicates.h | 58 +-- .../generate_feature_test_macro_components.py | 5 + 76 files changed, 1184 insertions(+), 457 deletions(-) mode change 100755 => 100644 libcxx/utils/generate_feature_test_macro_components.py diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index a89d4038785c..3e6fd643f620 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -420,6 +420,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_algorithms`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_forward_list`` ``202502L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_new`` ``202406L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_queue`` ``202502L`` diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h index 66edcd92ed61..016e1a3a429b 100644 --- a/libcxx/include/__memory/allocation_guard.h +++ b/libcxx/include/__memory/allocation_guard.h @@ -49,24 +49,26 @@ struct __allocation_guard { using _Size _LIBCPP_NODEBUG = typename allocator_traits<_Alloc>::size_type; template // we perform the allocator conversion inside the constructor - _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n) : 
__alloc_(std::move(__alloc)), __n_(__n), __ptr_(allocator_traits<_Alloc>::allocate(__alloc_, __n_)) // initialization order is important {} - _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); } - _LIBCPP_HIDE_FROM_ABI __allocation_guard(const __allocation_guard&) = delete; - _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT + __allocation_guard(const __allocation_guard&) = delete; + __allocation_guard& operator=(const __allocation_guard& __other) = delete; + + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT : __alloc_(std::move(__other.__alloc_)), __n_(__other.__n_), __ptr_(__other.__ptr_) { __other.__ptr_ = nullptr; } - _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(const __allocation_guard& __other) = delete; - _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(__allocation_guard&& __other) _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard& + operator=(__allocation_guard&& __other) _NOEXCEPT { if (std::addressof(__other) != this) { __destroy(); @@ -79,17 +81,17 @@ struct __allocation_guard { return *this; } - _LIBCPP_HIDE_FROM_ABI _Pointer + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer __release_ptr() _NOEXCEPT { // not called __release() because it's a keyword in objective-c++ _Pointer __tmp = __ptr_; __ptr_ = nullptr; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; } private: - _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT { if (__ptr_ != nullptr) { allocator_traits<_Alloc>::deallocate(__alloc_, __ptr_, __n_); } diff --git 
a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h index 4ba50898fb37..879b387b9ad1 100644 --- a/libcxx/include/__memory/pointer_traits.h +++ b/libcxx/include/__memory/pointer_traits.h @@ -245,8 +245,8 @@ inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(_Tp* __p) noexcept { } template -inline _LIBCPP_HIDE_FROM_ABI constexpr auto -to_address(const _Pointer& __p) noexcept -> decltype(std::__to_address(__p)) { +inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(const _Pointer& __p) noexcept + -> decltype(std::__to_address(__p)) { return std::__to_address(__p); } #endif @@ -302,6 +302,18 @@ concept __resettable_smart_pointer_with_args = requires(_Smart __s, _Pointer __p #endif +// This function ensures safe conversions between fancy pointers at compile-time, where we avoid casts from/to +// `__void_pointer` by obtaining the underlying raw pointer from the fancy pointer using `std::to_address`, +// then dereferencing it to retrieve the pointed-to object, and finally constructing the target fancy pointer +// to that object using the `std::pointer_traits<>::pointer_to` function. +template +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _PtrTo __static_fancy_pointer_cast(const _PtrFrom& __p) { + using __ptr_traits = pointer_traits<_PtrTo>; + using __element_type = typename __ptr_traits::element_type; + return __p ? 
__ptr_traits::pointer_to(*static_cast<__element_type*>(std::addressof(*__p))) + : static_cast<_PtrTo>(nullptr); +} + _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 5046de27a9da..e9b2c860b89c 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -295,8 +295,8 @@ struct __forward_node_traits { "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic."); # endif - _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) { - return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__p)); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) { + return std::__static_fancy_pointer_cast<__begin_node_pointer>(__p); } }; @@ -307,11 +307,11 @@ struct __forward_begin_node { pointer __next_; - _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {} - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const { - return static_cast<__begin_node_pointer>(__next_); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const { + return std::__static_fancy_pointer_cast<__begin_node_pointer>(__next_); } }; @@ -335,7 +335,7 @@ private: }; public: - _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; } # else private: @@ -345,8 +345,8 @@ public: _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_)); } # endif - _LIBCPP_HIDE_FROM_ABI explicit 
__forward_list_node(_NodePtr __next) : _Base(__next) {} - _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {} }; template > @@ -357,24 +357,26 @@ class __forward_list_const_iterator; template class __forward_list_iterator { typedef __forward_node_traits<_NodePtr> __traits; + typedef typename __traits::__node_type __node_type; + typedef typename __traits::__begin_node __begin_node_type; typedef typename __traits::__node_pointer __node_pointer; typedef typename __traits::__begin_node_pointer __begin_node_pointer; typedef typename __traits::__void_pointer __void_pointer; __begin_node_pointer __ptr_; - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { - return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_)); - } - _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { - return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_)); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { + return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_); } - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT + : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT : 
__ptr_(__traits::__as_iter_node(__p)) {} template @@ -389,27 +391,31 @@ public: typedef typename pointer_traits<__node_pointer>::difference_type difference_type; typedef __rebind_pointer_t<__node_pointer, value_type> pointer; - _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); } - _LIBCPP_HIDE_FROM_ABI pointer operator->() const { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const { + return __get_unsafe_node_pointer()->__get_value(); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__get_unsafe_node_pointer()->__get_value()); } - _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() { __ptr_ = __traits::__as_iter_node(__ptr_->__next_); return *this; } - _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) { __forward_list_iterator __t(*this); ++(*this); return __t; } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool + operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { return __x.__ptr_ == __y.__ptr_; } - friend _LIBCPP_HIDE_FROM_ABI bool operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool + operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { return !(__x == __y); } }; @@ -421,23 +427,25 @@ class __forward_list_const_iterator 
{ typedef __forward_node_traits<_NodePtr> __traits; typedef typename __traits::__node_type __node_type; + typedef typename __traits::__begin_node __begin_node_type; typedef typename __traits::__node_pointer __node_pointer; typedef typename __traits::__begin_node_pointer __begin_node_pointer; typedef typename __traits::__void_pointer __void_pointer; __begin_node_pointer __ptr_; - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { - return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_)); - } - _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { - return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_)); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { + return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_); } - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT + : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__node_pointer __p) _NOEXCEPT : __ptr_(__traits::__as_iter_node(__p)) {} @@ -451,30 +459,32 @@ public: typedef typename pointer_traits<__node_pointer>::difference_type difference_type; typedef __rebind_pointer_t<__node_pointer, const value_type> pointer; - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT - : __ptr_(__p.__ptr_) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI 
__forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT : __ptr_(__p.__ptr_) {} - _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); } - _LIBCPP_HIDE_FROM_ABI pointer operator->() const { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const { + return __get_unsafe_node_pointer()->__get_value(); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__get_unsafe_node_pointer()->__get_value()); } - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() { __ptr_ = __traits::__as_iter_node(__ptr_->__next_); return *this; } - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) { __forward_list_const_iterator __t(*this); ++(*this); return __t; } - friend _LIBCPP_HIDE_FROM_ABI bool + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool operator==(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) { return __x.__ptr_ == __y.__ptr_; } - friend _LIBCPP_HIDE_FROM_ABI bool + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool operator!=(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) { return !(__x == __y); } @@ -498,48 +508,53 @@ protected: _LIBCPP_COMPRESSED_PAIR(__begin_node, __before_begin_, __node_allocator, __alloc_); - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT { return pointer_traits<__begin_node_pointer>::pointer_to(__before_begin_); } 
- _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT { - return pointer_traits<__begin_node_pointer>::pointer_to(const_cast<__begin_node&>(__before_begin_)); + + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT { + return pointer_traits<__begin_node_pointer>::pointer_to( + *const_cast<__begin_node*>(std::addressof(__before_begin_))); } typedef __forward_list_iterator<__node_pointer> iterator; typedef __forward_list_const_iterator<__node_pointer> const_iterator; - _LIBCPP_HIDE_FROM_ABI __forward_list_base() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_base() + _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) : __before_begin_(__begin_node()) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a) : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a) : __before_begin_(__begin_node()), __alloc_(__a) {} public: # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_base(__forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value); - _LIBCPP_HIDE_FROM_ABI __forward_list_base(__forward_list_base&& __x, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + __forward_list_base(__forward_list_base&& __x, const allocator_type& __a); # endif // _LIBCPP_CXX03_LANG __forward_list_base(const __forward_list_base&) = delete; __forward_list_base& operator=(const __forward_list_base&) = delete; - _LIBCPP_HIDE_FROM_ABI 
~__forward_list_base(); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_base(); protected: - _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) { __copy_assign_alloc(__x, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x) _NOEXCEPT_(!__node_traits::propagate_on_container_move_assignment::value || is_nothrow_move_assignable<__node_allocator>::value) { __move_assign_alloc(__x, integral_constant()); } template - _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__node_pointer __next, _Args&&... __args) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer + __create_node(__node_pointer __next, _Args&&... __args) { __allocation_guard<__node_allocator> __guard(__alloc_, 1); // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value // held inside the node, since we need to use the allocator's construct() method for that. @@ -554,7 +569,7 @@ protected: return __guard.__release_ptr(); } - _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) { // For the same reason as above, we use the allocator's destroy() method for the value_type, // but not for the node itself. 
__node_traits::destroy(__alloc_, std::addressof(__node->__get_value())); @@ -563,7 +578,7 @@ protected: } public: - _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x) # if _LIBCPP_STD_VER >= 14 _NOEXCEPT; # else @@ -571,18 +586,21 @@ public: # endif protected: - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; private: - _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) {} - _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x, true_type) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) { + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + __copy_assign_alloc(const __forward_list_base& __x, true_type) { if (__alloc_ != __x.__alloc_) clear(); __alloc_ = __x.__alloc_; } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) { __alloc_ = std::move(__x.__alloc_); } @@ -591,14 +609,15 @@ private: # ifndef _LIBCPP_CXX03_LANG template -inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x) noexcept( - is_nothrow_move_constructible<__node_allocator>::value) +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base( + __forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value) : __before_begin_(std::move(__x.__before_begin_)), 
__alloc_(std::move(__x.__alloc_)) { __x.__before_begin()->__next_ = nullptr; } template -inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x, const allocator_type& __a) +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base( + __forward_list_base&& __x, const allocator_type& __a) : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) { if (__alloc_ == __x.__alloc_) { __before_begin()->__next_ = __x.__before_begin()->__next_; @@ -609,12 +628,12 @@ inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base # endif // _LIBCPP_CXX03_LANG template -__forward_list_base<_Tp, _Alloc>::~__forward_list_base() { +_LIBCPP_CONSTEXPR_SINCE_CXX26 __forward_list_base<_Tp, _Alloc>::~__forward_list_base() { clear(); } template -inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) # if _LIBCPP_STD_VER >= 14 _NOEXCEPT # else @@ -627,7 +646,7 @@ inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) } template -void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT { for (__node_pointer __p = __before_begin()->__next_; __p != nullptr;) { __node_pointer __next = __p->__next_; __delete_node(__p); @@ -672,105 +691,123 @@ public: typedef void __remove_return_type; # endif - _LIBCPP_HIDE_FROM_ABI forward_list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) { - } // = default; - _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list() + _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {} // = default; + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI 
explicit forward_list(const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n); # if _LIBCPP_STD_VER >= 14 - _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a); # endif - _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v); template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v, const allocator_type& __a) : __base(__a) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(size_type __n, const value_type& __v, const allocator_type& __a) + : __base(__a) { insert_after(cbefore_begin(), __n, __v); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l); template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) : __base(__a) { prepend_range(std::forward<_Range>(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x); - _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x, const __type_identity_t& __a); + 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(const forward_list& __x, const __type_identity_t& __a); - _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x); # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value) : __base(std::move(__x)) {} - _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x, const __type_identity_t& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(forward_list&& __x, const __type_identity_t& __a); - _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il); - _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(initializer_list __il, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept( + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept( (__node_traits::propagate_on_container_move_assignment::value && is_nothrow_move_assignable::value) || allocator_traits::is_always_equal::value); - _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list __il); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list __il); - _LIBCPP_HIDE_FROM_ABI void assign(initializer_list __il); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(initializer_list __il); # endif // _LIBCPP_CXX03_LANG // ~forward_list() = default; template 
::value, int> = 0> - void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) { __assign_with_sentinel(ranges::begin(__range), ranges::end(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc_); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { + return allocator_type(this->__alloc_); + } - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + return iterator(__base::__before_begin()->__next_); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { + return const_iterator(nullptr); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT 
{ return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + return const_iterator(nullptr); + } - _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { + return iterator(__base::__before_begin()); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::__before_begin()->__next_ == nullptr; } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return std::min(__node_traits::max_size(this->__alloc_), numeric_limits::max()); } - _LIBCPP_HIDE_FROM_ABI reference front() { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } - _LIBCPP_HIDE_FROM_ABI const_reference front() const { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } 
@@ -778,54 +815,59 @@ public: # ifndef _LIBCPP_CXX03_LANG # if _LIBCPP_STD_VER >= 17 template - _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args); # else template - _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args); # endif - _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) { insert_range_after(cbefore_begin(), std::forward<_Range>(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI void pop_front(); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void pop_front(); # ifndef _LIBCPP_CXX03_LANG template - _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... 
__args); - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v); - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, initializer_list __il) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_after(const_iterator __p, initializer_list __il) { return insert_after(__p, __il.begin(), __il.end()); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, size_type __n, const value_type& __v) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_after(const_iterator __p, size_type __n, const value_type& __v) { return __insert_after(__p, __n, __v); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI iterator insert_range_after(const_iterator __position, _Range&& __range) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_range_after(const_iterator __position, _Range&& __range) { return __insert_after_with_sentinel(__position, ranges::begin(__range), ranges::end(__range)); } # endif template - _LIBCPP_HIDE_FROM_ABI iterator __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l); - _LIBCPP_HIDE_FROM_ABI iterator 
erase_after(const_iterator __p); - _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x) # if _LIBCPP_STD_VER >= 14 _NOEXCEPT # else @@ -835,58 +877,63 @@ public: __base::swap(__x); } - _LIBCPP_HIDE_FROM_ABI void resize(size_type __n); - _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); } - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x); - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __i); - _LIBCPP_HIDE_FROM_ABI void + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + splice_after(const_iterator __p, forward_list&& __x, const_iterator __i); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x); - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x, const_iterator __i); - _LIBCPP_HIDE_FROM_ABI void + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x); + 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + splice_after(const_iterator __p, forward_list& __x, const_iterator __i); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x, const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v); template - _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred); - _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); } template - _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred); # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); } template - _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) { merge(__x, std::move(__comp)); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); } template - _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp); - _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp); + 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); } template - _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp); - _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT; + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT; private: # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable::value); - _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type); # endif // _LIBCPP_CXX03_LANG template - _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l); template - _LIBCPP_HIDE_FROM_ABI iterator __insert_after(const_iterator __p, size_type __n, _Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + __insert_after(const_iterator __p, size_type __n, _Args&&... 
__args); template - static _LIBCPP_HIDE_FROM_ABI __node_pointer __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp); + _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDE_FROM_ABI __node_pointer + __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp); // TODO: Make this _LIBCPP_HIDE_FROM_ABI template - static _LIBCPP_HIDDEN __node_pointer __sort(__node_pointer __f, difference_type __sz, _Compare& __comp); + _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDDEN __node_pointer + __sort(__node_pointer __f, difference_type __sz, _Compare& __comp); }; # if _LIBCPP_STD_VER >= 17 @@ -911,10 +958,10 @@ forward_list(from_range_t, _Range&&, _Alloc = _Alloc()) -> forward_list -inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {} +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {} template -forward_list<_Tp, _Alloc>::forward_list(size_type __n) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n) { if (__n > 0) { for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { __p->__next_ = this->__create_node(/* next = */ nullptr); @@ -924,7 +971,8 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n) { # if _LIBCPP_STD_VER >= 14 template -forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) : __base(__base_alloc) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) + : __base(__base_alloc) { if (__n > 0) { for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { __p->__next_ = this->__create_node(/* next = */ nullptr); @@ -934,37 +982,39 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __b # endif template -forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& 
__v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& __v) { insert_after(cbefore_begin(), __n, __v); } template template ::value, int> > -forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) { insert_after(cbefore_begin(), __f, __l); } template template ::value, int> > +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a) : __base(__a) { insert_after(cbefore_begin(), __f, __l); } template -forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x) +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x) : __base(__node_traits::select_on_container_copy_construction(__x.__alloc_)) { insert_after(cbefore_begin(), __x.begin(), __x.end()); } template +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x, const __type_identity_t& __a) : __base(__a) { insert_after(cbefore_begin(), __x.begin(), __x.end()); } template -forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) { if (this != std::addressof(__x)) { __base::__copy_assign_alloc(__x); assign(__x.begin(), __x.end()); @@ -974,6 +1024,7 @@ forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_li # ifndef _LIBCPP_CXX03_LANG template +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identity_t& __a) : __base(std::move(__x), __a) { if (this->__alloc_ != __x.__alloc_) { @@ -983,17 +1034,19 @@ forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identit } template -forward_list<_Tp, 
_Alloc>::forward_list(initializer_list __il) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(initializer_list __il) { insert_after(cbefore_begin(), __il.begin(), __il.end()); } template -forward_list<_Tp, _Alloc>::forward_list(initializer_list __il, const allocator_type& __a) : __base(__a) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 +forward_list<_Tp, _Alloc>::forward_list(initializer_list __il, const allocator_type& __a) + : __base(__a) { insert_after(cbefore_begin(), __il.begin(), __il.end()); } template -void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable::value) { clear(); __base::__move_assign_alloc(__x); @@ -1002,7 +1055,7 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) } template -void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { if (this->__alloc_ == __x.__alloc_) __move_assign(__x, true_type()); else { @@ -1012,7 +1065,8 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { } template -inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept( +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>& +forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept( (__node_traits::propagate_on_container_move_assignment::value && is_nothrow_move_assignable::value) || allocator_traits::is_always_equal::value) { @@ -1021,7 +1075,8 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_l } template -inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializer_list __il) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>& +forward_list<_Tp, 
_Alloc>::operator=(initializer_list __il) { assign(__il.begin(), __il.end()); return *this; } @@ -1030,13 +1085,14 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializ template template ::value, int> > -void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) { __assign_with_sentinel(__f, __l); } template template -_LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void +forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) { iterator __i = before_begin(); iterator __j = std::next(__i); iterator __e = end(); @@ -1049,7 +1105,7 @@ _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_It } template -void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) { iterator __i = before_begin(); iterator __j = std::next(__i); iterator __e = end(); @@ -1064,18 +1120,19 @@ void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) { # ifndef _LIBCPP_CXX03_LANG template -inline void forward_list<_Tp, _Alloc>::assign(initializer_list __il) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::assign(initializer_list __il) { assign(__il.begin(), __il.end()); } template template +_LIBCPP_CONSTEXPR_SINCE_CXX26 # if _LIBCPP_STD_VER >= 17 -typename forward_list<_Tp, _Alloc>::reference + typename forward_list<_Tp, _Alloc>::reference # else -void + void # endif -forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) { + forward_list<_Tp, _Alloc>::emplace_front(_Args&&... 
__args) { __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, std::forward<_Args>(__args)...); # if _LIBCPP_STD_VER >= 17 @@ -1084,7 +1141,7 @@ forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) { } template -void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, std::move(__v)); } @@ -1092,12 +1149,12 @@ void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { # endif // _LIBCPP_CXX03_LANG template -void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) { __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, __v); } template -void forward_list<_Tp, _Alloc>::pop_front() { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::pop_front() { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::pop_front called on an empty list"); __node_pointer __p = __base::__before_begin()->__next_; __base::__before_begin()->__next_ = __p->__next_; @@ -1108,7 +1165,7 @@ void forward_list<_Tp, _Alloc>::pop_front() { template template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args) { __begin_node_pointer const __r = __p.__get_begin(); __r->__next_ = this->__create_node(/* next = */ __r->__next_, std::forward<_Args>(__args)...); @@ -1116,7 +1173,7 @@ forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... 
__args) } template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) { __begin_node_pointer const __r = __p.__get_begin(); __r->__next_ = this->__create_node(/* next = */ __r->__next_, std::move(__v)); @@ -1126,7 +1183,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) { # endif // _LIBCPP_CXX03_LANG template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __v) { __begin_node_pointer const __r = __p.__get_begin(); __r->__next_ = this->__create_node(/* next = */ __r->__next_, __v); @@ -1135,7 +1192,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __ template template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Args&&... 
__args) { __begin_node_pointer __r = __p.__get_begin(); if (__n > 0) { @@ -1159,21 +1216,21 @@ forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Ar # endif // _LIBCPP_HAS_EXCEPTIONS __last->__next_ = __r->__next_; __r->__next_ = __first; - __r = static_cast<__begin_node_pointer>(__last); + __r = __forward_node_traits<__node_pointer>::__as_iter_node(__last); } return iterator(__r); } template template ::value, int> > -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l) { return __insert_after_with_sentinel(__p, std::move(__f), std::move(__l)); } template template -_LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l) { __begin_node_pointer __r = __p.__get_begin(); @@ -1200,14 +1257,15 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp __last->__next_ = __r->__next_; __r->__next_ = __first; - __r = static_cast<__begin_node_pointer>(__last); + __r = __forward_node_traits<__node_pointer>::__as_iter_node(__last); } return iterator(__r); } template -typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator +forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) { __begin_node_pointer __p = __f.__get_begin(); __node_pointer __n = __p->__next_; __p->__next_ = __n->__next_; @@ -1216,7 +1274,7 @@ typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_af } template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator 
forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) { __node_pointer __e = __l.__get_unsafe_node_pointer(); if (__f != __l) { @@ -1236,7 +1294,7 @@ forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) { } template -void forward_list<_Tp, _Alloc>::resize(size_type __n) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n) { size_type __sz = 0; iterator __p = before_begin(); iterator __i = begin(); @@ -1250,7 +1308,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n) { } template -void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) { size_type __sz = 0; iterator __p = before_begin(); iterator __i = begin(); @@ -1264,7 +1322,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) { } template -void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) { if (!__x.empty()) { if (__p.__get_begin()->__next_ != nullptr) { const_iterator __lm1 = __x.before_begin(); @@ -1278,7 +1336,8 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& _ } template -void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void +forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) { const_iterator __lm1 = std::next(__i); if (__p != __i && __p != __lm1) { __i.__get_begin()->__next_ = __lm1.__get_begin()->__next_; @@ -1288,7 +1347,7 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& / } template -void forward_list<_Tp, _Alloc>::splice_after( +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, 
_Alloc>::splice_after( const_iterator __p, forward_list& /*__other*/, const_iterator __f, const_iterator __l) { if (__f != __l && __p != __f) { const_iterator __lm1 = __f; @@ -1303,24 +1362,26 @@ void forward_list<_Tp, _Alloc>::splice_after( } template -inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void +forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) { splice_after(__p, __x); } template -inline _LIBCPP_HIDE_FROM_ABI void +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x, const_iterator __i) { splice_after(__p, __x, __i); } template -inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after( +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after( const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l) { splice_after(__p, __x, __f, __l); } template -typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove(const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type +forward_list<_Tp, _Alloc>::remove(const value_type& __v) { forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0; const iterator __e = end(); @@ -1343,7 +1404,8 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo template template -typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type +forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) { forward_list<_Tp, _Alloc> 
__deleted_nodes(get_allocator()); // collect the nodes we're removing typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0; const iterator __e = end(); @@ -1366,7 +1428,7 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo template template -typename forward_list<_Tp, _Alloc>::__remove_return_type +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) { forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0; @@ -1384,7 +1446,7 @@ forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) { template template -void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { if (this != std::addressof(__x)) { __base::__before_begin()->__next_ = __merge(__base::__before_begin()->__next_, __x.__before_begin()->__next_, __comp); @@ -1394,7 +1456,7 @@ void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { template template -typename forward_list<_Tp, _Alloc>::__node_pointer +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__node_pointer forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp) { if (__f1 == nullptr) return __f2; @@ -1431,13 +1493,13 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Co template template -inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) { __base::__before_begin()->__next_ = __sort(__base::__before_begin()->__next_, std::distance(begin(), end()), __comp); } template template -typename forward_list<_Tp, _Alloc>::__node_pointer +_LIBCPP_CONSTEXPR_SINCE_CXX26 
typename forward_list<_Tp, _Alloc>::__node_pointer forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Compare& __comp) { switch (__sz) { case 0: @@ -1461,7 +1523,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Co } template -void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { __node_pointer __p = __base::__before_begin()->__next_; if (__p != nullptr) { __node_pointer __f = __p->__next_; @@ -1477,7 +1539,8 @@ void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { } template -_LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool +operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { typedef forward_list<_Tp, _Alloc> _Cp; typedef typename _Cp::const_iterator _Ip; _Ip __ix = __x.begin(); @@ -1493,31 +1556,31 @@ _LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, cons # if _LIBCPP_STD_VER <= 17 template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator<(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return std::lexicographical_compare(__x.begin(), __x.end(), __y.begin(), __y.end()); } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator>(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return __y < __x; } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const forward_list<_Tp, 
_Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return !(__x < __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return !(__y < __x); } @@ -1525,7 +1588,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc> # else // #if _LIBCPP_STD_VER <= 17 template -_LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _Allocator>& __y) { return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } @@ -1533,20 +1596,20 @@ operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _A # endif // #if _LIBCPP_STD_VER <= 17 template -inline _LIBCPP_HIDE_FROM_ABI void swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void +swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { __x.swap(__y); } # if _LIBCPP_STD_VER >= 20 template -inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type erase_if(forward_list<_Tp, _Allocator>& __c, _Predicate __pred) { return __c.remove_if(__pred); } template -inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type erase(forward_list<_Tp, _Allocator>& __c, const _Up& __v) { return std::erase_if(__c, [&](const auto& __elem) -> bool { return __elem == __v; }); } diff --git a/libcxx/include/version 
b/libcxx/include/version index 65fae111dc8e..87c4ede9a7e5 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -68,6 +68,7 @@ __cpp_lib_constexpr_charconv 202207L __cpp_lib_constexpr_cmath 202202L __cpp_lib_constexpr_complex 201711L __cpp_lib_constexpr_dynamic_alloc 201907L +__cpp_lib_constexpr_forward_list 202502L __cpp_lib_constexpr_functional 201907L __cpp_lib_constexpr_iterator 201811L __cpp_lib_constexpr_memory 202202L @@ -543,6 +544,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_bitset 202306L # undef __cpp_lib_constexpr_algorithms # define __cpp_lib_constexpr_algorithms 202306L +# define __cpp_lib_constexpr_forward_list 202502L # if !defined(_LIBCPP_ABI_VCRUNTIME) # define __cpp_lib_constexpr_new 202406L # endif diff --git a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp index 52adfc4d8598..a9ef855e9a73 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp @@ -11,7 +11,7 @@ // template // synth-three-way-result operator<=>(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 #include #include @@ -20,6 +20,9 @@ int main(int, char**) { assert(test_sequence_container_spaceship()); - // `std::forward_list` is not constexpr, so no `static_assert` test here. 
+#if TEST_STD_VER >= 26 + static_assert(test_sequence_container_spaceship()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp index dbc0631d1193..4482d26f308a 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp @@ -10,7 +10,7 @@ // class forward_list -// bool empty() const noexcept; +// bool empty() const noexcept; // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef std::forward_list C; C c; @@ -42,5 +42,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp index 757db7d957f5..50b549f17d56 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp @@ -8,17 +8,18 @@ // -// reference front(); -// const_reference front() const; +// reference front(); // constexpr since C++26 +// const_reference front() const; // constexpr since C++26 #include #include #include +#include "test_allocator.h" #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -58,5 +59,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp index 31893a1b9599..4645560048cf 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp @@ -8,7 +8,7 @@ // -// explicit forward_list(const allocator_type& a); +// explicit forward_list(const allocator_type& a); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_allocator.h" #include "../../../NotConstructible.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef test_allocator A; typedef A::value_type T; @@ -26,5 +26,14 @@ int main(int, char**) { assert(c.empty()); } + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp index bfb330fdaf9f..ffc6d37f2816 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp @@ -8,7 +8,7 @@ // -// explicit forward_list(const allocator_type& a); +// explicit forward_list(const allocator_type& a); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "../../../NotConstructible.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef test_allocator A; typedef A::value_type T; @@ -46,5 +46,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp index 27d450c63dca..b99af4ccb79e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list& operator=(const forward_list& x); +// forward_list& operator=(const forward_list& x); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -143,5 +143,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp index 1cdcca82d335..ea2802b323a9 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp @@ -10,7 +10,7 @@ // -// void assign(initializer_list il); +// void assign(initializer_list il); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -65,5 +65,14 @@ int main(int, char**) { assert(n == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp index 998a7e11ef34..9c88db6166ba 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list& operator=(forward_list&& x); +// forward_list& operator=(forward_list&& x); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef test_allocator A; @@ -194,5 +194,14 @@ int main(int, char**) { assert(c0.empty()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp index a22d6c4985bc..d21898dc4663 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list& operator=(initializer_list il); +// forward_list& operator=(initializer_list il); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -65,5 +65,14 @@ int main(int, char**) { assert(n == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp index 9a3532874079..1601b4b47acd 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp @@ -9,7 +9,7 @@ // // template -// void assign(InputIterator first, InputIterator last); +// void assign(InputIterator first, InputIterator last); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -75,5 +75,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp index b0fbfa3249e5..75626b47c527 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp @@ -8,7 +8,7 @@ // -// void assign(size_type n, const value_type& v); +// void assign(size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -65,5 +65,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp index 22d5054b9ae1..12d701bff4b6 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(const forward_list& x); +// forward_list(const forward_list& x); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -64,5 +64,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp index a61233e4b5d2..fc3ff485b066 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(const forward_list& x, const allocator_type& a); +// forward_list(const forward_list& x, const allocator_type& a); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -64,5 +64,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp index b493a89b7800..e0ea8bf66cb3 100644 --- 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(); +// forward_list(); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -38,5 +38,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp index 312f6dbad355..d1e1734e86f9 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp @@ -9,14 +9,14 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // template R> -// forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23 +// forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23; constexpr since C++26 #include #include "../../from_range_sequence_containers.h" #include "test_macros.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for_all_iterators_and_allocators([]() { test_sequence_container([](const auto&) { // No additional validation to do. 
@@ -26,8 +26,19 @@ int main(int, char**) { static_assert(test_constraints()); - test_exception_safety_throwing_copy(); - test_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_exception_safety_throwing_copy(); + test_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp index b42242b0a83d..b7acf60aa70c 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(initializer_list il); +// forward_list(initializer_list il); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -38,5 +38,14 @@ int main(int, char**) { assert(n == 10); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp index 0b29cbfa9254..33d569c921a9 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(initializer_list il, const allocator_type& a); +// forward_list(initializer_list il, const allocator_type& a); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_allocator.h" #include 
"min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -43,5 +43,14 @@ int main(int, char**) { assert(c.get_allocator() == A()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp index 762e252ca76f..20575479f735 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(forward_list&& x); +// forward_list(forward_list&& x); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef test_allocator A; @@ -68,5 +68,14 @@ int main(int, char**) { assert(c.get_allocator() == A()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp index a9bc2cb12f28..219505bf4fd1 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(forward_list&& x, const allocator_type& a); +// forward_list(forward_list&& x, const allocator_type& a); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; 
typedef test_allocator A; @@ -68,5 +68,14 @@ int main(int, char**) { assert(c.get_allocator() == A()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp index ebd0e6a5bd1e..61393eb28938 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp @@ -9,7 +9,7 @@ // // template -// forward_list(InputIterator first, InputIterator last); +// forward_list(InputIterator first, InputIterator last); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -45,5 +45,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp index 4a28041ad2cb..c0637420e328 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp @@ -10,7 +10,7 @@ // template // forward_list(InputIterator first, InputIterator last, -// const allocator_type& a); +// const allocator_type& a); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -51,5 +51,14 @@ int 
main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp index 81b128d2149e..206854560c19 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp @@ -8,8 +8,8 @@ // -// explicit forward_list(size_type n); -// explicit forward_list(size_type n, const Alloc& a); +// explicit forward_list(size_type n); // constexpr since C++26 +// explicit forward_list(size_type n, const Alloc& a); // constexpr since C++26 #include #include diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp index 663422d1c3c3..85d11e3f40a2 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(size_type n, const value_type& v); +// forward_list(size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -42,5 +42,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp index 
af7f7471d4c9..abcdf62452b8 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(size_type n, const value_type& v, const allocator_type& a); +// forward_list(size_type n, const value_type& v, const allocator_type& a); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef test_allocator A; typedef A::value_type T; @@ -47,5 +47,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp index 1044d779220e..86d7769fe16e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp @@ -11,7 +11,7 @@ // template // typename forward_list::size_type -// erase(forward_list& c, const U& value); +// erase(forward_list& c, const U& value); // constexpr since C++26 #include #include @@ -21,14 +21,14 @@ #include "min_allocator.h" template -void test0(S s, U val, S expected, std::size_t expected_erased_count) { +TEST_CONSTEXPR_CXX26 void test0(S s, U val, S expected, std::size_t expected_erased_count) { ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase(s, val))); assert(expected_erased_count == std::erase(s, val)); assert(s == expected); } template -void test() { +TEST_CONSTEXPR_CXX26 void test() { test0(S(), 1, S(), 0); test0(S({1}), 1, S(), 1); @@ -62,13 +62,21 @@ void test() { test0(S({1, 2, 1}), opt(3), S({1, 2, 1}), 0); } -int main(int, 
char**) { +TEST_CONSTEXPR_CXX26 bool test() { test>(); test>>(); test>>(); - test>(); test>(); + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp index c4f45a1069a2..c665f9cccbf0 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp @@ -11,7 +11,7 @@ // template // typename forward_list::size_type -// erase_if(forward_list& c, Predicate pred); +// erase_if(forward_list& c, Predicate pred); // constexpr since C++26 #include @@ -20,14 +20,14 @@ #include "min_allocator.h" template -void test0(S s, Pred p, S expected, std::size_t expected_erased_count) { +TEST_CONSTEXPR_CXX26 void test0(S s, Pred p, S expected, std::size_t expected_erased_count) { ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase_if(s, p))); assert(expected_erased_count == std::erase_if(s, p)); assert(s == expected); } template -void test() { +TEST_CONSTEXPR_CXX26 void test() { auto is1 = [](auto v) { return v == 1; }; auto is2 = [](auto v) { return v == 2; }; auto is3 = [](auto v) { return v == 3; }; @@ -64,13 +64,21 @@ void test() { test0(S({1, 2, 3}), False, S({1, 2, 3}), 0); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { test>(); test>>(); test>>(); - test>(); test>(); + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp index d66d2cd87951..52b5d87860aa 100644 --- 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp @@ -8,9 +8,9 @@ // -// iterator before_begin(); -// const_iterator before_begin() const; -// const_iterator cbefore_begin() const; +// iterator before_begin(); // constexpr since C++26 +// const_iterator before_begin() const; // constexpr since C++26 +// const_iterator cbefore_begin() const; // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -101,5 +101,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp index 135689b2321c..560c47b17958 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp @@ -8,12 +8,12 @@ // -// iterator begin(); -// iterator end(); -// const_iterator begin() const; -// const_iterator end() const; -// const_iterator cbegin() const; -// const_iterator cend() const; +// iterator begin(); // constexpr since C++26 +// iterator end(); // constexpr since C++26 +// const_iterator begin() const; // constexpr since C++26 +// const_iterator end() const; // constexpr since C++26 +// const_iterator cbegin() const; // constexpr since C++26 +// const_iterator cend() const; // constexpr since C++26 #include #include @@ -22,7 +22,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef 
std::forward_list C; @@ -69,6 +69,8 @@ int main(int, char**) { typedef std::forward_list C; C::iterator i; C::const_iterator j; + (void)i; + (void)j; } #if TEST_STD_VER >= 11 { @@ -117,6 +119,8 @@ int main(int, char**) { typedef std::forward_list> C; C::iterator i; C::const_iterator j; + (void)i; + (void)j; } #endif #if TEST_STD_VER > 11 @@ -142,5 +146,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp index a27cc757025b..9a3adec1d975 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // template R> -// constexpr void prepend_range(R&& rg); // C++23 +// constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26 #include @@ -21,7 +21,7 @@ // {empty/one-element/full} container); // - prepending move-only elements; // - an exception is thrown when copying the elements or when allocating new elements. 
-int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { static_assert(test_constraints_assign_range()); for_all_iterators_and_allocators([]() { @@ -31,8 +31,19 @@ int main(int, char**) { }); test_sequence_prepend_range_move_only(); - test_prepend_range_exception_safety_throwing_copy(); - test_prepend_range_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_prepend_range_exception_safety_throwing_copy(); + test_prepend_range_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp index 9f6d34b701df..2e1768cf8bad 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp @@ -8,7 +8,7 @@ // -// void clear() noexcept; +// void clear() noexcept; // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "../../../NotConstructible.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef NotConstructible T; typedef std::forward_list C; @@ -64,5 +64,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp index f77d47ee7c74..6433607af9b3 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp @@ -11,7 +11,7 @@ // // template -// iterator emplace_after(const_iterator p, Args&&... args); +// iterator emplace_after(const_iterator p, Args&&... args); // constexpr since C++26 #include #include @@ -20,7 +20,7 @@ #include "../../../Emplaceable.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef Emplaceable T; typedef std::forward_list C; @@ -84,5 +84,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp index cd3bb20c52ae..46ae27b43622 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp @@ -10,7 +10,7 @@ // -// template reference emplace_front(Args&&... args); +// template reference emplace_front(Args&&... 
args); // constexpr since C++26 // return type is 'reference' in C++17; 'void' before #include @@ -21,7 +21,7 @@ #include "../../../Emplaceable.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef Emplaceable T; typedef std::forward_list C; @@ -67,5 +67,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 2); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp index e85951798526..73cb03c2cb7d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp @@ -8,7 +8,7 @@ // -// iterator erase_after(const_iterator first, const_iterator last); +// iterator erase_after(const_iterator first, const_iterator last); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -153,5 +153,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp index 892228e76def..12997f1dad3b 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp @@ -8,7 +8,7 
@@ // -// iterator erase_after(const_iterator p); +// iterator erase_after(const_iterator p); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -95,5 +95,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp index 8443158413e7..d93789dd6bb5 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp @@ -8,7 +8,7 @@ // -// iterator insert_after(const_iterator p, const value_type& v); +// iterator insert_after(const_iterator p, const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -84,5 +84,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp index de924a10c18f..54be47f4264f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp @@ -10,7 +10,7 @@ // 
-// iterator insert_after(const_iterator p, initializer_list il); +// iterator insert_after(const_iterator p, initializer_list il); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -70,5 +70,14 @@ int main(int, char**) { assert(*std::next(c.begin(), 4) == 2); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp index af810d0f6961..f89fbd7619da 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp @@ -10,7 +10,7 @@ // template // iterator insert_after(const_iterator p, -// InputIterator first, InputIterator last); +// InputIterator first, InputIterator last); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -77,5 +77,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp index acd4bc73f724..01b76f5cd64f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp @@ -10,7 +10,7 @@ // -// iterator insert_after(const_iterator p, value_type&& v); +// iterator insert_after(const_iterator p, value_type&& v); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef std::forward_list C; @@ -85,5 +85,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp index 2506f04311e0..f4f0521ad237 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp @@ -8,7 +8,7 @@ // -// iterator insert_after(const_iterator p, size_type n, const value_type& v); +// iterator insert_after(const_iterator p, size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -70,5 +70,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp index 
25f4c43f3848..71a291430b43 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp @@ -8,8 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000 + // template R> -// constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23 +// constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23; constexpr since C++26 #include @@ -321,7 +323,7 @@ constexpr void test_sequence_insert_range_after() { } } -void test_sequence_insert_range_after_move_only() { +TEST_CONSTEXPR_CXX26 void test_sequence_insert_range_after_move_only() { MoveOnly input[5]; std::ranges::subrange in(std::move_iterator{input}, std::move_iterator{input + 5}); @@ -366,7 +368,7 @@ void test_insert_range_after_exception_safety_throwing_allocator() { #endif } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { static_assert(test_constraints_insert_range_after()); for_all_iterators_and_allocators([]() { @@ -374,8 +376,19 @@ int main(int, char**) { }); test_sequence_insert_range_after_move_only(); - test_insert_range_after_exception_safety_throwing_copy(); - test_insert_range_after_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_insert_range_after_exception_safety_throwing_copy(); + test_insert_range_after_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp index 98c7a2634117..9fcade7ff6bb 100644 --- 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp @@ -8,7 +8,7 @@ // -// void pop_front(); +// void pop_front(); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -71,5 +71,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp index 418aa72052ba..c4b9cd9bdfc4 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // template R> -// constexpr void prepend_range(R&& rg); // C++23 +// constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26 #include @@ -21,7 +21,7 @@ // {empty/one-element/full} container); // - prepending move-only elements; // - an exception is thrown when copying the elements or when allocating new elements. 
-int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { static_assert(test_constraints_prepend_range()); for_all_iterators_and_allocators([]() { @@ -31,8 +31,19 @@ int main(int, char**) { }); test_sequence_prepend_range_move_only(); - test_prepend_range_exception_safety_throwing_copy(); - test_prepend_range_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_prepend_range_exception_safety_throwing_copy(); + test_prepend_range_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp index f99c40fa0c1a..61c5dcac0545 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp @@ -8,7 +8,7 @@ // -// void push_front(const value_type& v); +// void push_front(const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -44,5 +44,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp index 467037465eed..cd24d6ff6af0 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: no-exceptions // -// void push_front(const value_type& x); +// void push_front(const value_type& x); // constexpr since C++26 #include #include diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp index d3156c5fdd38..b30ff7a0189e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp @@ -10,7 +10,7 @@ // -// void push_front(value_type&& v); +// void push_front(value_type&& v); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef std::forward_list C; @@ -45,5 +45,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 2); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp index 2dacf458d7d9..f80886113bf2 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp @@ -8,7 +8,7 @@ // -// void resize(size_type n); +// void resize(size_type n); // constexpr since C++26 #include #include @@ -18,8 +18,8 @@ #include "DefaultOnly.h" #include "min_allocator.h" -int main(int, char**) { - { +TEST_CONSTEXPR_CXX26 bool test() { + if (!TEST_IS_CONSTANT_EVALUATED) 
{ typedef DefaultOnly T; typedef std::forward_list C; C c; @@ -65,7 +65,7 @@ int main(int, char**) { assert(*std::next(c.begin(), 5) == 0); } #if TEST_STD_VER >= 11 - { + if (!TEST_IS_CONSTANT_EVALUATED) { typedef DefaultOnly T; typedef std::forward_list> C; C c; @@ -112,5 +112,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp index a6af763e6937..4ec859b36336 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp @@ -8,7 +8,7 @@ // -// void resize(size_type n, const value_type& v); +// void resize(size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -22,7 +22,7 @@ # include "container_test_types.h" #endif -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -84,7 +84,7 @@ int main(int, char**) { assert(*std::next(c.begin(), 4) == 10); assert(*std::next(c.begin(), 5) == 10); } - { + if (!TEST_IS_CONSTANT_EVALUATED) { // Test that the allocator's construct method is being used to // construct the new elements and that it's called exactly N times. 
typedef std::forward_list> Container; @@ -99,5 +99,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp index 9a162789569d..d8e80c56bf39 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp @@ -8,7 +8,7 @@ // -// void merge(forward_list& x); +// void merge(forward_list& x); // constexpr since C++26 #include #include @@ -30,11 +30,11 @@ struct value { int a; int b; - friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -116,5 +116,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp index 4e1814044808..0adadb2dd092 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp @@ -8,7 +8,7 @@ // -// template void merge(forward_list& x, Compare comp); +// template void merge(forward_list& x, Compare comp); // constexpr since C++26 #include #include @@ -30,11 +30,11 @@ struct value { int a; int b; - friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -117,5 +117,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp index acfa014fe254..906748ec2702 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp @@ -10,7 +10,7 @@ // -// void merge(forward_list&& x); +// void merge(forward_list&& x); // constexpr since C++26 #include #include @@ -29,11 +29,11 @@ struct value { int a; int b; - friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -109,5 +109,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp index 41b56ce7a288..2ced0b1596e4 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp @@ -10,7 +10,7 @@ // -// template void merge(forward_list&& x, Compare comp); +// template void merge(forward_list&& x, Compare comp); // constexpr since C++26 #include #include @@ -29,11 +29,11 @@ struct value { int a; int b; - friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -110,5 +110,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp index ec3bf845dcc5..b17708ba60ee 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp @@ -9,7 +9,7 @@ // // void remove(const value_type& v); // C++17 and before -// size_type remove(const value_type& v); // C++20 and after +// size_type remove(const value_type& v); // C++20 and after; // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "min_allocator.h" template -void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.remove(value)), typename L::size_type); @@ -32,22 +32,22 @@ void do_remove(L& l, const typename L::value_type& value, typename L::size_type } struct S { - S(int i) : i_(new int(i)) {} - S(const S& rhs) : i_(new int(*rhs.i_)) {} - S& operator=(const S& rhs) { + TEST_CONSTEXPR_CXX20 S(int i) : i_(new int(i)) {} + TEST_CONSTEXPR_CXX20 S(const S& rhs) : i_(new int(*rhs.i_)) {} + TEST_CONSTEXPR_CXX20 S& operator=(const S& rhs) { *i_ = *rhs.i_; return *this; } - ~S() { + TEST_CONSTEXPR_CXX20 ~S() { delete i_; i_ = NULL; } - bool operator==(const S& rhs) const { return *i_ == *rhs.i_; } - int get() const { return *i_; } + TEST_CONSTEXPR bool operator==(const S& rhs) const { return *i_ == *rhs.i_; } 
+ TEST_CONSTEXPR int get() const { return *i_; } int* i_; }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -171,5 +171,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp index c6325baea259..f26205d03f64 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp @@ -9,7 +9,7 @@ // // template void remove_if(Predicate pred); // C++17 and before -// template size_type remove_if(Predicate pred); // C++20 and after +// template size_type remove_if(Predicate pred); // C++20 and after; constexpr since C++26 #include #include @@ -22,7 +22,7 @@ #include "counting_predicates.h" template -void do_remove_if(L& l, Predicate pred, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_remove_if(L& l, Predicate pred, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.remove_if(pred)), typename L::size_type); @@ -34,18 +34,18 @@ void do_remove_if(L& l, Predicate pred, typename L::size_type expected) { assert(old_size - std::distance(l.begin(), l.end()) == expected); } -bool g(int i) { return i < 3; } +TEST_CONSTEXPR bool g(int i) { return i < 3; } struct PredLWG526 { - PredLWG526(int i) : i_(i) {} - ~PredLWG526() { i_ = -32767; } - bool operator()(const PredLWG526& p) const { return p.i_ == i_; } + TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {} + TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; } + TEST_CONSTEXPR bool operator()(const PredLWG526& p) const { return p.i_ == i_; } - 
bool operator==(int i) const { return i == i_; } + TEST_CONSTEXPR bool operator==(int i) const { return i == i_; } int i_; }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef unary_counting_predicate Predicate; @@ -187,5 +187,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp index 0d0656897f34..38f0e74f6632 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp @@ -8,7 +8,7 @@ // -// void reverse(); +// void reverse(); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "min_allocator.h" template -void test(int N) { +TEST_CONSTEXPR_CXX26 void test1(int N) { C c; for (int i = 0; i < N; ++i) c.push_front(i); @@ -30,12 +30,21 @@ void test(int N) { assert(*j == i); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for (int i = 0; i < 10; ++i) - test >(i); + test1 >(i); #if TEST_STD_VER >= 11 for (int i = 0; i < 10; ++i) - test> >(i); + test1> >(i); +#endif + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp index 4c91d7397adf..f8787d70784d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp @@ -8,7 +8,7 @@ // -// void splice_after(const_iterator p, 
forward_list&& x); +// void splice_after(const_iterator p, forward_list&& x); // constexpr since C++26 #include #include @@ -19,13 +19,13 @@ #include "min_allocator.h" typedef int T; -const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; -const T t2[] = {10, 11, 12, 13, 14, 15}; -const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +TEST_CONSTEXPR const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; +TEST_CONSTEXPR const T t2[] = {10, 11, 12, 13, 14, 15}; +TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template -void testd(const C& c, int p, int l) { +TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int l) { typename C::const_iterator i = c.begin(); int n1 = 0; for (; n1 < p; ++n1, ++i) @@ -37,7 +37,7 @@ void testd(const C& c, int p, int l) { assert(std::distance(c.begin(), c.end()) == size_t1 + l); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // splicing different containers typedef std::forward_list C; @@ -67,5 +67,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp index bb8bdea63254..7202b0e15362 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp @@ -8,7 +8,7 @@ // -// void splice_after(const_iterator p, forward_list&& x, const_iterator i); +// void splice_after(const_iterator p, forward_list&& x, const_iterator i); // constexpr since C++26 #include #include @@ -19,13 +19,13 @@ #include "min_allocator.h" typedef int T; -const T 
t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; -const T t2[] = {10, 11, 12}; -const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +TEST_CONSTEXPR const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; +TEST_CONSTEXPR const T t2[] = {10, 11, 12}; +TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template -void testd(const C& c, int p, int f) { +TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int f) { typename C::const_iterator i = c.begin(); int n1 = 0; for (; n1 < p; ++n1, ++i) @@ -38,7 +38,7 @@ void testd(const C& c, int p, int f) { } template -void tests(const C& c, int p, int f) { +TEST_CONSTEXPR_CXX26 void tests(const C& c, int p, int f) { typename C::const_iterator i = c.begin(); int n = 0; if (p == f || p == f + 1) { @@ -67,7 +67,7 @@ void tests(const C& c, int p, int f) { assert(std::distance(c.begin(), c.end()) == size_t1); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // splicing different containers typedef std::forward_list C; @@ -117,5 +117,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp index 99b3ed1c7836..18da6f12b28d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp @@ -8,8 +8,10 @@ // +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=3000000 + // void splice_after(const_iterator p, forward_list&& x, -// const_iterator first, const_iterator last); +// const_iterator first, const_iterator 
last); // constexpr since C++26 #include #include @@ -20,13 +22,13 @@ #include "min_allocator.h" typedef std::ptrdiff_t T; -const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; -const T t2[] = {10, 11, 12, 13, 14, 15}; -const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +TEST_CONSTEXPR const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; +TEST_CONSTEXPR const T t2[] = {10, 11, 12, 13, 14, 15}; +TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template -void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { +TEST_CONSTEXPR_CXX26 void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { typename C::const_iterator i = c.begin(); std::ptrdiff_t n1 = 0; for (; n1 < p; ++n1, ++i) @@ -39,7 +41,7 @@ void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { } template -void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { +TEST_CONSTEXPR_CXX26 void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { typename C::const_iterator i = c.begin(); std::ptrdiff_t n = 0; std::ptrdiff_t d = l > f + 1 ? 
l - 1 - f : 0; @@ -69,7 +71,7 @@ void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { assert(std::distance(c.begin(), c.end()) == size_t1); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // splicing different containers typedef std::forward_list C; @@ -157,5 +159,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp index ebd1a79cdb4b..28efff3849e6 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp @@ -9,7 +9,7 @@ // // void unique(); // C++17 and before -// size_type unique(); // C++20 and after +// size_type unique(); // C++20 and after; constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "min_allocator.h" template -void do_unique(L& l, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_unique(L& l, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.unique()), typename L::size_type); @@ -31,7 +31,7 @@ void do_unique(L& l, typename L::size_type expected) { assert(old_size - std::distance(l.begin(), l.end()) == expected); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -131,5 +131,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp index 408cbf6ae9c2..f07142dffe9d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp @@ -9,7 +9,7 @@ // // template void unique(BinaryPredicate binary_pred); // C++17 and before -// template size_type unique(BinaryPredicate binary_pred); // C++20 and after +// template size_type unique(BinaryPredicate binary_pred); // C++20 and after; constexpr since C++26 #include #include @@ -20,7 +20,7 @@ #include "min_allocator.h" template -void do_unique(L& l, Predicate pred, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_unique(L& l, Predicate pred, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.unique(pred)), typename L::size_type); @@ -33,17 +33,17 @@ void do_unique(L& l, Predicate pred, typename L::size_type expected) { } struct PredLWG526 { - PredLWG526(int i) : i_(i) {} - ~PredLWG526() { i_ = -32767; } - bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; } + TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {} + TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; } + TEST_CONSTEXPR bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; } - bool operator==(int i) const { return i == i_; } + TEST_CONSTEXPR bool operator==(int i) const { return i == i_; } int i_; }; -bool g(int x, int y) { return x == y; } +TEST_CONSTEXPR bool g(int x, int y) { return x == y; } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -157,5 +157,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp index ef6b72ee360a..cb57b094a077 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp @@ -10,11 +10,11 @@ // template // bool operator==(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // bool operator!=(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 #include #include @@ -25,7 +25,7 @@ #include "min_allocator.h" template -void test(int N, int M) { +TEST_CONSTEXPR_CXX26 void test(int N, int M) { C c1; for (int i = 0; i < N; ++i) c1.push_front(i); @@ -44,7 +44,7 @@ void test(int N, int M) { } } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for (int i = 0; i < 10; ++i) for (int j = 0; j < 10; ++j) test >(i, j); @@ -54,5 +54,14 @@ int main(int, char**) { test> >(i, j); #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp index e50f9e6e9e47..f4f7c6d1f7e5 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp @@ -8,7 +8,7 @@ // -// void swap(forward_list& x); +// void swap(forward_list& x); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -257,5 +257,14 
@@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp index cae6950436de..ce2547978154 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp @@ -9,7 +9,7 @@ // // template -// void swap(forward_list& x, forward_list& y); +// void swap(forward_list& x, forward_list& y); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -258,5 +258,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp index d16acadaeb89..7bf80ca026e8 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp @@ -10,19 +10,19 @@ // template // bool operator< (const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // bool operator> (const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // bool operator>=(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // 
bool operator<=(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 #include #include @@ -33,7 +33,7 @@ #include "min_allocator.h" template -void test(int N, int M) { +TEST_CONSTEXPR_CXX26 void test(int N, int M) { C c1; for (int i = 0; i < N; ++i) c1.push_front(i); @@ -50,7 +50,7 @@ void test(int N, int M) { assert(c1 > c2); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for (int i = 0; i < 10; ++i) for (int j = 0; j < 10; ++j) test >(i, j); @@ -60,5 +60,14 @@ int main(int, char**) { test> >(i, j); #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp index b50e67589471..02b7b471a1ae 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp @@ -12,10 +12,10 @@ // void swap(forward_list& c) // noexcept(!allocator_type::propagate_on_container_swap::value || -// __is_nothrow_swappable::value); +// __is_nothrow_swappable::value); // constexpr since C++26 // // In C++17, the standard says that swap shall have: -// noexcept(is_always_equal::value); +// noexcept(is_always_equal::value); // constexpr since C++26 // This tests a conforming extension diff --git a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp index f37f5c2f513b..624eeb17799c 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp @@ -10,7 +10,7 @@ // class forward_list -// allocator_type 
get_allocator() const +// allocator_type get_allocator() const // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "test_macros.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { std::allocator alloc; const std::forward_list fl(alloc); @@ -30,5 +30,14 @@ int main(int, char**) { assert(fl.get_allocator() == alloc); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp index b7be03f1062d..16c6f0b90f96 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp @@ -8,9 +8,9 @@ // -// forward_list() -// forward_list::iterator() -// forward_list::const_iterator() +// forward_list() // constexpr since C++26 +// forward_list::iterator() // constexpr since C++26 +// forward_list::const_iterator() // constexpr since C++26 #include #include @@ -33,7 +33,7 @@ struct B { }; #endif -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { A a; assert(a.d.empty()); @@ -49,5 +49,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp index 5ba0d61f104e..aab53351f00e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp @@ -8,7 +8,7 @@ // -// size_type max_size() const; +// size_type max_size() const; // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "test_macros.h" -int main(int, 
char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef limited_allocator A; typedef std::forward_list C; @@ -42,5 +42,14 @@ int main(int, char**) { assert(c.max_size() <= alloc_max_size(c.get_allocator())); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp index 31b3e900aabc..05f903dccafe 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp @@ -24,6 +24,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -54,6 +58,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -87,6 +95,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -126,6 +138,10 @@ # error 
"__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -171,6 +187,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifndef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should be defined in c++23" # endif @@ -219,6 +239,13 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26" # endif +# ifndef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should be defined in c++26" +# endif +# if __cpp_lib_constexpr_forward_list != 202502L +# error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index b1cc4afd3069..a13edacd1e46 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -196,6 +196,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_functional # error 
"__cpp_lib_constexpr_functional should not be defined before c++20" # endif @@ -1084,6 +1088,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should not be defined before c++20" # endif @@ -2074,6 +2082,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should not be defined before c++20" # endif @@ -3304,6 +3316,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifndef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should be defined in c++20" # endif @@ -4756,6 +4772,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifndef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should be defined in c++23" # endif @@ -6427,6 +6447,13 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26" # endif +# ifndef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should be defined in c++26" +# endif +# if __cpp_lib_constexpr_forward_list != 202502L +# error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should 
be defined in c++26" # endif diff --git a/libcxx/test/support/counting_predicates.h b/libcxx/test/support/counting_predicates.h index 6f34ce76302a..8fb2db1af70d 100644 --- a/libcxx/test/support/counting_predicates.h +++ b/libcxx/test/support/counting_predicates.h @@ -16,42 +16,44 @@ template struct unary_counting_predicate { public: - typedef Arg argument_type; - typedef bool result_type; + typedef Arg argument_type; + typedef bool result_type; - unary_counting_predicate(Predicate p) : p_(p), count_(0) {} - unary_counting_predicate(const unary_counting_predicate&) = default; - unary_counting_predicate& operator=(const unary_counting_predicate&) = default; - ~unary_counting_predicate() {} + TEST_CONSTEXPR_CXX20 unary_counting_predicate(Predicate p) : p_(p), count_(0) {} + unary_counting_predicate(const unary_counting_predicate&) = default; + unary_counting_predicate& operator=(const unary_counting_predicate&) = default; + TEST_CONSTEXPR_CXX20 ~unary_counting_predicate() {} - bool operator () (const Arg &a) const { ++count_; return p_(a); } - std::size_t count() const { return count_; } - void reset() { count_ = 0; } + TEST_CONSTEXPR_CXX14 bool operator()(const Arg& a) const { + ++count_; + return p_(a); + } + TEST_CONSTEXPR std::size_t count() const { return count_; } + TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; } private: - Predicate p_; - mutable std::size_t count_; + Predicate p_; + mutable std::size_t count_; }; - -template +template struct binary_counting_predicate { public: - typedef Arg1 first_argument_type; - typedef Arg2 second_argument_type; - typedef bool result_type; + typedef Arg1 first_argument_type; + typedef Arg2 second_argument_type; + typedef bool result_type; - TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {} - TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const { - ++count_; - return p_(a1, a2); - } - TEST_CONSTEXPR std::size_t count() const { return count_; } - TEST_CONSTEXPR_CXX14 void 
reset() { count_ = 0; } + TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {} + TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const { + ++count_; + return p_(a1, a2); + } + TEST_CONSTEXPR std::size_t count() const { return count_; } + TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; } - private: - Predicate p_; - mutable std::size_t count_; +private: + Predicate p_; + mutable std::size_t count_; }; #if TEST_STD_VER > 14 @@ -66,13 +68,13 @@ public: constexpr counting_predicate(Predicate pred, int& count) : pred_(std::move(pred)), count_(&count) {} template - constexpr decltype(auto) operator()(Args&& ...args) { + constexpr decltype(auto) operator()(Args&&... args) { ++(*count_); return pred_(std::forward(args)...); } template - constexpr decltype(auto) operator()(Args&& ...args) const { + constexpr decltype(auto) operator()(Args&&... args) const { ++(*count_); return pred_(std::forward(args)...); } diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py old mode 100755 new mode 100644 index 82f0d09db5c3..b59c7fdaf0a3 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -357,6 +357,11 @@ feature_test_macros = [ "values": {"c++20": 201907}, "headers": ["memory"], }, + { + "name": "__cpp_lib_constexpr_forward_list", + "values": {"c++26": 202502}, + "headers": ["forward_list"], + }, { "name": "__cpp_lib_constexpr_functional", "values": {"c++20": 201907}, From 5188bea9afac859fa6523e07d98748527c295aaf Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Wed, 11 Jun 2025 09:18:55 -0700 Subject: [PATCH 0005/1322] [llvm] annotate interfaces in llvm/TargetParser for DLL export (#143616) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose This patch is one in a series of code-mods that annotate LLVM’s public interface for export. 
This patch annotates the `llvm/TargetParser` library. These annotations currently have no meaningful impact on the LLVM build; however, they are a prerequisite to support an LLVM Windows DLL (shared library) build. ## Background This effort is tracked in #109483. Additional context is provided in [this discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307), and documentation for `LLVM_ABI` and related annotations is found in the LLVM repo [here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst). Most of these changes were generated automatically using the [Interface Definition Scanner (IDS)](https://github.com/compnerd/ids) tool, followed by formatting with `git clang-format`. Additionally, I manually removed the redundant declaration of `getCanonicalArchName` from llvm/include/llvm/TargetParser/ARMTargetParser.h because IDS only auto-annotates the first declaration it encounters, and the second un-annotated declaration results in an MSVC warning. ## Validation Local builds and tests were run to validate cross-platform compatibility.
This included llvm, clang, and lldb on the following configurations: - Windows with MSVC - Windows with Clang - Linux with GCC - Linux with Clang - Darwin with Clang --- .../llvm/TargetParser/AArch64TargetParser.h | 60 +++++---- .../llvm/TargetParser/ARMTargetParser.h | 71 +++++----- .../llvm/TargetParser/ARMTargetParserCommon.h | 13 +- .../llvm/TargetParser/CSKYTargetParser.h | 30 +++-- llvm/include/llvm/TargetParser/Host.h | 25 ++-- .../llvm/TargetParser/LoongArchTargetParser.h | 13 +- .../llvm/TargetParser/PPCTargetParser.h | 15 ++- llvm/include/llvm/TargetParser/RISCVISAInfo.h | 42 +++--- .../llvm/TargetParser/RISCVTargetParser.h | 42 +++--- .../llvm/TargetParser/SubtargetFeature.h | 17 +-- llvm/include/llvm/TargetParser/TargetParser.h | 29 +++-- llvm/include/llvm/TargetParser/Triple.h | 121 +++++++++--------- .../llvm/TargetParser/X86TargetParser.h | 33 ++--- 13 files changed, 271 insertions(+), 240 deletions(-) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 0338770593bc..59e8117ccb73 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -19,6 +19,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/SubtargetFeature.h" @@ -79,7 +80,7 @@ struct FMVInfo { : Name(Name), FeatureBit(FeatureBit), PriorityBit(PriorityBit), ID(ID) {}; }; -const std::vector &getFMVInfo(); +LLVM_ABI const std::vector &getFMVInfo(); // Represents a dependency between two architecture extensions. 
Later is the // feature which was added to the architecture after Earlier, and expands the @@ -146,7 +147,7 @@ struct ArchInfo { StringRef getSubArch() const { return ArchFeature.substr(1); } // Search for ArchInfo by SubArch name - static std::optional findBySubArch(StringRef SubArch); + LLVM_ABI static std::optional findBySubArch(StringRef SubArch); }; #define EMIT_ARCHITECTURES @@ -182,34 +183,36 @@ struct ExtensionSet { // Enable the given architecture extension, and any other extensions it // depends on. Does not change the base architecture, or follow dependencies // between features which are only related by required arcitecture versions. - void enable(ArchExtKind E); + LLVM_ABI void enable(ArchExtKind E); // Disable the given architecture extension, and any other extensions which // depend on it. Does not change the base architecture, or follow // dependencies between features which are only related by required // arcitecture versions. - void disable(ArchExtKind E); + LLVM_ABI void disable(ArchExtKind E); // Add default extensions for the given CPU. Records the base architecture, // to later resolve dependencies which depend on it. - void addCPUDefaults(const CpuInfo &CPU); + LLVM_ABI void addCPUDefaults(const CpuInfo &CPU); // Add default extensions for the given architecture version. Records the // base architecture, to later resolve dependencies which depend on it. - void addArchDefaults(const ArchInfo &Arch); + LLVM_ABI void addArchDefaults(const ArchInfo &Arch); // Add or remove a feature based on a modifier string. The string must be of // the form "" to enable a feature or "no" to disable it. This // will also enable or disable any features as required by the dependencies // between them. 
- bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false); + LLVM_ABI bool parseModifier(StringRef Modifier, + const bool AllowNoDashForm = false); // Constructs a new ExtensionSet by toggling the corresponding bits for every // feature in the \p Features list without expanding their dependencies. Used // for reconstructing an ExtensionSet from the output of toLLVMFeatures(). // Features that are not recognized are pushed back to \p NonExtensions. - void reconstructFromParsedFeatures(const std::vector &Features, - std::vector &NonExtensions); + LLVM_ABI void + reconstructFromParsedFeatures(const std::vector &Features, + std::vector &NonExtensions); // Convert the set of enabled extension to an LLVM feature list, appending // them to Features. @@ -227,7 +230,7 @@ struct ExtensionSet { } } - void dump() const; + LLVM_ABI void dump() const; }; // Name alias. @@ -239,52 +242,53 @@ struct Alias { #define EMIT_CPU_ALIAS #include "llvm/TargetParser/AArch64TargetParserDef.inc" -const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID)); +LLVM_ABI const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID)); -bool getExtensionFeatures( - const AArch64::ExtensionBitset &Extensions, - std::vector &Features); +LLVM_ABI bool getExtensionFeatures(const AArch64::ExtensionBitset &Extensions, + std::vector &Features); -StringRef getArchExtFeature(StringRef ArchExt); -StringRef resolveCPUAlias(StringRef CPU); +LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt); +LLVM_ABI StringRef resolveCPUAlias(StringRef CPU); // Information by Name -const ArchInfo *getArchForCpu(StringRef CPU); +LLVM_ABI const ArchInfo *getArchForCpu(StringRef CPU); // Parser -const ArchInfo *parseArch(StringRef Arch); +LLVM_ABI const ArchInfo *parseArch(StringRef Arch); // Return the extension which has the given -target-feature name. 
-std::optional targetFeatureToExtension(StringRef TargetFeature); +LLVM_ABI std::optional +targetFeatureToExtension(StringRef TargetFeature); // Parse a name as defined by the Extension class in tablegen. -std::optional parseArchExtension(StringRef Extension); +LLVM_ABI std::optional parseArchExtension(StringRef Extension); // Parse a name as defined by the FMVInfo class in tablegen. -std::optional parseFMVExtension(StringRef Extension); +LLVM_ABI std::optional parseFMVExtension(StringRef Extension); // Given the name of a CPU or alias, return the correponding CpuInfo. -std::optional parseCpu(StringRef Name); +LLVM_ABI std::optional parseCpu(StringRef Name); // Used by target parser tests -void fillValidCPUArchList(SmallVectorImpl &Values); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values); -bool isX18ReservedByDefault(const Triple &TT); +LLVM_ABI bool isX18ReservedByDefault(const Triple &TT); // For a given set of feature names, which can be either target-features, or // fmv-features metadata, expand their dependencies and then return a bitmask // corresponding to the entries of AArch64::FeatPriorities. -uint64_t getFMVPriority(ArrayRef Features); +LLVM_ABI uint64_t getFMVPriority(ArrayRef Features); // For a given set of FMV feature names, expand their dependencies and then // return a bitmask corresponding to the entries of AArch64::CPUFeatures. // The values in CPUFeatures are not bitmasks themselves, they are sequential // (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether // a certain FMV feature is available on the host. 
-uint64_t getCpuSupportsMask(ArrayRef Features); +LLVM_ABI uint64_t getCpuSupportsMask(ArrayRef Features); -void PrintSupportedExtensions(); +LLVM_ABI void PrintSupportedExtensions(); -void printEnabledExtensions(const std::set &EnabledFeatureNames); +LLVM_ABI void +printEnabledExtensions(const std::set &EnabledFeatureNames); } // namespace AArch64 } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h index b2403f42f1b7..798c578ced93 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParser.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/ARMTargetParserCommon.h" #include @@ -223,53 +224,55 @@ inline ArchKind &operator--(ArchKind &Kind) { } // Information by ID -StringRef getFPUName(FPUKind FPUKind); -FPUVersion getFPUVersion(FPUKind FPUKind); -NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind); -FPURestriction getFPURestriction(FPUKind FPUKind); +LLVM_ABI StringRef getFPUName(FPUKind FPUKind); +LLVM_ABI FPUVersion getFPUVersion(FPUKind FPUKind); +LLVM_ABI NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind); +LLVM_ABI FPURestriction getFPURestriction(FPUKind FPUKind); -bool getFPUFeatures(FPUKind FPUKind, std::vector &Features); -bool getHWDivFeatures(uint64_t HWDivKind, std::vector &Features); -bool getExtensionFeatures(uint64_t Extensions, - std::vector &Features); +LLVM_ABI bool getFPUFeatures(FPUKind FPUKind, std::vector &Features); +LLVM_ABI bool getHWDivFeatures(uint64_t HWDivKind, + std::vector &Features); +LLVM_ABI bool getExtensionFeatures(uint64_t Extensions, + std::vector &Features); -StringRef getArchName(ArchKind AK); -unsigned getArchAttr(ArchKind AK); -StringRef getCPUAttr(ArchKind AK); -StringRef getSubArch(ArchKind AK); -StringRef getArchExtName(uint64_t 
ArchExtKind); -StringRef getArchExtFeature(StringRef ArchExt); -bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt, - std::vector &Features, - FPUKind &ArgFPUKind); -ArchKind convertV9toV8(ArchKind AK); +LLVM_ABI StringRef getArchName(ArchKind AK); +LLVM_ABI unsigned getArchAttr(ArchKind AK); +LLVM_ABI StringRef getCPUAttr(ArchKind AK); +LLVM_ABI StringRef getSubArch(ArchKind AK); +LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind); +LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt); +LLVM_ABI bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, + StringRef ArchExt, + std::vector &Features, + FPUKind &ArgFPUKind); +LLVM_ABI ArchKind convertV9toV8(ArchKind AK); // Information by Name -FPUKind getDefaultFPU(StringRef CPU, ArchKind AK); -uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK); -StringRef getDefaultCPU(StringRef Arch); -StringRef getCanonicalArchName(StringRef Arch); -StringRef getFPUSynonym(StringRef FPU); +LLVM_ABI FPUKind getDefaultFPU(StringRef CPU, ArchKind AK); +LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK); +LLVM_ABI StringRef getDefaultCPU(StringRef Arch); +LLVM_ABI StringRef getFPUSynonym(StringRef FPU); // Parser -uint64_t parseHWDiv(StringRef HWDiv); -FPUKind parseFPU(StringRef FPU); -ArchKind parseArch(StringRef Arch); -uint64_t parseArchExt(StringRef ArchExt); -ArchKind parseCPUArch(StringRef CPU); -ProfileKind parseArchProfile(StringRef Arch); -unsigned parseArchVersion(StringRef Arch); +LLVM_ABI uint64_t parseHWDiv(StringRef HWDiv); +LLVM_ABI FPUKind parseFPU(StringRef FPU); +LLVM_ABI ArchKind parseArch(StringRef Arch); +LLVM_ABI uint64_t parseArchExt(StringRef ArchExt); +LLVM_ABI ArchKind parseCPUArch(StringRef CPU); +LLVM_ABI ProfileKind parseArchProfile(StringRef Arch); +LLVM_ABI unsigned parseArchVersion(StringRef Arch); -void fillValidCPUArchList(SmallVectorImpl &Values); -StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU); +LLVM_ABI void 
fillValidCPUArchList(SmallVectorImpl &Values); +LLVM_ABI StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU); /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting. /// /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty /// string then the triple's arch name is used. -StringRef getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch = {}); +LLVM_ABI StringRef getARMCPUForArch(const llvm::Triple &Triple, + StringRef MArch = {}); -void PrintSupportedExtensions(StringMap DescMap); +LLVM_ABI void PrintSupportedExtensions(StringMap DescMap); } // namespace ARM } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h index f6115718e9f5..7c8030dd5576 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h @@ -14,6 +14,7 @@ #define LLVM_TARGETPARSER_ARMTARGETPARSERCOMMON_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" namespace llvm { namespace ARM { @@ -23,19 +24,19 @@ enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 }; enum class EndianKind { INVALID = 0, LITTLE, BIG }; /// Converts e.g. "armv8" -> "armv8-a" -StringRef getArchSynonym(StringRef Arch); +LLVM_ABI StringRef getArchSynonym(StringRef Arch); /// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but /// (iwmmxt|xscale)(eb)? is also permitted. If the former, return /// "v.+", if the latter, return unmodified string, minus 'eb'. /// If invalid, return empty string. 
-StringRef getCanonicalArchName(StringRef Arch); +LLVM_ABI StringRef getCanonicalArchName(StringRef Arch); // ARM, Thumb, AArch64 -ISAKind parseArchISA(StringRef Arch); +LLVM_ABI ISAKind parseArchISA(StringRef Arch); // Little/Big endian -EndianKind parseArchEndian(StringRef Arch); +LLVM_ABI EndianKind parseArchEndian(StringRef Arch); struct ParsedBranchProtection { StringRef Scope; @@ -45,8 +46,8 @@ struct ParsedBranchProtection { bool GuardedControlStack; }; -bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, - StringRef &Err, bool EnablePAuthLR = false); +LLVM_ABI bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, + StringRef &Err, bool EnablePAuthLR = false); } // namespace ARM } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/CSKYTargetParser.h b/llvm/include/llvm/TargetParser/CSKYTargetParser.h index 4c4ec06f758a..8eab03ca0149 100644 --- a/llvm/include/llvm/TargetParser/CSKYTargetParser.h +++ b/llvm/include/llvm/TargetParser/CSKYTargetParser.h @@ -15,6 +15,7 @@ #ifndef LLVM_TARGETPARSER_CSKYTARGETPARSER_H #define LLVM_TARGETPARSER_CSKYTARGETPARSER_H +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" #include @@ -176,25 +177,26 @@ const ArchNames ARCHNames[] = { #include "llvm/TargetParser/CSKYTargetParser.def" }; -StringRef getArchName(ArchKind AK); -StringRef getDefaultCPU(StringRef Arch); -StringRef getArchExtName(uint64_t ArchExtKind); -StringRef getArchExtFeature(StringRef ArchExt); -uint64_t getDefaultExtensions(StringRef CPU); -bool getExtensionFeatures(uint64_t Extensions, - std::vector &Features); +LLVM_ABI StringRef getArchName(ArchKind AK); +LLVM_ABI StringRef getDefaultCPU(StringRef Arch); +LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind); +LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt); +LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU); +LLVM_ABI bool getExtensionFeatures(uint64_t Extensions, + std::vector &Features); // Information by ID -StringRef 
getFPUName(unsigned FPUKind); -FPUVersion getFPUVersion(unsigned FPUKind); +LLVM_ABI StringRef getFPUName(unsigned FPUKind); +LLVM_ABI FPUVersion getFPUVersion(unsigned FPUKind); -bool getFPUFeatures(CSKYFPUKind Kind, std::vector &Features); +LLVM_ABI bool getFPUFeatures(CSKYFPUKind Kind, + std::vector &Features); // Parser -ArchKind parseArch(StringRef Arch); -ArchKind parseCPUArch(StringRef CPU); -uint64_t parseArchExt(StringRef ArchExt); -void fillValidCPUArchList(SmallVectorImpl &Values); +LLVM_ABI ArchKind parseArch(StringRef Arch); +LLVM_ABI ArchKind parseCPUArch(StringRef CPU); +LLVM_ABI uint64_t parseArchExt(StringRef ArchExt); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values); } // namespace CSKY diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h index 443f4f583b55..be3d41e022ad 100644 --- a/llvm/include/llvm/TargetParser/Host.h +++ b/llvm/include/llvm/TargetParser/Host.h @@ -13,6 +13,7 @@ #ifndef LLVM_TARGETPARSER_HOST_H #define LLVM_TARGETPARSER_HOST_H +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -30,18 +31,18 @@ namespace sys { /// CPU_TYPE-VENDOR-OPERATING_SYSTEM /// or /// CPU_TYPE-VENDOR-KERNEL-OPERATING_SYSTEM -std::string getDefaultTargetTriple(); +LLVM_ABI std::string getDefaultTargetTriple(); /// getProcessTriple() - Return an appropriate target triple for generating /// code to be loaded into the current process, e.g. when using the JIT. -std::string getProcessTriple(); +LLVM_ABI std::string getProcessTriple(); /// getHostCPUName - Get the LLVM name for the host CPU. The particular format /// of the name is target dependent, and suitable for passing as -mcpu to the /// target which matches the host. /// /// \return - The host CPU name, or empty if the CPU could not be determined. -StringRef getHostCPUName(); +LLVM_ABI StringRef getHostCPUName(); /// getHostCPUFeatures - Get the LLVM names for the host CPU features. 
/// The particular format of the names are target dependent, and suitable for @@ -52,20 +53,20 @@ StringRef getHostCPUName(); /// which features may appear in this map, except that they are all valid LLVM /// feature names. The map can be empty, for example if feature detection /// fails. -const StringMap getHostCPUFeatures(); +LLVM_ABI const StringMap getHostCPUFeatures(); /// This is a function compatible with cl::AddExtraVersionPrinter, which adds /// info about the current target triple and detected CPU. -void printDefaultTargetAndDetectedCPU(raw_ostream &OS); +LLVM_ABI void printDefaultTargetAndDetectedCPU(raw_ostream &OS); namespace detail { /// Helper functions to extract HostCPUName from /proc/cpuinfo on linux. -StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForBPF(); +LLVM_ABI StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForBPF(); /// Helper functions to extract CPU details from CPUID on x86. namespace x86 { @@ -78,7 +79,7 @@ enum class VendorSignatures { /// Returns the host CPU's vendor. /// MaxLeaf: if a non-nullptr pointer is specified, the EAX value will be /// assigned to its pointee. 
-VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr); +LLVM_ABI VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr); } // namespace x86 } // namespace detail } // namespace sys diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h index a28e4e9eff81..1357d7474459 100644 --- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h @@ -14,6 +14,7 @@ #ifndef LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H #define LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" #include @@ -84,12 +85,12 @@ struct ArchInfo { uint32_t Features; }; -bool isValidArchName(StringRef Arch); -bool isValidFeatureName(StringRef Feature); -bool getArchFeatures(StringRef Arch, std::vector &Features); -bool isValidCPUName(StringRef TuneCPU); -void fillValidCPUList(SmallVectorImpl &Values); -StringRef getDefaultArch(bool Is64Bit); +LLVM_ABI bool isValidArchName(StringRef Arch); +LLVM_ABI bool isValidFeatureName(StringRef Feature); +LLVM_ABI bool getArchFeatures(StringRef Arch, std::vector &Features); +LLVM_ABI bool isValidCPUName(StringRef TuneCPU); +LLVM_ABI void fillValidCPUList(SmallVectorImpl &Values); +LLVM_ABI StringRef getDefaultArch(bool Is64Bit); } // namespace LoongArch diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h index 5f9fe543aff0..59d9f867005a 100644 --- a/llvm/include/llvm/TargetParser/PPCTargetParser.h +++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h @@ -15,25 +15,28 @@ #define LLVM_TARGETPARSER_PPCTARGETPARSER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" namespace llvm { namespace PPC { -bool isValidCPU(StringRef CPU); -void fillValidCPUList(SmallVectorImpl &Values); -void fillValidTuneCPUList(SmallVectorImpl &Values); +LLVM_ABI bool 
isValidCPU(StringRef CPU); +LLVM_ABI void fillValidCPUList(SmallVectorImpl &Values); +LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl &Values); // Get target CPU name. // If CPUName is empty or generic, return the default CPU name. // If CPUName is not empty or generic, return the normalized CPU name. -StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName = ""); +LLVM_ABI StringRef getNormalizedPPCTargetCPU(const Triple &T, + StringRef CPUName = ""); // Get the tune CPU name. -StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName = ""); +LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T, + StringRef CPUName = ""); // For PPC, there are some cpu names for same CPU, like pwr10 and power10, // normalize them. -StringRef normalizeCPUName(StringRef CPUName); +LLVM_ABI StringRef normalizeCPUName(StringRef CPUName); } // namespace PPC } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h index 5b2b6f29fd3d..0c308cadba79 100644 --- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h +++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h @@ -11,6 +11,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/RISCVISAUtils.h" @@ -31,27 +32,27 @@ public: /// extensions with unrecognised versions will be silently dropped, except /// for the special case of the base 'i' and 'e' extensions, where the /// default version will be used (as ignoring the base is not possible). - static llvm::Expected> + LLVM_ABI static llvm::Expected> parseArchString(StringRef Arch, bool EnableExperimentalExtension, bool ExperimentalExtensionVersionCheck = true); /// Parse RISC-V ISA info from an arch string that is already in normalized /// form (as defined in the psABI). Unlike parseArchString, this function /// will not error for unrecognized extension names or extension versions. 
- static llvm::Expected> + LLVM_ABI static llvm::Expected> parseNormalizedArchString(StringRef Arch); /// Parse RISC-V ISA info from feature vector. - static llvm::Expected> + LLVM_ABI static llvm::Expected> parseFeatures(unsigned XLen, const std::vector &Features); - static llvm::Expected> + LLVM_ABI static llvm::Expected> createFromExtMap(unsigned XLen, const RISCVISAUtils::OrderedExtensionMap &Exts); /// Convert RISC-V ISA info to a feature vector. - std::vector toFeatures(bool AddAllExtensions = false, - bool IgnoreUnknown = true) const; + LLVM_ABI std::vector toFeatures(bool AddAllExtensions = false, + bool IgnoreUnknown = true) const; const RISCVISAUtils::OrderedExtensionMap &getExtensions() const { return Exts; @@ -64,25 +65,26 @@ public: unsigned getMaxELen() const { return MaxELen; } unsigned getMaxELenFp() const { return MaxELenFp; } - bool hasExtension(StringRef Ext) const; - std::string toString() const; - StringRef computeDefaultABI() const; + LLVM_ABI bool hasExtension(StringRef Ext) const; + LLVM_ABI std::string toString() const; + LLVM_ABI StringRef computeDefaultABI() const; - static bool isSupportedExtensionFeature(StringRef Ext); - static bool isSupportedExtension(StringRef Ext); - static bool isSupportedExtensionWithVersion(StringRef Ext); - static bool isSupportedExtension(StringRef Ext, unsigned MajorVersion, - unsigned MinorVersion); - static std::string getTargetFeatureForExtension(StringRef Ext); + LLVM_ABI static bool isSupportedExtensionFeature(StringRef Ext); + LLVM_ABI static bool isSupportedExtension(StringRef Ext); + LLVM_ABI static bool isSupportedExtensionWithVersion(StringRef Ext); + LLVM_ABI static bool isSupportedExtension(StringRef Ext, + unsigned MajorVersion, + unsigned MinorVersion); + LLVM_ABI static std::string getTargetFeatureForExtension(StringRef Ext); - static void printSupportedExtensions(StringMap &DescMap); - static void printEnabledExtensions(bool IsRV64, - std::set &EnabledFeatureNames, - StringMap &DescMap); + 
LLVM_ABI static void printSupportedExtensions(StringMap &DescMap); + LLVM_ABI static void + printEnabledExtensions(bool IsRV64, std::set &EnabledFeatureNames, + StringMap &DescMap); /// Return the group id and bit position of __riscv_feature_bits. Returns /// <-1, -1> if not supported. - static std::pair getRISCVFeaturesBitsInfo(StringRef Ext); + LLVM_ABI static std::pair getRISCVFeaturesBitsInfo(StringRef Ext); // The maximum value of the group ID obtained from getRISCVFeaturesBitsInfo. static constexpr unsigned FeatureBitSize = 2; diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index a529479b546d..41fdab6012aa 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -15,6 +15,7 @@ #define LLVM_TARGETPARSER_RISCVTARGETPARSER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -43,18 +44,20 @@ struct CPUInfo { static constexpr unsigned RVVBitsPerBlock = 64; static constexpr unsigned RVVBytesPerBlock = RVVBitsPerBlock / 8; -void getFeaturesForCPU(StringRef CPU, - SmallVectorImpl &EnabledFeatures, - bool NeedPlus = false); -bool parseCPU(StringRef CPU, bool IsRV64); -bool parseTuneCPU(StringRef CPU, bool IsRV64); -StringRef getMArchFromMcpu(StringRef CPU); -void fillValidCPUArchList(SmallVectorImpl &Values, bool IsRV64); -void fillValidTuneCPUArchList(SmallVectorImpl &Values, bool IsRV64); -bool hasFastScalarUnalignedAccess(StringRef CPU); -bool hasFastVectorUnalignedAccess(StringRef CPU); -bool hasValidCPUModel(StringRef CPU); -CPUModel getCPUModel(StringRef CPU); +LLVM_ABI void getFeaturesForCPU(StringRef CPU, + SmallVectorImpl &EnabledFeatures, + bool NeedPlus = false); +LLVM_ABI bool parseCPU(StringRef CPU, bool IsRV64); +LLVM_ABI bool parseTuneCPU(StringRef CPU, bool IsRV64); +LLVM_ABI StringRef getMArchFromMcpu(StringRef CPU); 
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values, + bool IsRV64); +LLVM_ABI void fillValidTuneCPUArchList(SmallVectorImpl &Values, + bool IsRV64); +LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU); +LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU); +LLVM_ABI bool hasValidCPUModel(StringRef CPU); +LLVM_ABI CPUModel getCPUModel(StringRef CPU); } // namespace RISCV @@ -86,10 +89,10 @@ inline static bool isValidLMUL(unsigned LMUL, bool Fractional) { return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1); } -unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic, - bool MaskAgnostic); +LLVM_ABI unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic, + bool MaskAgnostic); -unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt); +LLVM_ABI unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt); inline static VLMUL getVLMUL(unsigned VType) { unsigned VLMul = VType & 0x7; @@ -97,7 +100,7 @@ inline static VLMUL getVLMUL(unsigned VType) { } // Decode VLMUL into 1,2,4,8 and fractional indicator. 
-std::pair decodeVLMUL(VLMUL VLMul); +LLVM_ABI std::pair decodeVLMUL(VLMUL VLMul); inline static VLMUL encodeLMUL(unsigned LMUL, bool Fractional) { assert(isValidLMUL(LMUL, Fractional) && "Unsupported LMUL"); @@ -148,11 +151,12 @@ inline static bool isMaskAgnostic(unsigned VType) { return VType & 0x80; } inline static bool isAltFmt(unsigned VType) { return VType & 0x100; } -void printVType(unsigned VType, raw_ostream &OS); +LLVM_ABI void printVType(unsigned VType, raw_ostream &OS); -unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul); +LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul); -std::optional getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, unsigned EEW); +LLVM_ABI std::optional getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, + unsigned EEW); } // namespace RISCVVType } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h index 2e1f00dad2df..6f1723dec5d0 100644 --- a/llvm/include/llvm/TargetParser/SubtargetFeature.h +++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include #include @@ -175,27 +176,27 @@ class SubtargetFeatures { std::vector Features; ///< Subtarget features as a vector public: - explicit SubtargetFeatures(StringRef Initial = ""); + LLVM_ABI explicit SubtargetFeatures(StringRef Initial = ""); /// Returns features as a string. - std::string getString() const; + LLVM_ABI std::string getString() const; /// Adds Features. - void AddFeature(StringRef String, bool Enable = true); + LLVM_ABI void AddFeature(StringRef String, bool Enable = true); - void addFeaturesVector(const ArrayRef OtherFeatures); + LLVM_ABI void addFeaturesVector(const ArrayRef OtherFeatures); /// Returns the vector of individual subtarget features. 
const std::vector &getFeatures() const { return Features; } /// Prints feature string. - void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; // Dumps feature info. - void dump() const; + LLVM_ABI void dump() const; /// Adds the default features for the specified target triple. - void getDefaultSubtargetFeatures(const Triple& Triple); + LLVM_ABI void getDefaultSubtargetFeatures(const Triple &Triple); /// Determine if a feature has a flag; '+' or '-' static bool hasFlag(StringRef Feature) { @@ -221,7 +222,7 @@ public: } /// Splits a string of comma separated items in to a vector of strings. - static void Split(std::vector &V, StringRef S); + LLVM_ABI static void Split(std::vector &V, StringRef S); }; } // end namespace llvm diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index f776b41f3d7c..176205e17ae0 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" namespace llvm { @@ -164,27 +165,27 @@ enum FeatureError : uint32_t { UNSUPPORTED_TARGET_FEATURE }; -StringRef getArchFamilyNameAMDGCN(GPUKind AK); +LLVM_ABI StringRef getArchFamilyNameAMDGCN(GPUKind AK); -StringRef getArchNameAMDGCN(GPUKind AK); -StringRef getArchNameR600(GPUKind AK); -StringRef getCanonicalArchName(const Triple &T, StringRef Arch); -GPUKind parseArchAMDGCN(StringRef CPU); -GPUKind parseArchR600(StringRef CPU); -unsigned getArchAttrAMDGCN(GPUKind AK); -unsigned getArchAttrR600(GPUKind AK); +LLVM_ABI StringRef getArchNameAMDGCN(GPUKind AK); +LLVM_ABI StringRef getArchNameR600(GPUKind AK); +LLVM_ABI StringRef getCanonicalArchName(const Triple &T, StringRef Arch); +LLVM_ABI GPUKind parseArchAMDGCN(StringRef CPU); +LLVM_ABI GPUKind parseArchR600(StringRef CPU); +LLVM_ABI unsigned getArchAttrAMDGCN(GPUKind AK); +LLVM_ABI unsigned 
getArchAttrR600(GPUKind AK); -void fillValidArchListAMDGCN(SmallVectorImpl &Values); -void fillValidArchListR600(SmallVectorImpl &Values); +LLVM_ABI void fillValidArchListAMDGCN(SmallVectorImpl &Values); +LLVM_ABI void fillValidArchListR600(SmallVectorImpl &Values); -IsaVersion getIsaVersion(StringRef GPU); +LLVM_ABI IsaVersion getIsaVersion(StringRef GPU); /// Fills Features map with default values for given target GPU -void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, - StringMap &Features); +LLVM_ABI void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, + StringMap &Features); /// Inserts wave size feature for given GPU into features map -std::pair +LLVM_ABI std::pair insertWaveSizeFeature(StringRef GPU, const Triple &T, StringMap &Features); diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index b56e6e18805e..b6f15ef13191 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -10,6 +10,7 @@ #define LLVM_TARGETPARSER_TRIPLE_H #include "llvm/ADT/Twine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" // Some system headers or GCC predefined macros conflict with identifiers in @@ -348,10 +349,11 @@ public: /// triple fields unknown. Triple() = default; - explicit Triple(const Twine &Str); - Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr); - Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr, - const Twine &EnvironmentStr); + LLVM_ABI explicit Triple(const Twine &Str); + LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr, + const Twine &OSStr); + LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr, + const Twine &OSStr, const Twine &EnvironmentStr); bool operator==(const Triple &Other) const { return Arch == Other.Arch && SubArch == Other.SubArch && @@ -381,8 +383,8 @@ public: /// reasonably be done). 
In particular, it handles the common case in which /// otherwise valid components are in the wrong order. \p Form is used to /// specify the output canonical form. - static std::string normalize(StringRef Str, - CanonicalForm Form = CanonicalForm::ANY); + LLVM_ABI static std::string + normalize(StringRef Str, CanonicalForm Form = CanonicalForm::ANY); /// Return the normalized form of this triple's string. std::string normalize(CanonicalForm Form = CanonicalForm::ANY) const { @@ -417,7 +419,7 @@ public: /// triple, if present. /// /// For example, "fooos1.2.3" would return (1, 2, 3). - VersionTuple getEnvironmentVersion() const; + LLVM_ABI VersionTuple getEnvironmentVersion() const; /// Get the object format for this triple. ObjectFormatType getObjectFormat() const { return ObjectFormat; } @@ -426,7 +428,7 @@ public: /// present. /// /// For example, "fooos1.2.3" would return (1, 2, 3). - VersionTuple getOSVersion() const; + LLVM_ABI VersionTuple getOSVersion() const; /// Return just the major version number, this is specialized because it is a /// common query. @@ -436,26 +438,26 @@ public: /// "darwin" versions to the corresponding OS X versions. This may also be /// called with IOS triples but the OS X version number is just set to a /// constant 10.4.0 in that case. Returns true if successful. - bool getMacOSXVersion(VersionTuple &Version) const; + LLVM_ABI bool getMacOSXVersion(VersionTuple &Version) const; /// Parse the version number as with getOSVersion. This should only be called /// with IOS or generic triples. - VersionTuple getiOSVersion() const; + LLVM_ABI VersionTuple getiOSVersion() const; /// Parse the version number as with getOSVersion. This should only be called /// with WatchOS or generic triples. - VersionTuple getWatchOSVersion() const; + LLVM_ABI VersionTuple getWatchOSVersion() const; /// Parse the version number as with getOSVersion. 
- VersionTuple getDriverKitVersion() const; + LLVM_ABI VersionTuple getDriverKitVersion() const; /// Parse the Vulkan version number from the OSVersion and SPIR-V version /// (SubArch). This should only be called with Vulkan SPIR-V triples. - VersionTuple getVulkanVersion() const; + LLVM_ABI VersionTuple getVulkanVersion() const; /// Parse the DXIL version number from the OSVersion and DXIL version /// (SubArch). This should only be called with DXIL triples. - VersionTuple getDXILVersion() const; + LLVM_ABI VersionTuple getDXILVersion() const; /// @} /// @name Direct Component Access @@ -469,34 +471,34 @@ public: bool empty() const { return Data.empty(); } /// Get the architecture (first) component of the triple. - StringRef getArchName() const; + LLVM_ABI StringRef getArchName() const; /// Get the vendor (second) component of the triple. - StringRef getVendorName() const; + LLVM_ABI StringRef getVendorName() const; /// Get the operating system (third) component of the triple. - StringRef getOSName() const; + LLVM_ABI StringRef getOSName() const; /// Get the optional environment (fourth) component of the triple, or "" if /// empty. - StringRef getEnvironmentName() const; + LLVM_ABI StringRef getEnvironmentName() const; /// Get the operating system and optional environment components as a single /// string (separated by a '-' if the environment component is present). - StringRef getOSAndEnvironmentName() const; + LLVM_ABI StringRef getOSAndEnvironmentName() const; /// Get the version component of the environment component as a single /// string (the version after the environment). /// /// For example, "fooos1.2.3" would return "1.2.3". - StringRef getEnvironmentVersionString() const; + LLVM_ABI StringRef getEnvironmentVersionString() const; /// @} /// @name Convenience Predicates /// @{ /// Returns the pointer width of this architecture. 
- static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch); + LLVM_ABI static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch); /// Returns the pointer width of this architecture. unsigned getArchPointerBitWidth() const { @@ -504,7 +506,7 @@ public: } /// Returns the trampoline size in bytes for this configuration. - unsigned getTrampolineSize() const; + LLVM_ABI unsigned getTrampolineSize() const; /// Test whether the architecture is 64-bit /// @@ -513,17 +515,17 @@ public: /// 16-bit. The inner details of pointer width for particular architectures /// is not summed up in the triple, and so only a coarse grained predicate /// system is provided. - bool isArch64Bit() const; + LLVM_ABI bool isArch64Bit() const; /// Test whether the architecture is 32-bit /// /// Note that this tests for 32-bit pointer width, and nothing else. - bool isArch32Bit() const; + LLVM_ABI bool isArch32Bit() const; /// Test whether the architecture is 16-bit /// /// Note that this tests for 16-bit pointer width, and nothing else. - bool isArch16Bit() const; + LLVM_ABI bool isArch16Bit() const; /// Helper function for doing comparisons against version numbers included in /// the target triple. @@ -544,8 +546,8 @@ public: /// Comparison function for checking OS X version compatibility, which handles /// supporting skewed version numbering schemes used by the "darwin" triples. - bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, - unsigned Micro = 0) const; + LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const; /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin" /// and "osx" as OS X triples. @@ -1171,38 +1173,38 @@ public: /// @{ /// Set the architecture (first) component of the triple to a known type. 
- void setArch(ArchType Kind, SubArchType SubArch = NoSubArch); + LLVM_ABI void setArch(ArchType Kind, SubArchType SubArch = NoSubArch); /// Set the vendor (second) component of the triple to a known type. - void setVendor(VendorType Kind); + LLVM_ABI void setVendor(VendorType Kind); /// Set the operating system (third) component of the triple to a known type. - void setOS(OSType Kind); + LLVM_ABI void setOS(OSType Kind); /// Set the environment (fourth) component of the triple to a known type. - void setEnvironment(EnvironmentType Kind); + LLVM_ABI void setEnvironment(EnvironmentType Kind); /// Set the object file format. - void setObjectFormat(ObjectFormatType Kind); + LLVM_ABI void setObjectFormat(ObjectFormatType Kind); /// Set all components to the new triple \p Str. - void setTriple(const Twine &Str); + LLVM_ABI void setTriple(const Twine &Str); /// Set the architecture (first) component of the triple by name. - void setArchName(StringRef Str); + LLVM_ABI void setArchName(StringRef Str); /// Set the vendor (second) component of the triple by name. - void setVendorName(StringRef Str); + LLVM_ABI void setVendorName(StringRef Str); /// Set the operating system (third) component of the triple by name. - void setOSName(StringRef Str); + LLVM_ABI void setOSName(StringRef Str); /// Set the optional environment (fourth) component of the triple by name. - void setEnvironmentName(StringRef Str); + LLVM_ABI void setEnvironmentName(StringRef Str); /// Set the operating system and optional environment components with a single /// string. - void setOSAndEnvironmentName(StringRef Str); + LLVM_ABI void setOSAndEnvironmentName(StringRef Str); /// @} /// @name Helpers to build variants of a particular triple. @@ -1214,7 +1216,7 @@ public: /// /// \returns A new triple with a 32-bit architecture or an unknown /// architecture if no such variant can be found. 
- llvm::Triple get32BitArchVariant() const; + LLVM_ABI llvm::Triple get32BitArchVariant() const; /// Form a triple with a 64-bit variant of the current architecture. /// @@ -1222,7 +1224,7 @@ public: /// /// \returns A new triple with a 64-bit architecture or an unknown /// architecture if no such variant can be found. - llvm::Triple get64BitArchVariant() const; + LLVM_ABI llvm::Triple get64BitArchVariant() const; /// Form a triple with a big endian variant of the current architecture. /// @@ -1230,7 +1232,7 @@ public: /// /// \returns A new triple with a big endian architecture or an unknown /// architecture if no such variant can be found. - llvm::Triple getBigEndianArchVariant() const; + LLVM_ABI llvm::Triple getBigEndianArchVariant() const; /// Form a triple with a little endian variant of the current architecture. /// @@ -1238,73 +1240,76 @@ public: /// /// \returns A new triple with a little endian architecture or an unknown /// architecture if no such variant can be found. - llvm::Triple getLittleEndianArchVariant() const; + LLVM_ABI llvm::Triple getLittleEndianArchVariant() const; /// Tests whether the target triple is little endian. /// /// \returns true if the triple is little endian, false otherwise. - bool isLittleEndian() const; + LLVM_ABI bool isLittleEndian() const; /// Test whether target triples are compatible. - bool isCompatibleWith(const Triple &Other) const; + LLVM_ABI bool isCompatibleWith(const Triple &Other) const; /// Test whether the target triple is for a GPU. bool isGPU() const { return isSPIRV() || isNVPTX() || isAMDGPU(); } /// Merge target triples. - std::string merge(const Triple &Other) const; + LLVM_ABI std::string merge(const Triple &Other) const; /// Some platforms have different minimum supported OS versions that /// varies by the architecture specified in the triple. This function /// returns the minimum supported OS version for this triple if one an exists, /// or an invalid version tuple if this triple doesn't have one. 
- VersionTuple getMinimumSupportedOSVersion() const; + LLVM_ABI VersionTuple getMinimumSupportedOSVersion() const; /// @} /// @name Static helpers for IDs. /// @{ /// Get the canonical name for the \p Kind architecture. - static StringRef getArchTypeName(ArchType Kind); + LLVM_ABI static StringRef getArchTypeName(ArchType Kind); /// Get the architecture name based on \p Kind and \p SubArch. - static StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch); + LLVM_ABI static StringRef getArchName(ArchType Kind, + SubArchType SubArch = NoSubArch); /// Get the "prefix" canonical name for the \p Kind architecture. This is the /// prefix used by the architecture specific builtins, and is suitable for /// passing to \see Intrinsic::getIntrinsicForClangBuiltin(). /// /// \return - The architecture prefix, or 0 if none is defined. - static StringRef getArchTypePrefix(ArchType Kind); + LLVM_ABI static StringRef getArchTypePrefix(ArchType Kind); /// Get the canonical name for the \p Kind vendor. - static StringRef getVendorTypeName(VendorType Kind); + LLVM_ABI static StringRef getVendorTypeName(VendorType Kind); /// Get the canonical name for the \p Kind operating system. - static StringRef getOSTypeName(OSType Kind); + LLVM_ABI static StringRef getOSTypeName(OSType Kind); /// Get the canonical name for the \p Kind environment. - static StringRef getEnvironmentTypeName(EnvironmentType Kind); + LLVM_ABI static StringRef getEnvironmentTypeName(EnvironmentType Kind); /// Get the name for the \p Object format. - static StringRef getObjectFormatTypeName(ObjectFormatType ObjectFormat); + LLVM_ABI static StringRef + getObjectFormatTypeName(ObjectFormatType ObjectFormat); /// @} /// @name Static helpers for converting alternate architecture names. /// @{ /// The canonical type for the given LLVM architecture name (e.g., "x86"). 
- static ArchType getArchTypeForLLVMName(StringRef Str); + LLVM_ABI static ArchType getArchTypeForLLVMName(StringRef Str); /// @} /// Returns a canonicalized OS version number for the specified OS. - static VersionTuple getCanonicalVersionForOS(OSType OSKind, - const VersionTuple &Version, - bool IsInValidRange); + LLVM_ABI static VersionTuple + getCanonicalVersionForOS(OSType OSKind, const VersionTuple &Version, + bool IsInValidRange); /// Returns whether an OS version is invalid and would not map to an Apple OS. - static bool isValidVersionForOS(OSType OSKind, const VersionTuple &Version); + LLVM_ABI static bool isValidVersionForOS(OSType OSKind, + const VersionTuple &Version); }; } // End llvm namespace diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h index 8447aca7bb92..f6aeaada346e 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.h +++ b/llvm/include/llvm/TargetParser/X86TargetParser.h @@ -15,6 +15,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -153,34 +154,36 @@ enum CPUKind { /// Parse \p CPU string into a CPUKind. Will only accept 64-bit capable CPUs if /// \p Only64Bit is true. -CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false); -CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false); +LLVM_ABI CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false); +LLVM_ABI CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false); /// Provide a list of valid CPU names. If \p Only64Bit is true, the list will /// only contain 64-bit capable CPUs. -void fillValidCPUArchList(SmallVectorImpl &Values, - bool Only64Bit = false); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values, + bool Only64Bit = false); /// Provide a list of valid -mtune names. 
-void fillValidTuneCPUList(SmallVectorImpl &Values, - bool Only64Bit = false); +LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl &Values, + bool Only64Bit = false); /// Get the key feature prioritizing target multiversioning. -ProcessorFeatures getKeyFeature(CPUKind Kind); +LLVM_ABI ProcessorFeatures getKeyFeature(CPUKind Kind); /// Fill in the features that \p CPU supports into \p Features. /// "+" will be append in front of each feature if NeedPlus is true. -void getFeaturesForCPU(StringRef CPU, SmallVectorImpl &Features, - bool NeedPlus = false); +LLVM_ABI void getFeaturesForCPU(StringRef CPU, + SmallVectorImpl &Features, + bool NeedPlus = false); /// Set or clear entries in \p Features that are implied to be enabled/disabled /// by the provided \p Feature. -void updateImpliedFeatures(StringRef Feature, bool Enabled, - StringMap &Features); +LLVM_ABI void updateImpliedFeatures(StringRef Feature, bool Enabled, + StringMap &Features); -char getCPUDispatchMangling(StringRef Name); -bool validateCPUSpecificCPUDispatch(StringRef Name); -std::array getCpuSupportsMask(ArrayRef FeatureStrs); -unsigned getFeaturePriority(ProcessorFeatures Feat); +LLVM_ABI char getCPUDispatchMangling(StringRef Name); +LLVM_ABI bool validateCPUSpecificCPUDispatch(StringRef Name); +LLVM_ABI std::array +getCpuSupportsMask(ArrayRef FeatureStrs); +LLVM_ABI unsigned getFeaturePriority(ProcessorFeatures Feat); } // namespace X86 } // namespace llvm From 8f8ed23c6247e9c1dd2df4494930813b353c52c4 Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Wed, 11 Jun 2025 09:19:13 -0700 Subject: [PATCH 0006/1322] [llvm] annotate interfaces in llvm/SandboxIR for DLL export (#142863) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose This patch is one in a series of code-mods that annotate LLVM’s public interface for export. This patch annotates the `llvm/SandboxIR` library. 
These annotations currently have no meaningful impact on the LLVM build; however, they are a prerequisite to support an LLVM Windows DLL (shared library) build. ## Background This effort is tracked in #109483. Additional context is provided in [this discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307), and documentation for `LLVM_ABI` and related annotations is found in the LLVM repo [here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst). The bulk of these changes were generated automatically using the [Interface Definition Scanner (IDS)](https://github.com/compnerd/ids) tool, followed by formatting with `git clang-format`. The following manual adjustments were also applied after running IDS on Linux: - Remove explicit `GlobalWithNodeAPI::LLVMGVToGV::operator()` template function instantiations that were previously added for the dylib build. Instead, directly annotate the `LLVMGVToGV::operator()` method with `LLVM_ABI`. This is done so the DLL build works with both MSVC and clang-cl. - Explicitly `#include "llvm/SandboxIR/Value.h"` in `Tracker.h` so that the symbol is available for exported templates in this file. These templates get fully instantiated on DLL export, so they require the full definition of `Value`. - Add extern template instantiation declarations for `GlobalWithNodeAPI` template types in `Constant.h` and annotate them with `LLVM_TEMPLATE_ABI`. - Add `LLVM_EXPORT_TEMPLATE` to `GlobalWithNodeAPI` template instantiations in `Constant.cpp`. ## Validation Local builds and tests were run to validate cross-platform compatibility.
This included llvm, clang, and lldb on the following configurations: - Windows with MSVC - Windows with Clang - Linux with GCC - Linux with Clang - Darwin with Clang --- llvm/include/llvm/SandboxIR/BasicBlock.h | 21 +- llvm/include/llvm/SandboxIR/Constant.h | 207 ++++---- llvm/include/llvm/SandboxIR/Context.h | 135 +++--- llvm/include/llvm/SandboxIR/Function.h | 5 +- llvm/include/llvm/SandboxIR/Instruction.h | 545 +++++++++++----------- llvm/include/llvm/SandboxIR/Module.h | 10 +- llvm/include/llvm/SandboxIR/PassManager.h | 6 +- llvm/include/llvm/SandboxIR/Region.h | 19 +- llvm/include/llvm/SandboxIR/Tracker.h | 34 +- llvm/include/llvm/SandboxIR/Type.h | 53 ++- llvm/include/llvm/SandboxIR/Use.h | 9 +- llvm/include/llvm/SandboxIR/User.h | 13 +- llvm/include/llvm/SandboxIR/Value.h | 20 +- llvm/lib/SandboxIR/Constant.cpp | 37 +- 14 files changed, 564 insertions(+), 550 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/BasicBlock.h b/llvm/include/llvm/SandboxIR/BasicBlock.h index 93e79e2a421f..25bbb6c058fa 100644 --- a/llvm/include/llvm/SandboxIR/BasicBlock.h +++ b/llvm/include/llvm/SandboxIR/BasicBlock.h @@ -11,6 +11,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -32,20 +33,20 @@ private: llvm::BasicBlock *BB; llvm::BasicBlock::iterator It; Context *Ctx; - pointer getInstr(llvm::BasicBlock::iterator It) const; + LLVM_ABI pointer getInstr(llvm::BasicBlock::iterator It) const; public: BBIterator() : BB(nullptr), Ctx(nullptr) {} BBIterator(llvm::BasicBlock *BB, llvm::BasicBlock::iterator It, Context *Ctx) : BB(BB), It(It), Ctx(Ctx) {} reference operator*() const { return *getInstr(It); } - BBIterator &operator++(); + LLVM_ABI BBIterator &operator++(); BBIterator operator++(int) { auto Copy = *this; ++*this; return Copy; } - BBIterator &operator--(); + LLVM_ABI BBIterator &operator--(); BBIterator operator--(int) { auto Copy = *this; --*this; @@ -60,14 +61,14 @@ public: /// the 
instruction is not found in the IR-to-SandboxIR tables. pointer get() const { return getInstr(It); } /// \Returns the parent BB. - BasicBlock *getNodeParent() const; + LLVM_ABI BasicBlock *getNodeParent() const; }; /// Contains a list of sandboxir::Instruction's. class BasicBlock : public Value { /// Builds a graph that contains all values in \p BB in their original form /// i.e., no vectorization is taking place here. - void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB); + LLVM_ABI void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB); friend class Context; // For `buildBasicBlockFromIR` friend class Instruction; // For LLVM Val. @@ -82,9 +83,9 @@ public: static bool classof(const Value *From) { return From->getSubclassID() == Value::ClassID::Block; } - Function *getParent() const; + LLVM_ABI Function *getParent() const; using iterator = BBIterator; - iterator begin() const; + LLVM_ABI iterator begin() const; iterator end() const { auto *BB = cast(Val); return iterator(BB, BB->end(), &Ctx); @@ -96,10 +97,10 @@ public: return std::make_reverse_iterator(begin()); } Context &getContext() const { return Ctx; } - Instruction *getTerminator() const; + LLVM_ABI Instruction *getTerminator() const; bool empty() const { return begin() == end(); } - Instruction &front() const; - Instruction &back() const; + LLVM_ABI Instruction &front() const; + LLVM_ABI Instruction &back() const; #ifndef NDEBUG void verify() const final; diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h index e7b18a442d33..6f682a7059d1 100644 --- a/llvm/include/llvm/SandboxIR/Constant.h +++ b/llvm/include/llvm/SandboxIR/Constant.h @@ -76,16 +76,16 @@ class ConstantInt : public Constant { } public: - static ConstantInt *getTrue(Context &Ctx); - static ConstantInt *getFalse(Context &Ctx); - static ConstantInt *getBool(Context &Ctx, bool V); - static Constant *getTrue(Type *Ty); - static Constant *getFalse(Type *Ty); - static Constant *getBool(Type *Ty, bool 
V); + LLVM_ABI static ConstantInt *getTrue(Context &Ctx); + LLVM_ABI static ConstantInt *getFalse(Context &Ctx); + LLVM_ABI static ConstantInt *getBool(Context &Ctx, bool V); + LLVM_ABI static Constant *getTrue(Type *Ty); + LLVM_ABI static Constant *getFalse(Type *Ty); + LLVM_ABI static Constant *getBool(Type *Ty, bool V); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantInt for the given value. - static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false); + LLVM_ABI static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false); /// Return a ConstantInt with the specified integer value for the specified /// type. If the type is wider than 64 bits, the value will be zero-extended @@ -93,27 +93,29 @@ public: /// be interpreted as a 64-bit signed integer and sign-extended to fit /// the type. /// Get a ConstantInt for a specific value. - static ConstantInt *get(IntegerType *Ty, uint64_t V, bool IsSigned = false); + LLVM_ABI static ConstantInt *get(IntegerType *Ty, uint64_t V, + bool IsSigned = false); /// Return a ConstantInt with the specified value for the specified type. The /// value V will be canonicalized to a an unsigned APInt. Accessing it with /// either getSExtValue() or getZExtValue() will yield a correctly sized and /// signed value for the type Ty. /// Get a ConstantInt for a specific signed value. - static ConstantInt *getSigned(IntegerType *Ty, int64_t V); - static Constant *getSigned(Type *Ty, int64_t V); + LLVM_ABI static ConstantInt *getSigned(IntegerType *Ty, int64_t V); + LLVM_ABI static Constant *getSigned(Type *Ty, int64_t V); /// Return a ConstantInt with the specified value and an implied Type. The /// type is the integer type that corresponds to the bit width of the value. 
- static ConstantInt *get(Context &Ctx, const APInt &V); + LLVM_ABI static ConstantInt *get(Context &Ctx, const APInt &V); /// Return a ConstantInt constructed from the string strStart with the given /// radix. - static ConstantInt *get(IntegerType *Ty, StringRef Str, uint8_t Radix); + LLVM_ABI static ConstantInt *get(IntegerType *Ty, StringRef Str, + uint8_t Radix); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantInt for the given value. - static Constant *get(Type *Ty, const APInt &V); + LLVM_ABI static Constant *get(Type *Ty, const APInt &V); /// Return the constant as an APInt value reference. This allows clients to /// obtain a full-precision copy of the value. @@ -166,7 +168,7 @@ public: /// Variant of the getType() method to always return an IntegerType, which /// reduces the amount of casting needed in parts of the compiler. - IntegerType *getIntegerType() const; + LLVM_ABI IntegerType *getIntegerType() const; /// This static method returns true if the type Ty is big enough to /// represent the value V. This can be used to avoid having the get method @@ -177,8 +179,8 @@ public: /// to the appropriate unsigned type before calling the method. /// @returns true if V is a valid value for type Ty /// Determine if the value is in range for the given type. - static bool isValueValidForType(Type *Ty, uint64_t V); - static bool isValueValidForType(Type *Ty, int64_t V); + LLVM_ABI static bool isValueValidForType(Type *Ty, uint64_t V); + LLVM_ABI static bool isValueValidForType(Type *Ty, int64_t V); bool isNegative() const { return cast(Val)->isNegative(); } @@ -264,29 +266,29 @@ public: /// for the specified value in the specified type. This should only be used /// for simple constant values like 2.0/1.0 etc, that are known-valid both as /// host double and as the target format. 
- static Constant *get(Type *Ty, double V); + LLVM_ABI static Constant *get(Type *Ty, double V); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantFP for the given value. - static Constant *get(Type *Ty, const APFloat &V); + LLVM_ABI static Constant *get(Type *Ty, const APFloat &V); - static Constant *get(Type *Ty, StringRef Str); + LLVM_ABI static Constant *get(Type *Ty, StringRef Str); - static ConstantFP *get(const APFloat &V, Context &Ctx); + LLVM_ABI static ConstantFP *get(const APFloat &V, Context &Ctx); - static Constant *getNaN(Type *Ty, bool Negative = false, - uint64_t Payload = 0); - static Constant *getQNaN(Type *Ty, bool Negative = false, - APInt *Payload = nullptr); - static Constant *getSNaN(Type *Ty, bool Negative = false, - APInt *Payload = nullptr); - static Constant *getZero(Type *Ty, bool Negative = false); + LLVM_ABI static Constant *getNaN(Type *Ty, bool Negative = false, + uint64_t Payload = 0); + LLVM_ABI static Constant *getQNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + LLVM_ABI static Constant *getSNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + LLVM_ABI static Constant *getZero(Type *Ty, bool Negative = false); - static Constant *getNegativeZero(Type *Ty); - static Constant *getInfinity(Type *Ty, bool Negative = false); + LLVM_ABI static Constant *getNegativeZero(Type *Ty); + LLVM_ABI static Constant *getInfinity(Type *Ty, bool Negative = false); /// Return true if Ty is big enough to represent V. - static bool isValueValidForType(Type *Ty, const APFloat &V); + LLVM_ABI static bool isValueValidForType(Type *Ty, const APFloat &V); inline const APFloat &getValueAPF() const { return cast(Val)->getValueAPF(); @@ -362,8 +364,8 @@ class ConstantArray final : public ConstantAggregate { friend class Context; // For constructor. 
public: - static Constant *get(ArrayType *T, ArrayRef V); - ArrayType *getType() const; + LLVM_ABI static Constant *get(ArrayType *T, ArrayRef V); + LLVM_ABI ArrayType *getType() const; // TODO: Missing functions: getType(), getTypeForElements(), getAnon(), get(). @@ -379,7 +381,7 @@ class ConstantStruct final : public ConstantAggregate { friend class Context; // For constructor. public: - static Constant *get(StructType *T, ArrayRef V); + LLVM_ABI static Constant *get(StructType *T, ArrayRef V); template static std::enable_if_t::value, Constant *> @@ -396,8 +398,8 @@ public: return get(getTypeForElements(Ctx, V, Packed), V); } /// This version of the method allows an empty list. - static StructType *getTypeForElements(Context &Ctx, ArrayRef V, - bool Packed = false); + LLVM_ABI static StructType * + getTypeForElements(Context &Ctx, ArrayRef V, bool Packed = false); /// Return an anonymous struct type to use for a constant with the specified /// set of elements. The list must not be empty. static StructType *getTypeForElements(ArrayRef V, @@ -424,10 +426,10 @@ class ConstantVector final : public ConstantAggregate { friend class Context; // For constructor. public: - static Constant *get(ArrayRef V); + LLVM_ABI static Constant *get(ArrayRef V); /// Return a ConstantVector with the specified constant in each element. /// Note that this might not return an instance of ConstantVector - static Constant *getSplat(ElementCount EC, Constant *Elt); + LLVM_ABI static Constant *getSplat(ElementCount EC, Constant *Elt); /// Specialize the getType() method to always return a FixedVectorType, /// which reduces the amount of casting needed in parts of the compiler. inline FixedVectorType *getType() const { @@ -436,7 +438,7 @@ public: /// If all elements of the vector constant have the same value, return that /// value. Otherwise, return nullptr. Ignore poison elements by setting /// AllowPoison to true. 
- Constant *getSplatValue(bool AllowPoison = false) const; + LLVM_ABI Constant *getSplatValue(bool AllowPoison = false) const; /// For isa/dyn_cast. static bool classof(const Value *From) { @@ -451,18 +453,18 @@ class ConstantAggregateZero final : public Constant { friend class Context; // For constructor. public: - static ConstantAggregateZero *get(Type *Ty); + LLVM_ABI static ConstantAggregateZero *get(Type *Ty); /// If this CAZ has array or vector type, return a zero with the right element /// type. - Constant *getSequentialElement() const; + LLVM_ABI Constant *getSequentialElement() const; /// If this CAZ has struct type, return a zero with the right element type for /// the specified element. - Constant *getStructElement(unsigned Elt) const; + LLVM_ABI Constant *getStructElement(unsigned Elt) const; /// Return a zero of the right value for the specified GEP index if we can, /// otherwise return null (e.g. if C is a ConstantExpr). - Constant *getElementValue(Constant *C) const; + LLVM_ABI Constant *getElementValue(Constant *C) const; /// Return a zero of the right value for the specified GEP index. - Constant *getElementValue(unsigned Idx) const; + LLVM_ABI Constant *getElementValue(unsigned Idx) const; /// Return the number of elements in the array, vector, or struct. ElementCount getElementCount() const { return cast(Val)->getElementCount(); @@ -769,9 +771,9 @@ class ConstantPointerNull final : public Constant { friend class Context; // For constructor. public: - static ConstantPointerNull *get(PointerType *Ty); + LLVM_ABI static ConstantPointerNull *get(PointerType *Ty); - PointerType *getType() const; + LLVM_ABI PointerType *getType() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -802,22 +804,22 @@ protected: public: /// Static factory methods - Return an 'undef' object of the specified type. 
- static UndefValue *get(Type *T); + LLVM_ABI static UndefValue *get(Type *T); /// If this Undef has array or vector type, return a undef with the right /// element type. - UndefValue *getSequentialElement() const; + LLVM_ABI UndefValue *getSequentialElement() const; /// If this undef has struct type, return a undef with the right element type /// for the specified element. - UndefValue *getStructElement(unsigned Elt) const; + LLVM_ABI UndefValue *getStructElement(unsigned Elt) const; /// Return an undef of the right value for the specified GEP index if we can, /// otherwise return null (e.g. if C is a ConstantExpr). - UndefValue *getElementValue(Constant *C) const; + LLVM_ABI UndefValue *getElementValue(Constant *C) const; /// Return an undef of the right value for the specified GEP index. - UndefValue *getElementValue(unsigned Idx) const; + LLVM_ABI UndefValue *getElementValue(unsigned Idx) const; /// Return the number of elements in the array, vector, or struct. unsigned getNumElements() const { @@ -850,22 +852,22 @@ class PoisonValue final : public UndefValue { public: /// Static factory methods - Return an 'poison' object of the specified type. - static PoisonValue *get(Type *T); + LLVM_ABI static PoisonValue *get(Type *T); /// If this poison has array or vector type, return a poison with the right /// element type. - PoisonValue *getSequentialElement() const; + LLVM_ABI PoisonValue *getSequentialElement() const; /// If this poison has struct type, return a poison with the right element /// type for the specified element. - PoisonValue *getStructElement(unsigned Elt) const; + LLVM_ABI PoisonValue *getStructElement(unsigned Elt) const; /// Return an poison of the right value for the specified GEP index if we can, /// otherwise return null (e.g. if C is a ConstantExpr). - PoisonValue *getElementValue(Constant *C) const; + LLVM_ABI PoisonValue *getElementValue(Constant *C) const; /// Return an poison of the right value for the specified GEP index. 
- PoisonValue *getElementValue(unsigned Idx) const; + LLVM_ABI PoisonValue *getElementValue(unsigned Idx) const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -924,7 +926,7 @@ public: UnnamedAddr getUnnamedAddr() const { return cast(Val)->getUnnamedAddr(); } - void setUnnamedAddr(UnnamedAddr V); + LLVM_ABI void setUnnamedAddr(UnnamedAddr V); static UnnamedAddr getMinUnnamedAddr(UnnamedAddr A, UnnamedAddr B) { return llvm::GlobalValue::getMinUnnamedAddr(A, B); @@ -946,7 +948,7 @@ public: bool hasProtectedVisibility() const { return cast(Val)->hasProtectedVisibility(); } - void setVisibility(VisibilityTypes V); + LLVM_ABI void setVisibility(VisibilityTypes V); // TODO: Add missing functions. }; @@ -996,7 +998,7 @@ public: /// /// Setting the section to the empty string tells LLVM to choose an /// appropriate default object file section. - void setSection(StringRef S); + LLVM_ABI void setSection(StringRef S); bool hasComdat() const { return cast(Val)->hasComdat(); } @@ -1031,7 +1033,7 @@ class GlobalWithNodeAPI : public ParentT { struct LLVMGVToGV { Context &Ctx; LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {} - GlobalT &operator()(LLVMGlobalT &LLVMGV) const; + LLVM_ABI GlobalT &operator()(LLVMGlobalT &LLVMGV) const; }; public: @@ -1060,24 +1062,15 @@ public: } }; -// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB -extern template LLVM_TEMPLATE_ABI GlobalIFunc & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalIFunc - &LLVMGV) - const; -extern template LLVM_TEMPLATE_ABI Function & -GlobalWithNodeAPI:: - LLVMGVToGV::operator()(llvm::Function &LLVMGV) const; - -extern template LLVM_TEMPLATE_ABI GlobalVariable &GlobalWithNodeAPI< - GlobalVariable, llvm::GlobalVariable, GlobalObject, - llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV) - const; -extern template LLVM_TEMPLATE_ABI GlobalAlias & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalAlias - &LLVMGV) const; +// Explicit 
instantiations. +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>; +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + Function, llvm::Function, GlobalObject, llvm::GlobalObject>; +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>; +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>; class GlobalIFunc final : public GlobalWithNodeAPI(this)->getResolverFunction(); } @@ -1136,7 +1129,7 @@ class GlobalVariable final struct LLVMGVToGV { Context &Ctx; LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {} - GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const; + LLVM_ABI GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const; }; public: @@ -1181,11 +1174,11 @@ public: /// illegal to call this method if the global is external, because we cannot /// tell what the value is initialized to! /// - Constant *getInitializer() const; + LLVM_ABI Constant *getInitializer() const; /// setInitializer - Sets the initializer for this global variable, removing /// any existing initializer if InitVal==NULL. The initializer must have the /// type getValueType(). - void setInitializer(Constant *InitVal); + LLVM_ABI void setInitializer(Constant *InitVal); // TODO: Add missing replaceInitializer(). Requires special tracker @@ -1196,12 +1189,12 @@ public: bool isConstant() const { return cast(Val)->isConstant(); } - void setConstant(bool V); + LLVM_ABI void setConstant(bool V); bool isExternallyInitialized() const { return cast(Val)->isExternallyInitialized(); } - void setExternallyInitialized(bool Val); + LLVM_ABI void setExternallyInitialized(bool Val); // TODO: Missing copyAttributesFrom() @@ -1278,7 +1271,7 @@ public: /// Sets the alignment attribute of the GlobalVariable. 
/// This method will be deprecated as the alignment property should always be /// defined. - void setAlignment(MaybeAlign Align); + LLVM_ABI void setAlignment(MaybeAlign Align); // TODO: Missing setCodeModel(). Requires custom tracker. @@ -1311,10 +1304,10 @@ public: // TODO: Missing copyAttributresFrom(). // TODO: Missing removeFromParent(), eraseFromParent(). - void setAliasee(Constant *Aliasee); - Constant *getAliasee() const; + LLVM_ABI void setAliasee(Constant *Aliasee); + LLVM_ABI Constant *getAliasee() const; - const GlobalObject *getAliaseeObject() const; + LLVM_ABI const GlobalObject *getAliaseeObject() const; GlobalObject *getAliaseeObject() { return const_cast( static_cast(this)->getAliaseeObject()); @@ -1336,12 +1329,12 @@ class NoCFIValue final : public Constant { public: /// Return a NoCFIValue for the specified function. - static NoCFIValue *get(GlobalValue *GV); + LLVM_ABI static NoCFIValue *get(GlobalValue *GV); - GlobalValue *getGlobalValue() const; + LLVM_ABI GlobalValue *getGlobalValue() const; /// NoCFIValue is always a pointer. - PointerType *getType() const; + LLVM_ABI PointerType *getType() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { return From->getSubclassID() == ClassID::NoCFIValue; @@ -1369,21 +1362,21 @@ class ConstantPtrAuth final : public Constant { public: /// Return a pointer signed with the specified parameters. - static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, - ConstantInt *Disc, Constant *AddrDisc); + LLVM_ABI static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc); /// The pointer that is signed in this ptrauth signed pointer. - Constant *getPointer() const; + LLVM_ABI Constant *getPointer() const; /// The Key ID, an i32 constant. - ConstantInt *getKey() const; + LLVM_ABI ConstantInt *getKey() const; /// The integer discriminator, an i64 constant, or 0. 
- ConstantInt *getDiscriminator() const; + LLVM_ABI ConstantInt *getDiscriminator() const; /// The address discriminator if any, or the null constant. /// If present, this must be a value equivalent to the storage location of /// the only global-initializer user of the ptrauth signed pointer. - Constant *getAddrDiscriminator() const; + LLVM_ABI Constant *getAddrDiscriminator() const; /// Whether there is any non-null address discriminator. bool hasAddressDiscriminator() const { @@ -1410,7 +1403,7 @@ public: /// Produce a new ptrauth expression signing the given value using /// the same schema as is stored in one. - ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; + LLVM_ABI ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -1438,19 +1431,19 @@ class BlockAddress final : public Constant { public: /// Return a BlockAddress for the specified function and basic block. - static BlockAddress *get(Function *F, BasicBlock *BB); + LLVM_ABI static BlockAddress *get(Function *F, BasicBlock *BB); /// Return a BlockAddress for the specified basic block. The basic /// block must be embedded into a function. - static BlockAddress *get(BasicBlock *BB); + LLVM_ABI static BlockAddress *get(BasicBlock *BB); /// Lookup an existing \c BlockAddress constant for the given BasicBlock. /// /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress. - static BlockAddress *lookup(const BasicBlock *BB); + LLVM_ABI static BlockAddress *lookup(const BasicBlock *BB); - Function *getFunction() const; - BasicBlock *getBasicBlock() const; + LLVM_ABI Function *getFunction() const; + LLVM_ABI BasicBlock *getBasicBlock() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -1465,9 +1458,9 @@ class DSOLocalEquivalent final : public Constant { public: /// Return a DSOLocalEquivalent for the specified global value. 
- static DSOLocalEquivalent *get(GlobalValue *GV); + LLVM_ABI static DSOLocalEquivalent *get(GlobalValue *GV); - GlobalValue *getGlobalValue() const; + LLVM_ABI GlobalValue *getGlobalValue() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -1498,7 +1491,7 @@ class ConstantTokenNone final : public Constant { public: /// Return the ConstantTokenNone. - static ConstantTokenNone *get(Context &Ctx); + LLVM_ABI static ConstantTokenNone *get(Context &Ctx); /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index a8a21b0db855..7d8b2c86e94a 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -15,6 +15,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/SandboxIR/Tracker.h" #include "llvm/SandboxIR/Type.h" +#include "llvm/Support/Compiler.h" #include @@ -112,32 +113,33 @@ protected: CallbackID::ValTy NextCallbackID = 1; /// Remove \p V from the maps and returns the unique_ptr. - std::unique_ptr detachLLVMValue(llvm::Value *V); + LLVM_ABI std::unique_ptr detachLLVMValue(llvm::Value *V); /// Remove \p SBV from all SandboxIR maps and stop owning it. This effectively /// detaches \p V from the underlying IR. - std::unique_ptr detach(Value *V); + LLVM_ABI std::unique_ptr detach(Value *V); friend class Instruction; // For detach(). /// Take ownership of VPtr and store it in `LLVMValueToValueMap`. - Value *registerValue(std::unique_ptr &&VPtr); + LLVM_ABI Value *registerValue(std::unique_ptr &&VPtr); friend class EraseFromParent; // For registerValue(). /// This is the actual function that creates sandboxir values for \p V, /// and among others handles all instruction types. 
- Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr); + LLVM_ABI Value *getOrCreateValueInternal(llvm::Value *V, + llvm::User *U = nullptr); /// Get or create a sandboxir::Argument for an existing LLVM IR \p LLVMArg. - Argument *getOrCreateArgument(llvm::Argument *LLVMArg); + LLVM_ABI Argument *getOrCreateArgument(llvm::Argument *LLVMArg); /// Get or create a sandboxir::Value for an existing LLVM IR \p LLVMV. Value *getOrCreateValue(llvm::Value *LLVMV) { return getOrCreateValueInternal(LLVMV, 0); } /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC. - Constant *getOrCreateConstant(llvm::Constant *LLVMC); + LLVM_ABI Constant *getOrCreateConstant(llvm::Constant *LLVMC); friend class ConstantDataSequential; // For getOrCreateConstant(). friend class Utils; // For getMemoryBase - void runEraseInstrCallbacks(Instruction *I); - void runCreateInstrCallbacks(Instruction *I); - void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where); - void runSetUseCallbacks(const Use &U, Value *NewSrc); + LLVM_ABI void runEraseInstrCallbacks(Instruction *I); + LLVM_ABI void runCreateInstrCallbacks(Instruction *I); + LLVM_ABI void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where); + LLVM_ABI void runSetUseCallbacks(const Use &U, Value *NewSrc); friend class User; // For runSetUseCallbacks(). friend class Value; // For runSetUseCallbacks(). @@ -148,90 +150,97 @@ protected: /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will /// also create all contents of the block. - BasicBlock *createBasicBlock(llvm::BasicBlock *BB); + LLVM_ABI BasicBlock *createBasicBlock(llvm::BasicBlock *BB); friend class BasicBlock; // For getOrCreateValue(). 
IRBuilder LLVMIRBuilder; auto &getLLVMIRBuilder() { return LLVMIRBuilder; } - VAArgInst *createVAArgInst(llvm::VAArgInst *SI); + LLVM_ABI VAArgInst *createVAArgInst(llvm::VAArgInst *SI); friend VAArgInst; // For createVAArgInst() - FreezeInst *createFreezeInst(llvm::FreezeInst *SI); + LLVM_ABI FreezeInst *createFreezeInst(llvm::FreezeInst *SI); friend FreezeInst; // For createFreezeInst() - FenceInst *createFenceInst(llvm::FenceInst *SI); + LLVM_ABI FenceInst *createFenceInst(llvm::FenceInst *SI); friend FenceInst; // For createFenceInst() - SelectInst *createSelectInst(llvm::SelectInst *SI); + LLVM_ABI SelectInst *createSelectInst(llvm::SelectInst *SI); friend SelectInst; // For createSelectInst() - InsertElementInst *createInsertElementInst(llvm::InsertElementInst *IEI); + LLVM_ABI InsertElementInst * + createInsertElementInst(llvm::InsertElementInst *IEI); friend InsertElementInst; // For createInsertElementInst() - ExtractElementInst *createExtractElementInst(llvm::ExtractElementInst *EEI); + LLVM_ABI ExtractElementInst * + createExtractElementInst(llvm::ExtractElementInst *EEI); friend ExtractElementInst; // For createExtractElementInst() - ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); + LLVM_ABI ShuffleVectorInst * + createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); friend ShuffleVectorInst; // For createShuffleVectorInst() - ExtractValueInst *createExtractValueInst(llvm::ExtractValueInst *IVI); + LLVM_ABI ExtractValueInst * + createExtractValueInst(llvm::ExtractValueInst *IVI); friend ExtractValueInst; // For createExtractValueInst() - InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI); + LLVM_ABI InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI); friend InsertValueInst; // For createInsertValueInst() - BranchInst *createBranchInst(llvm::BranchInst *I); + LLVM_ABI BranchInst *createBranchInst(llvm::BranchInst *I); friend BranchInst; // For createBranchInst() - LoadInst 
*createLoadInst(llvm::LoadInst *LI); + LLVM_ABI LoadInst *createLoadInst(llvm::LoadInst *LI); friend LoadInst; // For createLoadInst() - StoreInst *createStoreInst(llvm::StoreInst *SI); + LLVM_ABI StoreInst *createStoreInst(llvm::StoreInst *SI); friend StoreInst; // For createStoreInst() - ReturnInst *createReturnInst(llvm::ReturnInst *I); + LLVM_ABI ReturnInst *createReturnInst(llvm::ReturnInst *I); friend ReturnInst; // For createReturnInst() - CallInst *createCallInst(llvm::CallInst *I); + LLVM_ABI CallInst *createCallInst(llvm::CallInst *I); friend CallInst; // For createCallInst() - InvokeInst *createInvokeInst(llvm::InvokeInst *I); + LLVM_ABI InvokeInst *createInvokeInst(llvm::InvokeInst *I); friend InvokeInst; // For createInvokeInst() - CallBrInst *createCallBrInst(llvm::CallBrInst *I); + LLVM_ABI CallBrInst *createCallBrInst(llvm::CallBrInst *I); friend CallBrInst; // For createCallBrInst() - LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I); + LLVM_ABI LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I); friend LandingPadInst; // For createLandingPadInst() - CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I); + LLVM_ABI CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I); friend CatchPadInst; // For createCatchPadInst() - CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); + LLVM_ABI CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); friend CleanupPadInst; // For createCleanupPadInst() - CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I); + LLVM_ABI CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I); friend CatchReturnInst; // For createCatchReturnInst() - CleanupReturnInst *createCleanupReturnInst(llvm::CleanupReturnInst *I); + LLVM_ABI CleanupReturnInst * + createCleanupReturnInst(llvm::CleanupReturnInst *I); friend CleanupReturnInst; // For createCleanupReturnInst() - GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I); + LLVM_ABI GetElementPtrInst * + 
createGetElementPtrInst(llvm::GetElementPtrInst *I); friend GetElementPtrInst; // For createGetElementPtrInst() - CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); + LLVM_ABI CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); friend CatchSwitchInst; // For createCatchSwitchInst() - ResumeInst *createResumeInst(llvm::ResumeInst *I); + LLVM_ABI ResumeInst *createResumeInst(llvm::ResumeInst *I); friend ResumeInst; // For createResumeInst() - SwitchInst *createSwitchInst(llvm::SwitchInst *I); + LLVM_ABI SwitchInst *createSwitchInst(llvm::SwitchInst *I); friend SwitchInst; // For createSwitchInst() - UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I); + LLVM_ABI UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I); friend UnaryOperator; // For createUnaryOperator() - BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I); + LLVM_ABI BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I); friend BinaryOperator; // For createBinaryOperator() - AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I); + LLVM_ABI AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I); friend AtomicRMWInst; // For createAtomicRMWInst() - AtomicCmpXchgInst *createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I); + LLVM_ABI AtomicCmpXchgInst * + createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I); friend AtomicCmpXchgInst; // For createAtomicCmpXchgInst() - AllocaInst *createAllocaInst(llvm::AllocaInst *I); + LLVM_ABI AllocaInst *createAllocaInst(llvm::AllocaInst *I); friend AllocaInst; // For createAllocaInst() - CastInst *createCastInst(llvm::CastInst *I); + LLVM_ABI CastInst *createCastInst(llvm::CastInst *I); friend CastInst; // For createCastInst() - PHINode *createPHINode(llvm::PHINode *I); + LLVM_ABI PHINode *createPHINode(llvm::PHINode *I); friend PHINode; // For createPHINode() - UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI); + LLVM_ABI UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI); 
friend UnreachableInst; // For createUnreachableInst() - CmpInst *createCmpInst(llvm::CmpInst *I); + LLVM_ABI CmpInst *createCmpInst(llvm::CmpInst *I); friend CmpInst; // For createCmpInst() - ICmpInst *createICmpInst(llvm::ICmpInst *I); + LLVM_ABI ICmpInst *createICmpInst(llvm::ICmpInst *I); friend ICmpInst; // For createICmpInst() - FCmpInst *createFCmpInst(llvm::FCmpInst *I); + LLVM_ABI FCmpInst *createFCmpInst(llvm::FCmpInst *I); friend FCmpInst; // For createFCmpInst() public: - Context(LLVMContext &LLVMCtx); - ~Context(); + LLVM_ABI Context(LLVMContext &LLVMCtx); + LLVM_ABI ~Context(); /// Clears function-level state. - void clear(); + LLVM_ABI void clear(); Tracker &getTracker() { return IRTracker; } /// Convenience function for `getTracker().save()` @@ -241,14 +250,14 @@ public: /// Convenience function for `getTracker().accept()` void accept() { IRTracker.accept(); } - sandboxir::Value *getValue(llvm::Value *V) const; + LLVM_ABI sandboxir::Value *getValue(llvm::Value *V) const; const sandboxir::Value *getValue(const llvm::Value *V) const { return getValue(const_cast(V)); } - Module *getModule(llvm::Module *LLVMM) const; + LLVM_ABI Module *getModule(llvm::Module *LLVMM) const; - Module *getOrCreateModule(llvm::Module *LLVMM); + LLVM_ABI Module *getOrCreateModule(llvm::Module *LLVMM); Type *getType(llvm::Type *LLVMTy) { if (LLVMTy == nullptr) @@ -265,10 +274,10 @@ public: /// This is the main API function for creating Sandbox IR. /// Note: this will not fully populate its parent module. The only globals /// that will be available are those used within the function. - Function *createFunction(llvm::Function *F); + LLVM_ABI Function *createFunction(llvm::Function *F); /// Create a sandboxir::Module corresponding to \p LLVMM. - Module *createModule(llvm::Module *LLVMM); + LLVM_ABI Module *createModule(llvm::Module *LLVMM); /// \Returns the number of values registered with Context. 
size_t getNumValues() const { return LLVMValueToValueMap.size(); } @@ -277,26 +286,26 @@ public: /// to be removed from its parent. Note that this will also be called when /// reverting the creation of an instruction. /// \Returns a callback ID for later deregistration. - CallbackID registerEraseInstrCallback(EraseInstrCallback CB); - void unregisterEraseInstrCallback(CallbackID ID); + LLVM_ABI CallbackID registerEraseInstrCallback(EraseInstrCallback CB); + LLVM_ABI void unregisterEraseInstrCallback(CallbackID ID); /// Register a callback that gets called right after a SandboxIR instruction /// is created. Note that this will also be called when reverting the removal /// of an instruction. /// \Returns a callback ID for later deregistration. - CallbackID registerCreateInstrCallback(CreateInstrCallback CB); - void unregisterCreateInstrCallback(CallbackID ID); + LLVM_ABI CallbackID registerCreateInstrCallback(CreateInstrCallback CB); + LLVM_ABI void unregisterCreateInstrCallback(CallbackID ID); /// Register a callback that gets called when a SandboxIR instruction is about /// to be moved. Note that this will also be called when reverting a move. /// \Returns a callback ID for later deregistration. - CallbackID registerMoveInstrCallback(MoveInstrCallback CB); - void unregisterMoveInstrCallback(CallbackID ID); + LLVM_ABI CallbackID registerMoveInstrCallback(MoveInstrCallback CB); + LLVM_ABI void unregisterMoveInstrCallback(CallbackID ID); /// Register a callback that gets called when a Use gets set. /// \Returns a callback ID for later deregistration. 
- CallbackID registerSetUseCallback(SetUseCallback CB); - void unregisterSetUseCallback(CallbackID ID); + LLVM_ABI CallbackID registerSetUseCallback(SetUseCallback CB); + LLVM_ABI void unregisterSetUseCallback(CallbackID ID); }; } // namespace sandboxir diff --git a/llvm/include/llvm/SandboxIR/Function.h b/llvm/include/llvm/SandboxIR/Function.h index 2c4b53ef6c1e..28c69112b2b7 100644 --- a/llvm/include/llvm/SandboxIR/Function.h +++ b/llvm/include/llvm/SandboxIR/Function.h @@ -11,6 +11,7 @@ #include "llvm/IR/Function.h" #include "llvm/SandboxIR/Constant.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -56,7 +57,7 @@ public: LLVMBBToBB BBGetter(Ctx); return iterator(cast(Val)->end(), BBGetter); } - FunctionType *getFunctionType() const; + LLVM_ABI FunctionType *getFunctionType() const; /// Returns the alignment of the given function. MaybeAlign getAlign() const { return cast(Val)->getAlign(); } @@ -66,7 +67,7 @@ public: /// Sets the alignment attribute of the Function. /// This method will be deprecated as the alignment property should always be /// defined. - void setAlignment(MaybeAlign Align); + LLVM_ABI void setAlignment(MaybeAlign Align); #ifndef NDEBUG void verify() const final { diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index ce5a2cbec85b..4e3ff19d4778 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -16,6 +16,7 @@ #include "llvm/SandboxIR/BasicBlock.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/User.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -57,7 +58,7 @@ protected: /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This /// returns its topmost LLVM IR instruction. - llvm::Instruction *getTopmostLLVMInstruction() const; + LLVM_ABI llvm::Instruction *getTopmostLLVMInstruction() const; friend class VAArgInst; // For getTopmostLLVMInstruction(). 
friend class FreezeInst; // For getTopmostLLVMInstruction(). friend class FenceInst; // For getTopmostLLVMInstruction(). @@ -113,17 +114,17 @@ protected: } public: - static const char *getOpcodeName(Opcode Opc); + LLVM_ABI static const char *getOpcodeName(Opcode Opc); /// This is used by BasicBlock::iterator. virtual unsigned getNumOfIRInstrs() const = 0; /// \Returns a BasicBlock::iterator for this Instruction. - BBIterator getIterator() const; + LLVM_ABI BBIterator getIterator() const; /// \Returns the next sandboxir::Instruction in the block, or nullptr if at /// the end of the block. - Instruction *getNextNode() const; + LLVM_ABI Instruction *getNextNode() const; /// \Returns the previous sandboxir::Instruction in the block, or nullptr if /// at the beginning of the block. - Instruction *getPrevNode() const; + LLVM_ABI Instruction *getPrevNode() const; /// \Returns this Instruction's opcode. Note that SandboxIR has its own opcode /// state to allow for new SandboxIR-specific instructions. Opcode getOpcode() const { return Opc; } @@ -188,17 +189,17 @@ public: // TODO: More missing functions /// Detach this from its parent BasicBlock without deleting it. - void removeFromParent(); + LLVM_ABI void removeFromParent(); /// Detach this Value from its parent and delete it. - void eraseFromParent(); + LLVM_ABI void eraseFromParent(); /// Insert this detached instruction before \p BeforeI. - void insertBefore(Instruction *BeforeI); + LLVM_ABI void insertBefore(Instruction *BeforeI); /// Insert this detached instruction after \p AfterI. - void insertAfter(Instruction *AfterI); + LLVM_ABI void insertAfter(Instruction *AfterI); /// Insert this detached instruction into \p BB at \p WhereIt. - void insertInto(BasicBlock *BB, const BBIterator &WhereIt); + LLVM_ABI void insertInto(BasicBlock *BB, const BBIterator &WhereIt); /// Move this instruction to \p WhereIt. 
- void moveBefore(BasicBlock &BB, const BBIterator &WhereIt); + LLVM_ABI void moveBefore(BasicBlock &BB, const BBIterator &WhereIt); /// Move this instruction before \p Before. void moveBefore(Instruction *Before) { moveBefore(*Before->getParent(), Before->getIterator()); @@ -217,9 +218,9 @@ public: } /// \Returns the BasicBlock containing this Instruction, or null if it is /// detached. - BasicBlock *getParent() const; + LLVM_ABI BasicBlock *getParent() const; /// For isa/dyn_cast. - static bool classof(const sandboxir::Value *From); + LLVM_ABI static bool classof(const sandboxir::Value *From); /// Determine whether the no signed wrap flag is set. bool hasNoUnsignedWrap() const { @@ -227,20 +228,20 @@ public: } /// Set or clear the nuw flag on this instruction, which must be an operator /// which supports this flag. See LangRef.html for the meaning of this flag. - void setHasNoUnsignedWrap(bool B = true); + LLVM_ABI void setHasNoUnsignedWrap(bool B = true); /// Determine whether the no signed wrap flag is set. bool hasNoSignedWrap() const { return cast(Val)->hasNoSignedWrap(); } /// Set or clear the nsw flag on this instruction, which must be an operator /// which supports this flag. See LangRef.html for the meaning of this flag. - void setHasNoSignedWrap(bool B = true); + LLVM_ABI void setHasNoSignedWrap(bool B = true); /// Determine whether all fast-math-flags are set. bool isFast() const { return cast(Val)->isFast(); } /// Set or clear all fast-math-flags on this instruction, which must be an /// operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setFast(bool B); + LLVM_ABI void setFast(bool B); /// Determine whether the allow-reassociation flag is set. bool hasAllowReassoc() const { return cast(Val)->hasAllowReassoc(); @@ -248,24 +249,24 @@ public: /// Set or clear the reassociation flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. 
- void setHasAllowReassoc(bool B); + LLVM_ABI void setHasAllowReassoc(bool B); /// Determine whether the exact flag is set. bool isExact() const { return cast(Val)->isExact(); } /// Set or clear the exact flag on this instruction, which must be an operator /// which supports this flag. See LangRef.html for the meaning of this flag. - void setIsExact(bool B = true); + LLVM_ABI void setIsExact(bool B = true); /// Determine whether the no-NaNs flag is set. bool hasNoNaNs() const { return cast(Val)->hasNoNaNs(); } /// Set or clear the no-nans flag on this instruction, which must be an /// operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasNoNaNs(bool B); + LLVM_ABI void setHasNoNaNs(bool B); /// Determine whether the no-infs flag is set. bool hasNoInfs() const { return cast(Val)->hasNoInfs(); } /// Set or clear the no-infs flag on this instruction, which must be an /// operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasNoInfs(bool B); + LLVM_ABI void setHasNoInfs(bool B); /// Determine whether the no-signed-zeros flag is set. bool hasNoSignedZeros() const { return cast(Val)->hasNoSignedZeros(); @@ -273,7 +274,7 @@ public: /// Set or clear the no-signed-zeros flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasNoSignedZeros(bool B); + LLVM_ABI void setHasNoSignedZeros(bool B); /// Determine whether the allow-reciprocal flag is set. bool hasAllowReciprocal() const { return cast(Val)->hasAllowReciprocal(); @@ -281,7 +282,7 @@ public: /// Set or clear the allow-reciprocal flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasAllowReciprocal(bool B); + LLVM_ABI void setHasAllowReciprocal(bool B); /// Determine whether the allow-contract flag is set. 
bool hasAllowContract() const { return cast(Val)->hasAllowContract(); @@ -289,7 +290,7 @@ public: /// Set or clear the allow-contract flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasAllowContract(bool B); + LLVM_ABI void setHasAllowContract(bool B); /// Determine whether the approximate-math-functions flag is set. bool hasApproxFunc() const { return cast(Val)->hasApproxFunc(); @@ -297,7 +298,7 @@ public: /// Set or clear the approximate-math-functions flag on this instruction, /// which must be an operator which supports this flag. See LangRef.html for /// the meaning of this flag. - void setHasApproxFunc(bool B); + LLVM_ABI void setHasApproxFunc(bool B); /// Convenience function for getting all the fast-math flags, which must be an /// operator which supports these flags. See LangRef.html for the meaning of /// these flags. @@ -307,11 +308,11 @@ public: /// Convenience function for setting multiple fast-math flags on this /// instruction, which must be an operator which supports these flags. See /// LangRef.html for the meaning of these flags. - void setFastMathFlags(FastMathFlags FMF); + LLVM_ABI void setFastMathFlags(FastMathFlags FMF); /// Convenience function for transferring all fast-math flag values to this /// instruction, which must be an operator which supports these flags. See /// LangRef.html for the meaning of these flags. 
- void copyFastMathFlags(FastMathFlags FMF); + LLVM_ABI void copyFastMathFlags(FastMathFlags FMF); bool isAssociative() const { return cast(Val)->isAssociative(); @@ -352,7 +353,7 @@ public: bool isVolatile() const { return cast(Val)->isVolatile(); } - Type *getAccessType() const; + LLVM_ABI Type *getAccessType() const; bool mayThrow(bool IncludePhaseOneUnwind = false) const { return cast(Val)->mayThrow(IncludePhaseOneUnwind); @@ -414,22 +415,22 @@ class FenceInst : public SingleLLVMInstructionImpl { friend Context; // For constructor; public: - static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos, - Context &Ctx, - SyncScope::ID SSID = SyncScope::System); + LLVM_ABI static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos, + Context &Ctx, + SyncScope::ID SSID = SyncScope::System); /// Returns the ordering constraint of this fence instruction. AtomicOrdering getOrdering() const { return cast(Val)->getOrdering(); } /// Sets the ordering constraint of this fence instruction. May only be /// Acquire, Release, AcquireRelease, or SequentiallyConsistent. - void setOrdering(AtomicOrdering Ordering); + LLVM_ABI void setOrdering(AtomicOrdering Ordering); /// Returns the synchronization scope ID of this fence instruction. SyncScope::ID getSyncScopeID() const { return cast(Val)->getSyncScopeID(); } /// Sets the synchronization scope ID of this fence instruction. 
- void setSyncScopeID(SyncScope::ID SSID); + LLVM_ABI void setSyncScopeID(SyncScope::ID SSID); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Fence; } @@ -443,9 +444,9 @@ class SelectInst : public SingleLLVMInstructionImpl { friend Context; // for SelectInst() public: - static Value *create(Value *Cond, Value *True, Value *False, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Cond, Value *True, Value *False, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); const Value *getCondition() const { return getOperand(0); } const Value *getTrueValue() const { return getOperand(1); } @@ -457,7 +458,7 @@ public: void setCondition(Value *New) { setOperand(0, New); } void setTrueValue(Value *New) { setOperand(1, New); } void setFalseValue(Value *New) { setOperand(2, New); } - void swapValues(); + LLVM_ABI void swapValues(); /// Return a string if the specified operands are invalid for a select /// operation, otherwise return null. @@ -468,7 +469,7 @@ public: } /// For isa/dyn_cast. 
- static bool classof(const Value *From); + LLVM_ABI static bool classof(const Value *From); }; class InsertElementInst final @@ -480,9 +481,9 @@ class InsertElementInst final friend class Context; // For accessing the constructor in create*() public: - static Value *create(Value *Vec, Value *NewElt, Value *Idx, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Vec, Value *NewElt, Value *Idx, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::InsertElement; } @@ -503,8 +504,8 @@ class ExtractElementInst final // create*() public: - static Value *create(Value *Vec, Value *Idx, InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Vec, Value *Idx, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::ExtractElement; } @@ -516,7 +517,7 @@ public: Value *getIndexOperand() { return getOperand(1); } const Value *getVectorOperand() const { return getOperand(0); } const Value *getIndexOperand() const { return getOperand(1); } - VectorType *getVectorOperandType() const; + LLVM_ABI VectorType *getVectorOperandType() const; }; class ShuffleVectorInst final @@ -528,18 +529,19 @@ class ShuffleVectorInst final friend class Context; // For accessing the constructor in create*() public: - static Value *create(Value *V1, Value *V2, Value *Mask, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - static Value *create(Value *V1, Value *V2, ArrayRef Mask, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *V1, Value *V2, Value *Mask, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI static Value *create(Value *V1, Value *V2, ArrayRef Mask, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool 
classof(const Value *From) { return From->getSubclassID() == ClassID::ShuffleVector; } /// Swap the operands and adjust the mask to preserve the semantics of the /// instruction. - void commute(); + LLVM_ABI void commute(); /// Return true if a shufflevector instruction can be formed with the /// specified operands. @@ -554,7 +556,7 @@ public: } /// Overload to return most specific vector type. - VectorType *getType() const; + LLVM_ABI VectorType *getType() const; /// Return the shuffle mask value of this instruction for the given element /// index. Return PoisonMaskElem if the element is undef. @@ -577,12 +579,12 @@ public: } /// Return the mask for this instruction, for use in bitcode. - Constant *getShuffleMaskForBitcode() const; + LLVM_ABI Constant *getShuffleMaskForBitcode() const; - static Constant *convertShuffleMaskForBitcode(ArrayRef Mask, - Type *ResultTy); + LLVM_ABI static Constant *convertShuffleMaskForBitcode(ArrayRef Mask, + Type *ResultTy); - void setShuffleMask(ArrayRef Mask); + LLVM_ABI void setShuffleMask(ArrayRef Mask); ArrayRef getShuffleMask() const { return cast(Val)->getShuffleMask(); @@ -965,9 +967,9 @@ class InsertValueInst friend Context; // for InsertValueInst() public: - static Value *create(Value *Agg, Value *Val, ArrayRef Idxs, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Agg, Value *Val, ArrayRef Idxs, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::InsertValue; @@ -1024,36 +1026,37 @@ class BranchInst : public SingleLLVMInstructionImpl { friend Context; // for BranchInst() public: - static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos, - Context &Ctx); - static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse, - Value *Cond, InsertPosition Pos, Context &Ctx); + LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos, + Context &Ctx); + 
LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse, + Value *Cond, InsertPosition Pos, + Context &Ctx); /// For isa/dyn_cast. - static bool classof(const Value *From); + LLVM_ABI static bool classof(const Value *From); bool isUnconditional() const { return cast(Val)->isUnconditional(); } bool isConditional() const { return cast(Val)->isConditional(); } - Value *getCondition() const; + LLVM_ABI Value *getCondition() const; void setCondition(Value *V) { setOperand(0, V); } unsigned getNumSuccessors() const { return 1 + isConditional(); } - BasicBlock *getSuccessor(unsigned SuccIdx) const; - void setSuccessor(unsigned Idx, BasicBlock *NewSucc); + LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const; + LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc); void swapSuccessors() { swapOperandsInternal(1, 2); } private: struct LLVMBBToSBBB { Context &Ctx; LLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {} - BasicBlock *operator()(llvm::BasicBlock *BB) const; + LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *BB) const; }; struct ConstLLVMBBToSBBB { Context &Ctx; ConstLLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {} - const BasicBlock *operator()(const llvm::BasicBlock *BB) const; + LLVM_ABI const BasicBlock *operator()(const llvm::BasicBlock *BB) const; }; public: @@ -1109,8 +1112,9 @@ class ExtractValueInst : public UnaryInstruction { friend Context; // for ExtractValueInst() public: - static Value *create(Value *Agg, ArrayRef Idxs, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Agg, ArrayRef Idxs, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::ExtractValue; @@ -1120,7 +1124,7 @@ public: /// with an extractvalue instruction with the specified parameters. /// /// Null is returned if the indices are invalid for the specified type. 
- static Type *getIndexedType(Type *Agg, ArrayRef Idxs); + LLVM_ABI static Type *getIndexedType(Type *Agg, ArrayRef Idxs); using idx_iterator = llvm::ExtractValueInst::idx_iterator; @@ -1163,9 +1167,9 @@ class VAArgInst : public UnaryInstruction { friend Context; // For constructor; public: - static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - Value *getPointerOperand(); + LLVM_ABI static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); + LLVM_ABI Value *getPointerOperand(); const Value *getPointerOperand() const { return const_cast(this)->getPointerOperand(); } @@ -1183,8 +1187,8 @@ class FreezeInst : public UnaryInstruction { friend Context; // For constructor; public: - static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Freeze; } @@ -1200,11 +1204,11 @@ public: /// Return true if this is a load from a volatile memory location. bool isVolatile() const { return cast(Val)->isVolatile(); } /// Specify whether this is a volatile load or not. - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); - static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, - InsertPosition Pos, bool IsVolatile, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, + InsertPosition Pos, bool IsVolatile, + Context &Ctx, const Twine &Name = ""); static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, InsertPosition Pos, Context &Ctx, const Twine &Name = "") { @@ -1212,8 +1216,8 @@ public: } /// For isa/dyn_cast. 
- static bool classof(const Value *From); - Value *getPointerOperand() const; + LLVM_ABI static bool classof(const Value *From); + LLVM_ABI Value *getPointerOperand() const; Align getAlign() const { return cast(Val)->getAlign(); } bool isUnordered() const { return cast(Val)->isUnordered(); } bool isSimple() const { return cast(Val)->isSimple(); } @@ -1229,19 +1233,20 @@ public: /// Return true if this is a store from a volatile memory location. bool isVolatile() const { return cast(Val)->isVolatile(); } /// Specify whether this is a volatile store or not. - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); - static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align, - InsertPosition Pos, bool IsVolatile, Context &Ctx); + LLVM_ABI static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align, + InsertPosition Pos, bool IsVolatile, + Context &Ctx); static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align, InsertPosition Pos, Context &Ctx) { return create(V, Ptr, Align, Pos, /*IsVolatile=*/false, Ctx); } /// For isa/dyn_cast. 
- static bool classof(const Value *From); - Value *getValueOperand() const; - Value *getPointerOperand() const; + LLVM_ABI static bool classof(const Value *From); + LLVM_ABI Value *getValueOperand() const; + LLVM_ABI Value *getPointerOperand() const; Align getAlign() const { return cast(Val)->getAlign(); } bool isSimple() const { return cast(Val)->isSimple(); } bool isUnordered() const { return cast(Val)->isUnordered(); } @@ -1260,8 +1265,8 @@ class UnreachableInst final : public Instruction { } public: - static UnreachableInst *create(InsertPosition Pos, Context &Ctx); - static bool classof(const Value *From); + LLVM_ABI static UnreachableInst *create(InsertPosition Pos, Context &Ctx); + LLVM_ABI static bool classof(const Value *From); unsigned getNumSuccessors() const { return 0; } unsigned getUseOperandNo(const Use &Use) const final { llvm_unreachable("UnreachableInst has no operands!"); @@ -1280,12 +1285,13 @@ class ReturnInst final : public SingleLLVMInstructionImpl { Context &Ctx); public: - static ReturnInst *create(Value *RetVal, InsertPosition Pos, Context &Ctx); + LLVM_ABI static ReturnInst *create(Value *RetVal, InsertPosition Pos, + Context &Ctx); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Ret; } /// \Returns null if there is no return value. 
- Value *getReturnValue() const; + LLVM_ABI Value *getReturnValue() const; }; class CallBase : public SingleLLVMInstructionImpl { @@ -1303,7 +1309,7 @@ public: Opc == Instruction::ClassID::CallBr; } - FunctionType *getFunctionType() const; + LLVM_ABI FunctionType *getFunctionType() const; op_iterator data_operands_begin() { return op_begin(); } const_op_iterator data_operands_begin() const { @@ -1390,17 +1396,17 @@ public: } bool hasArgument(const Value *V) const { return is_contained(args(), V); } - Value *getCalledOperand() const; - Use getCalledOperandUse() const; + LLVM_ABI Value *getCalledOperand() const; + LLVM_ABI Use getCalledOperandUse() const; - Function *getCalledFunction() const; + LLVM_ABI Function *getCalledFunction() const; bool isIndirectCall() const { return cast(Val)->isIndirectCall(); } bool isCallee(Use U) const { return cast(Val)->isCallee(U.LLVMUse); } - Function *getCaller(); + LLVM_ABI Function *getCaller(); const Function *getCaller() const { return const_cast(this)->getCaller(); } @@ -1412,7 +1418,7 @@ public: return cast(Val)->getIntrinsicID(); } void setCalledOperand(Value *V) { getCalledOperandUse().set(V); } - void setCalledFunction(Function *F); + LLVM_ABI void setCalledFunction(Function *F); CallingConv::ID getCallingConv() const { return cast(Val)->getCallingConv(); } @@ -1428,9 +1434,9 @@ class CallInst : public CallBase { friend class IntrinsicInst; // For constructor public: - static CallInst *create(FunctionType *FTy, Value *Func, - ArrayRef Args, InsertPosition Pos, - Context &Ctx, const Twine &NameStr = ""); + LLVM_ABI static CallInst *create(FunctionType *FTy, Value *Func, + ArrayRef Args, InsertPosition Pos, + Context &Ctx, const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Call; @@ -1446,20 +1452,21 @@ class InvokeInst final : public CallBase { // create*() public: - static InvokeInst *create(FunctionType *FTy, Value *Func, - BasicBlock *IfNormal, BasicBlock 
*IfException, - ArrayRef Args, InsertPosition Pos, - Context &Ctx, const Twine &NameStr = ""); + LLVM_ABI static InvokeInst *create(FunctionType *FTy, Value *Func, + BasicBlock *IfNormal, + BasicBlock *IfException, + ArrayRef Args, InsertPosition Pos, + Context &Ctx, const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Invoke; } - BasicBlock *getNormalDest() const; - BasicBlock *getUnwindDest() const; - void setNormalDest(BasicBlock *BB); - void setUnwindDest(BasicBlock *BB); - LandingPadInst *getLandingPadInst() const; - BasicBlock *getSuccessor(unsigned SuccIdx) const; + LLVM_ABI BasicBlock *getNormalDest() const; + LLVM_ABI BasicBlock *getUnwindDest() const; + LLVM_ABI void setNormalDest(BasicBlock *BB); + LLVM_ABI void setUnwindDest(BasicBlock *BB); + LLVM_ABI LandingPadInst *getLandingPadInst() const; + LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const; void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) { assert(SuccIdx < 2 && "Successor # out of range for invoke!"); if (SuccIdx == 0) @@ -1481,25 +1488,25 @@ class CallBrInst final : public CallBase { // create*() public: - static CallBrInst *create(FunctionType *FTy, Value *Func, - BasicBlock *DefaultDest, - ArrayRef IndirectDests, - ArrayRef Args, InsertPosition Pos, - Context &Ctx, const Twine &NameStr = ""); + LLVM_ABI static CallBrInst *create(FunctionType *FTy, Value *Func, + BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, InsertPosition Pos, + Context &Ctx, const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CallBr; } unsigned getNumIndirectDests() const { return cast(Val)->getNumIndirectDests(); } - Value *getIndirectDestLabel(unsigned Idx) const; - Value *getIndirectDestLabelUse(unsigned Idx) const; - BasicBlock *getDefaultDest() const; - BasicBlock *getIndirectDest(unsigned Idx) const; - SmallVector getIndirectDests() const; - void 
setDefaultDest(BasicBlock *BB); - void setIndirectDest(unsigned Idx, BasicBlock *BB); - BasicBlock *getSuccessor(unsigned Idx) const; + LLVM_ABI Value *getIndirectDestLabel(unsigned Idx) const; + LLVM_ABI Value *getIndirectDestLabelUse(unsigned Idx) const; + LLVM_ABI BasicBlock *getDefaultDest() const; + LLVM_ABI BasicBlock *getIndirectDest(unsigned Idx) const; + LLVM_ABI SmallVector getIndirectDests() const; + LLVM_ABI void setDefaultDest(BasicBlock *BB); + LLVM_ABI void setIndirectDest(unsigned Idx, BasicBlock *BB); + LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const; unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } @@ -1512,9 +1519,10 @@ class LandingPadInst : public SingleLLVMInstructionImpl { friend class Context; // For constructor. public: - static LandingPadInst *create(Type *RetTy, unsigned NumReservedClauses, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static LandingPadInst *create(Type *RetTy, + unsigned NumReservedClauses, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// Return 'true' if this landingpad instruction is a /// cleanup. I.e., it should be run when unwinding even if its landing pad /// doesn't catch the exception. @@ -1522,14 +1530,14 @@ public: return cast(Val)->isCleanup(); } /// Indicate that this landingpad instruction is a cleanup. - void setCleanup(bool V); + LLVM_ABI void setCleanup(bool V); // TODO: We are not implementing addClause() because we have no way to revert // it for now. /// Get the value of the clause at index Idx. Use isCatch/isFilter to /// determine what type of clause this is. - Constant *getClause(unsigned Idx) const; + LLVM_ABI Constant *getClause(unsigned Idx) const; /// Return 'true' if the clause and index Idx is a catch clause. bool isCatch(unsigned Idx) const { @@ -1565,12 +1573,12 @@ public: /// /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst /// is a CatchPadInst. 
- Value *getParentPad() const; - void setParentPad(Value *ParentPad); + LLVM_ABI Value *getParentPad() const; + LLVM_ABI void setParentPad(Value *ParentPad); /// Return the Idx-th funcletpad argument. - Value *getArgOperand(unsigned Idx) const; + LLVM_ABI Value *getArgOperand(unsigned Idx) const; /// Set the Idx-th funcletpad argument. - void setArgOperand(unsigned Idx, Value *V); + LLVM_ABI void setArgOperand(unsigned Idx, Value *V); // TODO: Implement missing functions: arg_operands(). static bool classof(const Value *From) { @@ -1585,13 +1593,13 @@ class CatchPadInst : public FuncletPadInst { friend class Context; // For constructor. public: - CatchSwitchInst *getCatchSwitch() const; + LLVM_ABI CatchSwitchInst *getCatchSwitch() const; // TODO: We have not implemented setCatchSwitch() because we can't revert it // for now, as there is no CatchPadInst member function that can undo it. - static CatchPadInst *create(Value *ParentPad, ArrayRef Args, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static CatchPadInst *create(Value *ParentPad, ArrayRef Args, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CatchPad; } @@ -1603,9 +1611,10 @@ class CleanupPadInst : public FuncletPadInst { friend class Context; // For constructor. public: - static CleanupPadInst *create(Value *ParentPad, ArrayRef Args, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static CleanupPadInst *create(Value *ParentPad, + ArrayRef Args, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CleanupPad; } @@ -1619,16 +1628,17 @@ class CatchReturnInst friend class Context; // For constructor. 
public: - static CatchReturnInst *create(CatchPadInst *CatchPad, BasicBlock *BB, - InsertPosition Pos, Context &Ctx); - CatchPadInst *getCatchPad() const; - void setCatchPad(CatchPadInst *CatchPad); - BasicBlock *getSuccessor() const; - void setSuccessor(BasicBlock *NewSucc); + LLVM_ABI static CatchReturnInst *create(CatchPadInst *CatchPad, + BasicBlock *BB, InsertPosition Pos, + Context &Ctx); + LLVM_ABI CatchPadInst *getCatchPad() const; + LLVM_ABI void setCatchPad(CatchPadInst *CatchPad); + LLVM_ABI BasicBlock *getSuccessor() const; + LLVM_ABI void setSuccessor(BasicBlock *NewSucc); unsigned getNumSuccessors() { return cast(Val)->getNumSuccessors(); } - Value *getCatchSwitchParentPad() const; + LLVM_ABI Value *getCatchSwitchParentPad() const; static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CatchRet; } @@ -1642,22 +1652,22 @@ class CleanupReturnInst friend class Context; // For constructor. public: - static CleanupReturnInst *create(CleanupPadInst *CleanupPad, - BasicBlock *UnwindBB, InsertPosition Pos, - Context &Ctx); + LLVM_ABI static CleanupReturnInst *create(CleanupPadInst *CleanupPad, + BasicBlock *UnwindBB, + InsertPosition Pos, Context &Ctx); bool hasUnwindDest() const { return cast(Val)->hasUnwindDest(); } bool unwindsToCaller() const { return cast(Val)->unwindsToCaller(); } - CleanupPadInst *getCleanupPad() const; - void setCleanupPad(CleanupPadInst *CleanupPad); + LLVM_ABI CleanupPadInst *getCleanupPad() const; + LLVM_ABI void setCleanupPad(CleanupPadInst *CleanupPad); unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } - BasicBlock *getUnwindDest() const; - void setUnwindDest(BasicBlock *NewDest); + LLVM_ABI BasicBlock *getUnwindDest() const; + LLVM_ABI void setUnwindDest(BasicBlock *NewDest); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CleanupRet; @@ -1677,16 +1687,16 @@ class GetElementPtrInst final // create*() public: - static Value *create(Type *Ty, 
Value *Ptr, ArrayRef IdxList, - InsertPosition Pos, Context &Ctx, - const Twine &NameStr = ""); + LLVM_ABI static Value *create(Type *Ty, Value *Ptr, ArrayRef IdxList, + InsertPosition Pos, Context &Ctx, + const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::GetElementPtr; } - Type *getSourceElementType() const; - Type *getResultElementType() const; + LLVM_ABI Type *getSourceElementType() const; + LLVM_ABI Type *getResultElementType() const; unsigned getAddressSpace() const { return cast(Val)->getAddressSpace(); } @@ -1706,11 +1716,11 @@ public: return const_cast(this)->indices(); } - Value *getPointerOperand() const; + LLVM_ABI Value *getPointerOperand() const; static unsigned getPointerOperandIndex() { return llvm::GetElementPtrInst::getPointerOperandIndex(); } - Type *getPointerOperandType() const; + LLVM_ABI Type *getPointerOperandType() const; unsigned getPointerAddressSpace() const { return cast(Val)->getPointerAddressSpace(); } @@ -1750,12 +1760,12 @@ class CatchSwitchInst friend class Context; // For accessing the constructor in create*() public: - static CatchSwitchInst *create(Value *ParentPad, BasicBlock *UnwindBB, - unsigned NumHandlers, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); + LLVM_ABI static CatchSwitchInst * + create(Value *ParentPad, BasicBlock *UnwindBB, unsigned NumHandlers, + InsertPosition Pos, Context &Ctx, const Twine &Name = ""); - Value *getParentPad() const; - void setParentPad(Value *ParentPad); + LLVM_ABI Value *getParentPad() const; + LLVM_ABI void setParentPad(Value *ParentPad); bool hasUnwindDest() const { return cast(Val)->hasUnwindDest(); @@ -1763,8 +1773,8 @@ public: bool unwindsToCaller() const { return cast(Val)->unwindsToCaller(); } - BasicBlock *getUnwindDest() const; - void setUnwindDest(BasicBlock *UnwindDest); + LLVM_ABI BasicBlock *getUnwindDest() const; + LLVM_ABI void setUnwindDest(BasicBlock *UnwindDest); unsigned getNumHandlers() const { 
return cast(Val)->getNumHandlers(); @@ -1810,7 +1820,7 @@ public: return make_range(handler_begin(), handler_end()); } - void addHandler(BasicBlock *Dest); + LLVM_ABI void addHandler(BasicBlock *Dest); // TODO: removeHandler() cannot be reverted because there is no equivalent // addHandler() with a handler_iterator to specify the position. So we can't @@ -1839,8 +1849,9 @@ class ResumeInst : public SingleLLVMInstructionImpl { friend class Context; // For accessing the constructor in create*() public: - static ResumeInst *create(Value *Exn, InsertPosition Pos, Context &Ctx); - Value *getValue() const; + LLVM_ABI static ResumeInst *create(Value *Exn, InsertPosition Pos, + Context &Ctx); + LLVM_ABI Value *getValue() const; unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } @@ -1858,17 +1869,17 @@ public: static constexpr const unsigned DefaultPseudoIndex = llvm::SwitchInst::DefaultPseudoIndex; - static SwitchInst *create(Value *V, BasicBlock *Dest, unsigned NumCases, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest, + unsigned NumCases, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); - Value *getCondition() const; - void setCondition(Value *V); - BasicBlock *getDefaultDest() const; + LLVM_ABI Value *getCondition() const; + LLVM_ABI void setCondition(Value *V); + LLVM_ABI BasicBlock *getDefaultDest() const; bool defaultDestUnreachable() const { return cast(Val)->defaultDestUnreachable(); } - void setDefaultDest(BasicBlock *DefaultCase); + LLVM_ABI void setDefaultDest(BasicBlock *DefaultCase); unsigned getNumCases() const { return cast(Val)->getNumCases(); } @@ -1913,9 +1924,9 @@ public: return I; return case_default(); } - ConstantInt *findCaseDest(BasicBlock *BB); + LLVM_ABI ConstantInt *findCaseDest(BasicBlock *BB); - void addCase(ConstantInt *OnVal, BasicBlock *Dest); + LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest); /// This method 
removes the specified case and its successor from the switch /// instruction. Note that this operation may reorder the remaining cases at /// index idx and above. @@ -1923,13 +1934,13 @@ public: /// This action invalidates iterators for all cases following the one removed, /// including the case_end() iterator. It returns an iterator for the next /// case. - CaseIt removeCase(CaseIt It); + LLVM_ABI CaseIt removeCase(CaseIt It); unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } - BasicBlock *getSuccessor(unsigned Idx) const; - void setSuccessor(unsigned Idx, BasicBlock *NewSucc); + LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const; + LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Switch; } @@ -1950,11 +1961,13 @@ class UnaryOperator : public UnaryInstruction { Ctx) {} friend Context; // for constructor. public: - static Value *create(Instruction::Opcode Op, Value *OpV, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *OpV, - Value *CopyFrom, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); + LLVM_ABI static Value *create(Instruction::Opcode Op, Value *OpV, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op, + Value *OpV, Value *CopyFrom, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. static bool classof(const Value *From) { return From->getSubclassID() == ClassID::UnOp; @@ -2013,14 +2026,15 @@ protected: friend class Context; // For constructor. 
public: - static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); - static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *LHS, - Value *RHS, Value *CopyFrom, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op, + Value *LHS, Value *RHS, + Value *CopyFrom, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. static bool classof(const Value *From) { return From->getSubclassID() == ClassID::BinaryOperator; @@ -2033,7 +2047,7 @@ public: /// can also be treated as an add. class PossiblyDisjointInst : public BinaryOperator { public: - void setIsDisjoint(bool B); + LLVM_ABI void setIsDisjoint(bool B); bool isDisjoint() const { return cast(Val)->isDisjoint(); } @@ -2066,24 +2080,24 @@ public: cast(Val)->setOperation(Op); } Align getAlign() const { return cast(Val)->getAlign(); } - void setAlignment(Align Align); + LLVM_ABI void setAlignment(Align Align); bool isVolatile() const { return cast(Val)->isVolatile(); } - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); AtomicOrdering getOrdering() const { return cast(Val)->getOrdering(); } - void setOrdering(AtomicOrdering Ordering); + LLVM_ABI void setOrdering(AtomicOrdering Ordering); SyncScope::ID getSyncScopeID() const { return cast(Val)->getSyncScopeID(); } - void setSyncScopeID(SyncScope::ID SSID); - Value *getPointerOperand(); + LLVM_ABI void setSyncScopeID(SyncScope::ID SSID); + LLVM_ABI Value *getPointerOperand(); const Value *getPointerOperand() const { return const_cast(this)->getPointerOperand(); } - Value *getValOperand(); + LLVM_ABI Value *getValOperand(); const Value *getValOperand() const { return const_cast(this)->getValOperand(); } @@ -2097,11 +2111,10 @@ 
public: return From->getSubclassID() == ClassID::AtomicRMW; } - static AtomicRMWInst *create(BinOp Op, Value *Ptr, Value *Val, - MaybeAlign Align, AtomicOrdering Ordering, - InsertPosition Pos, Context &Ctx, - SyncScope::ID SSID = SyncScope::System, - const Twine &Name = ""); + LLVM_ABI static AtomicRMWInst * + create(BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, + AtomicOrdering Ordering, InsertPosition Pos, Context &Ctx, + SyncScope::ID SSID = SyncScope::System, const Twine &Name = ""); }; class AtomicCmpXchgInst @@ -2119,17 +2132,17 @@ public: return cast(Val)->getAlign(); } - void setAlignment(Align Align); + LLVM_ABI void setAlignment(Align Align); /// Return true if this is a cmpxchg from a volatile memory /// location. bool isVolatile() const { return cast(Val)->isVolatile(); } /// Specify whether this is a volatile cmpxchg. - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); /// Return true if this cmpxchg may spuriously fail. bool isWeak() const { return cast(Val)->isWeak(); } - void setWeak(bool IsWeak); + LLVM_ABI void setWeak(bool IsWeak); static bool isValidSuccessOrdering(AtomicOrdering Ordering) { return llvm::AtomicCmpXchgInst::isValidSuccessOrdering(Ordering); } @@ -2139,30 +2152,30 @@ public: AtomicOrdering getSuccessOrdering() const { return cast(Val)->getSuccessOrdering(); } - void setSuccessOrdering(AtomicOrdering Ordering); + LLVM_ABI void setSuccessOrdering(AtomicOrdering Ordering); AtomicOrdering getFailureOrdering() const { return cast(Val)->getFailureOrdering(); } - void setFailureOrdering(AtomicOrdering Ordering); + LLVM_ABI void setFailureOrdering(AtomicOrdering Ordering); AtomicOrdering getMergedOrdering() const { return cast(Val)->getMergedOrdering(); } SyncScope::ID getSyncScopeID() const { return cast(Val)->getSyncScopeID(); } - void setSyncScopeID(SyncScope::ID SSID); - Value *getPointerOperand(); + LLVM_ABI void setSyncScopeID(SyncScope::ID SSID); + LLVM_ABI Value *getPointerOperand(); const Value 
*getPointerOperand() const { return const_cast(this)->getPointerOperand(); } - Value *getCompareOperand(); + LLVM_ABI Value *getCompareOperand(); const Value *getCompareOperand() const { return const_cast(this)->getCompareOperand(); } - Value *getNewValOperand(); + LLVM_ABI Value *getNewValOperand(); const Value *getNewValOperand() const { return const_cast(this)->getNewValOperand(); } @@ -2172,7 +2185,7 @@ public: return cast(Val)->getPointerAddressSpace(); } - static AtomicCmpXchgInst * + LLVM_ABI static AtomicCmpXchgInst * create(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, InsertPosition Pos, Context &Ctx, @@ -2190,9 +2203,10 @@ class AllocaInst final : public UnaryInstruction { friend class Context; // For constructor. public: - static AllocaInst *create(Type *Ty, unsigned AddrSpace, InsertPosition Pos, - Context &Ctx, Value *ArraySize = nullptr, - const Twine &Name = ""); + LLVM_ABI static AllocaInst *create(Type *Ty, unsigned AddrSpace, + InsertPosition Pos, Context &Ctx, + Value *ArraySize = nullptr, + const Twine &Name = ""); /// Return true if there is an allocation size parameter to the allocation /// instruction that is not 1. @@ -2201,12 +2215,12 @@ public: } /// Get the number of elements allocated. For a simple allocation of a single /// element, this will return a constant 1 value. - Value *getArraySize(); + LLVM_ABI Value *getArraySize(); const Value *getArraySize() const { return const_cast(this)->getArraySize(); } /// Overload to return most specific pointer type. - PointerType *getType() const; + LLVM_ABI PointerType *getType() const; /// Return the address space for the allocation. unsigned getAddressSpace() const { return cast(Val)->getAddressSpace(); @@ -2222,14 +2236,14 @@ public: return cast(Val)->getAllocationSizeInBits(DL); } /// Return the type that is being allocated by the instruction. 
- Type *getAllocatedType() const; + LLVM_ABI Type *getAllocatedType() const; /// for use only in special circumstances that need to generically /// transform a whole instruction (eg: IR linking and vectorization). - void setAllocatedType(Type *Ty); + LLVM_ABI void setAllocatedType(Type *Ty); /// Return the alignment of the memory that is being allocated by the /// instruction. Align getAlign() const { return cast(Val)->getAlign(); } - void setAlignment(Align Align); + LLVM_ABI void setAlignment(Align Align); /// Return true if this alloca is in the entry block of the function and is a /// constant size. If so, the code generator will fold it into the /// prolog/epilog code, so it is basically free. @@ -2242,7 +2256,7 @@ public: return cast(Val)->isUsedWithInAlloca(); } /// Specify whether this alloca is used to represent the arguments to a call. - void setUsedWithInAlloca(bool V); + LLVM_ABI void setUsedWithInAlloca(bool V); static bool classof(const Value *From) { if (auto *I = dyn_cast(From)) @@ -2293,13 +2307,13 @@ class CastInst : public UnaryInstruction { friend Context; // for SBCastInstruction() public: - static Value *create(Type *DestTy, Opcode Op, Value *Operand, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Type *DestTy, Opcode Op, Value *Operand, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. - static bool classof(const Value *From); - Type *getSrcTy() const; - Type *getDestTy() const; + LLVM_ABI static bool classof(const Value *From); + LLVM_ABI Type *getSrcTy() const; + LLVM_ABI Type *getDestTy() const; }; /// Instruction that can have a nneg flag (zext/uitofp). @@ -2308,7 +2322,7 @@ public: bool hasNonNeg() const { return cast(Val)->hasNonNeg(); } - void setNonNeg(bool B); + LLVM_ABI void setNonNeg(bool B); /// For isa/dyn_cast. 
static bool classof(const Value *From) { if (auto *I = dyn_cast(From)) { @@ -2383,15 +2397,15 @@ class PHINode final : public SingleLLVMInstructionImpl { struct LLVMBBToBB { Context &Ctx; LLVMBBToBB(Context &Ctx) : Ctx(Ctx) {} - BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const; + LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const; }; public: - static PHINode *create(Type *Ty, unsigned NumReservedValues, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static PHINode *create(Type *Ty, unsigned NumReservedValues, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. - static bool classof(const Value *From); + LLVM_ABI static bool classof(const Value *From); using const_block_iterator = mapped_iterator; @@ -2417,35 +2431,36 @@ public: unsigned getNumIncomingValues() const { return cast(Val)->getNumIncomingValues(); } - Value *getIncomingValue(unsigned Idx) const; - void setIncomingValue(unsigned Idx, Value *V); + LLVM_ABI Value *getIncomingValue(unsigned Idx) const; + LLVM_ABI void setIncomingValue(unsigned Idx, Value *V); static unsigned getOperandNumForIncomingValue(unsigned Idx) { return llvm::PHINode::getOperandNumForIncomingValue(Idx); } static unsigned getIncomingValueNumForOperand(unsigned Idx) { return llvm::PHINode::getIncomingValueNumForOperand(Idx); } - BasicBlock *getIncomingBlock(unsigned Idx) const; - BasicBlock *getIncomingBlock(const Use &U) const; + LLVM_ABI BasicBlock *getIncomingBlock(unsigned Idx) const; + LLVM_ABI BasicBlock *getIncomingBlock(const Use &U) const; - void setIncomingBlock(unsigned Idx, BasicBlock *BB); + LLVM_ABI void setIncomingBlock(unsigned Idx, BasicBlock *BB); - void addIncoming(Value *V, BasicBlock *BB); + LLVM_ABI void addIncoming(Value *V, BasicBlock *BB); - Value *removeIncomingValue(unsigned Idx); - Value *removeIncomingValue(BasicBlock *BB); + LLVM_ABI Value *removeIncomingValue(unsigned Idx); + LLVM_ABI Value *removeIncomingValue(BasicBlock *BB); 
- int getBasicBlockIndex(const BasicBlock *BB) const; - Value *getIncomingValueForBlock(const BasicBlock *BB) const; + LLVM_ABI int getBasicBlockIndex(const BasicBlock *BB) const; + LLVM_ABI Value *getIncomingValueForBlock(const BasicBlock *BB) const; - Value *hasConstantValue() const; + LLVM_ABI Value *hasConstantValue() const; bool hasConstantOrUndefValue() const { return cast(Val)->hasConstantOrUndefValue(); } bool isComplete() const { return cast(Val)->isComplete(); } - void replaceIncomingBlockWith(const BasicBlock *Old, BasicBlock *New); - void removeIncomingValueIf(function_ref Predicate); + LLVM_ABI void replaceIncomingBlockWith(const BasicBlock *Old, + BasicBlock *New); + LLVM_ABI void removeIncomingValueIf(function_ref Predicate); // TODO: Implement // void copyIncomingBlocks(iterator_range BBRange, // uint32_t ToIdx = 0) @@ -2471,21 +2486,23 @@ protected: CmpInst(llvm::CmpInst *CI, Context &Ctx, ClassID Id, Opcode Opc) : SingleLLVMInstructionImpl(Id, Opc, CI, Ctx) {} friend Context; // for CmpInst() - static Value *createCommon(Value *Cond, Value *True, Value *False, - const Twine &Name, IRBuilder<> &Builder, - Context &Ctx); + LLVM_ABI static Value *createCommon(Value *Cond, Value *True, Value *False, + const Twine &Name, IRBuilder<> &Builder, + Context &Ctx); public: using Predicate = llvm::CmpInst::Predicate; - static Value *create(Predicate Pred, Value *S1, Value *S2, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - static Value *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, - const Instruction *FlagsSource, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); - void setPredicate(Predicate P); - void swapOperands(); + LLVM_ABI static Value *create(Predicate Pred, Value *S1, Value *S2, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI static Value *createWithCopiedFlags(Predicate Pred, Value *S1, + Value *S2, + const Instruction *FlagsSource, + InsertPosition Pos, Context &Ctx, + const 
Twine &Name = ""); + LLVM_ABI void setPredicate(Predicate P); + LLVM_ABI void swapOperands(); WRAP_MEMBER(getPredicate); WRAP_BOTH(isFPPredicate); @@ -2517,7 +2534,7 @@ public: } /// Create a result type for fcmp/icmp - static Type *makeCmpResultType(Type *OpndType); + LLVM_ABI static Type *makeCmpResultType(Type *OpndType); #ifndef NDEBUG void dumpOS(raw_ostream &OS) const override; @@ -2533,7 +2550,7 @@ class ICmpInst : public CmpInst { using LLVMValType = llvm::ICmpInst; public: - void swapOperands(); + LLVM_ABI void swapOperands(); WRAP_BOTH(getSignedPredicate); WRAP_BOTH(getUnsignedPredicate); @@ -2570,7 +2587,7 @@ class FCmpInst : public CmpInst { using LLVMValType = llvm::FCmpInst; public: - void swapOperands(); + LLVM_ABI void swapOperands(); WRAP_BOTH(isEquality); WRAP_MEMBER(isCommutative); diff --git a/llvm/include/llvm/SandboxIR/Module.h b/llvm/include/llvm/SandboxIR/Module.h index 429bb04539bc..275960392211 100644 --- a/llvm/include/llvm/SandboxIR/Module.h +++ b/llvm/include/llvm/SandboxIR/Module.h @@ -11,6 +11,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Module.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -38,7 +39,7 @@ class Module { public: Context &getContext() const { return Ctx; } - Function *getFunction(StringRef Name) const; + LLVM_ABI Function *getFunction(StringRef Name) const; const DataLayout &getDataLayout() const { return LLVMM.getDataLayout(); } @@ -50,7 +51,8 @@ public: /// does not exist, return null. If AllowInternal is set to true, this /// function will return types that have InternalLinkage. By default, these /// types are not returned. 
- GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const; + LLVM_ABI GlobalVariable *getGlobalVariable(StringRef Name, + bool AllowInternal) const; GlobalVariable *getGlobalVariable(StringRef Name) const { return getGlobalVariable(Name, /*AllowInternal=*/false); } @@ -66,12 +68,12 @@ public: /// Return the global alias in the module with the specified name, of /// arbitrary type. This method returns null if a global with the specified /// name is not found. - GlobalAlias *getNamedAlias(StringRef Name) const; + LLVM_ABI GlobalAlias *getNamedAlias(StringRef Name) const; /// Return the global ifunc in the module with the specified name, of /// arbitrary type. This method returns null if a global with the specified /// name is not found. - GlobalIFunc *getNamedIFunc(StringRef Name) const; + LLVM_ABI GlobalIFunc *getNamedIFunc(StringRef Name) const; // TODO: Missing removeGlobalVariable() eraseGlobalVariable(), // insertGlobalVariable() diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 55a0301f4756..6fccaf04b270 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -18,6 +18,7 @@ #ifndef LLVM_SANDBOXIR_PASSMANAGER_H #define LLVM_SANDBOXIR_PASSMANAGER_H +#include "llvm/Support/Compiler.h" #include #include "llvm/ADT/DenseMap.h" @@ -201,7 +202,7 @@ public: } }; -class FunctionPassManager final +class LLVM_ABI FunctionPassManager final : public PassManager { public: FunctionPassManager(StringRef Name) : PassManager(Name) {} @@ -211,7 +212,8 @@ public: bool runOnFunction(Function &F, const Analyses &A) final; }; -class RegionPassManager final : public PassManager { +class LLVM_ABI RegionPassManager final + : public PassManager { public: RegionPassManager(StringRef Name) : PassManager(Name) {} RegionPassManager(StringRef Name, StringRef Pipeline, diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index 
f86199ab6c22..d70f21277fb1 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -9,6 +9,7 @@ #ifndef LLVM_SANDBOXIR_REGION_H #define LLVM_SANDBOXIR_REGION_H +#include "llvm/Support/Compiler.h" #include #include "llvm/ADT/SetVector.h" @@ -30,7 +31,7 @@ class ScoreBoard { /// The cost of all instructions that got removed and replaced by new ones. InstructionCost BeforeCost = 0; /// Helper for both add() and remove(). \Returns the TTI cost of \p I. - InstructionCost getCost(Instruction *I) const; + LLVM_ABI InstructionCost getCost(Instruction *I) const; /// No need to allow copies. ScoreBoard(const ScoreBoard &) = delete; const ScoreBoard &operator=(const ScoreBoard &) = delete; @@ -40,7 +41,7 @@ public: /// Mark \p I as a newly added instruction to the region. void add(Instruction *I) { AfterCost += getCost(I); } /// Mark \p I as a deleted instruction from the region. - void remove(Instruction *I); + LLVM_ABI void remove(Instruction *I); /// \Returns the cost of the newly added instructions. InstructionCost getAfterCost() const { return AfterCost; } /// \Returns the cost of the Removed instructions. @@ -122,12 +123,12 @@ class Region { /// add an instruction to the auxiliary vector it does get tagged as being a /// member of the region (for ownership reasons), but its cost does not get /// counted because the instruction hasn't been added in the "normal" way. - void addImpl(Instruction *I, bool IgnoreCost); + LLVM_ABI void addImpl(Instruction *I, bool IgnoreCost); /// Adds I to the set. This is the main API for adding an instruction to the /// region. void add(Instruction *I) { addImpl(I, /*IgnoreCost=*/false); } /// Removes I from the set. - void remove(Instruction *I); + LLVM_ABI void remove(Instruction *I); friend class Context; // The callbacks need to call add() and remove(). friend class RegionInternalsAttorney; // For unit tests. friend class RegionsFromBBs; // For add(). 
@@ -141,8 +142,8 @@ class Region { void removeFromAux(Instruction *I); public: - Region(Context &Ctx, TargetTransformInfo &TTI); - ~Region(); + LLVM_ABI Region(Context &Ctx, TargetTransformInfo &TTI); + LLVM_ABI ~Region(); Context &getContext() const { return Ctx; } /// Returns true if I is in the Region. @@ -150,18 +151,18 @@ public: /// Returns true if the Region has no instructions. bool empty() const { return Insts.empty(); } /// Set the auxiliary vector. - void setAux(ArrayRef Aux); + LLVM_ABI void setAux(ArrayRef Aux); /// \Returns the auxiliary vector. const SmallVector &getAux() const { return Aux; } /// Clears all auxiliary data. - void clearAux(); + LLVM_ABI void clearAux(); using iterator = decltype(Insts.begin()); iterator begin() { return Insts.begin(); } iterator end() { return Insts.end(); } iterator_range insts() { return make_range(begin(), end()); } - static SmallVector> + LLVM_ABI static SmallVector> createRegionsFromMD(Function &F, TargetTransformInfo &TTI); /// \Returns the ScoreBoard data structure that keeps track of instr costs. 
const ScoreBoard &getScoreboard() const { return Scoreboard; } diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index f7b469965eae..9a2c9dd51648 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -46,6 +46,8 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/SandboxIR/Use.h" +#include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include @@ -149,7 +151,7 @@ public: #endif }; -class PHIRemoveIncoming : public IRChangeBase { +class LLVM_ABI PHIRemoveIncoming : public IRChangeBase { PHINode *PHI; unsigned RemovedIdx; Value *RemovedV; @@ -165,7 +167,7 @@ public: #endif }; -class PHIAddIncoming : public IRChangeBase { +class LLVM_ABI PHIAddIncoming : public IRChangeBase { PHINode *PHI; unsigned Idx; @@ -179,7 +181,7 @@ public: #endif }; -class CmpSwapOperands : public IRChangeBase { +class LLVM_ABI CmpSwapOperands : public IRChangeBase { CmpInst *Cmp; public: @@ -210,7 +212,7 @@ public: #endif }; -class EraseFromParent : public IRChangeBase { +class LLVM_ABI EraseFromParent : public IRChangeBase { /// Contains all the data we need to restore an "erased" (i.e., detached) /// instruction: the instruction itself and its operands in order. struct InstrAndOperands { @@ -242,7 +244,7 @@ public: #endif }; -class RemoveFromParent : public IRChangeBase { +class LLVM_ABI RemoveFromParent : public IRChangeBase { /// The instruction that is about to get removed. Instruction *RemovedI = nullptr; /// This is either the next instr, or the parent BB if at the end of the BB. 
@@ -327,7 +329,7 @@ public: #endif }; -class CatchSwitchAddHandler : public IRChangeBase { +class LLVM_ABI CatchSwitchAddHandler : public IRChangeBase { CatchSwitchInst *CSI; unsigned HandlerIdx; @@ -344,7 +346,7 @@ public: #endif // NDEBUG }; -class SwitchAddCase : public IRChangeBase { +class LLVM_ABI SwitchAddCase : public IRChangeBase { SwitchInst *Switch; ConstantInt *Val; @@ -359,7 +361,7 @@ public: #endif // NDEBUG }; -class SwitchRemoveCase : public IRChangeBase { +class LLVM_ABI SwitchRemoveCase : public IRChangeBase { SwitchInst *Switch; struct Case { ConstantInt *Val; @@ -378,7 +380,7 @@ public: #endif // NDEBUG }; -class MoveInstr : public IRChangeBase { +class LLVM_ABI MoveInstr : public IRChangeBase { /// The instruction that moved. Instruction *MovedI; /// This is either the next instruction in the block, or the parent BB if at @@ -395,7 +397,7 @@ public: #endif // NDEBUG }; -class InsertIntoBB final : public IRChangeBase { +class LLVM_ABI InsertIntoBB final : public IRChangeBase { Instruction *InsertedI = nullptr; public: @@ -408,7 +410,7 @@ public: #endif // NDEBUG }; -class CreateAndInsertInst final : public IRChangeBase { +class LLVM_ABI CreateAndInsertInst final : public IRChangeBase { Instruction *NewI = nullptr; public: @@ -421,7 +423,7 @@ public: #endif }; -class ShuffleVectorSetMask final : public IRChangeBase { +class LLVM_ABI ShuffleVectorSetMask final : public IRChangeBase { ShuffleVectorInst *SVI; SmallVector PrevMask; @@ -472,7 +474,7 @@ public: { } - ~Tracker(); + LLVM_ABI ~Tracker(); Context &getContext() const { return Ctx; } /// \Returns true if there are no changes tracked. bool empty() const { return Changes.empty(); } @@ -506,11 +508,11 @@ public: /// \Returns the current state of the tracker. TrackerState getState() const { return State; } /// Turns on IR tracking. - void save(); + LLVM_ABI void save(); /// Stops tracking and accept changes. 
- void accept(); + LLVM_ABI void accept(); /// Stops tracking and reverts to saved state. - void revert(); + LLVM_ABI void revert(); #ifndef NDEBUG void dump(raw_ostream &OS) const; diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index f90ae096443b..d9c5e6c098da 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -264,18 +265,18 @@ public: /// If this is a vector type, return the element type, otherwise return /// 'this'. - Type *getScalarType() const; + LLVM_ABI Type *getScalarType() const; // TODO: ADD MISSING - static Type *getInt64Ty(Context &Ctx); - static Type *getInt32Ty(Context &Ctx); - static Type *getInt16Ty(Context &Ctx); - static Type *getInt8Ty(Context &Ctx); - static Type *getInt1Ty(Context &Ctx); - static Type *getDoubleTy(Context &Ctx); - static Type *getFloatTy(Context &Ctx); - static Type *getHalfTy(Context &Ctx); + LLVM_ABI static Type *getInt64Ty(Context &Ctx); + LLVM_ABI static Type *getInt32Ty(Context &Ctx); + LLVM_ABI static Type *getInt16Ty(Context &Ctx); + LLVM_ABI static Type *getInt8Ty(Context &Ctx); + LLVM_ABI static Type *getInt1Ty(Context &Ctx); + LLVM_ABI static Type *getDoubleTy(Context &Ctx); + LLVM_ABI static Type *getFloatTy(Context &Ctx); + LLVM_ABI static Type *getHalfTy(Context &Ctx); // TODO: missing get* /// Get the address space of this pointer or pointer vector type. 
@@ -293,7 +294,7 @@ class PointerType : public Type { public: // TODO: add missing functions - static PointerType *get(Context &Ctx, unsigned AddressSpace); + LLVM_ABI static PointerType *get(Context &Ctx, unsigned AddressSpace); static bool classof(const Type *From) { return isa(From->LLVMTy); @@ -302,7 +303,7 @@ public: class ArrayType : public Type { public: - static ArrayType *get(Type *ElementType, uint64_t NumElements); + LLVM_ABI static ArrayType *get(Type *ElementType, uint64_t NumElements); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); @@ -312,8 +313,8 @@ public: class StructType : public Type { public: /// This static method is the primary way to create a literal StructType. - static StructType *get(Context &Ctx, ArrayRef Elements, - bool IsPacked = false); + LLVM_ABI static StructType *get(Context &Ctx, ArrayRef Elements, + bool IsPacked = false); bool isPacked() const { return cast(LLVMTy)->isPacked(); } @@ -325,13 +326,13 @@ public: class VectorType : public Type { public: - static VectorType *get(Type *ElementType, ElementCount EC); + LLVM_ABI static VectorType *get(Type *ElementType, ElementCount EC); static VectorType *get(Type *ElementType, unsigned NumElements, bool Scalable) { return VectorType::get(ElementType, ElementCount::get(NumElements, Scalable)); } - Type *getElementType() const; + LLVM_ABI Type *getElementType() const; static VectorType *get(Type *ElementType, const VectorType *Other) { return VectorType::get(ElementType, Other->getElementCount()); @@ -340,13 +341,14 @@ public: inline ElementCount getElementCount() const { return cast(LLVMTy)->getElementCount(); } - static VectorType *getInteger(VectorType *VTy); - static VectorType *getExtendedElementVectorType(VectorType *VTy); - static VectorType *getTruncatedElementVectorType(VectorType *VTy); - static VectorType *getSubdividedVectorType(VectorType *VTy, int NumSubdivs); - static VectorType *getHalfElementsVectorType(VectorType 
*VTy); - static VectorType *getDoubleElementsVectorType(VectorType *VTy); - static bool isValidElementType(Type *ElemTy); + LLVM_ABI static VectorType *getInteger(VectorType *VTy); + LLVM_ABI static VectorType *getExtendedElementVectorType(VectorType *VTy); + LLVM_ABI static VectorType *getTruncatedElementVectorType(VectorType *VTy); + LLVM_ABI static VectorType *getSubdividedVectorType(VectorType *VTy, + int NumSubdivs); + LLVM_ABI static VectorType *getHalfElementsVectorType(VectorType *VTy); + LLVM_ABI static VectorType *getDoubleElementsVectorType(VectorType *VTy); + LLVM_ABI static bool isValidElementType(Type *ElemTy); static bool classof(const Type *From) { return isa(From->LLVMTy); @@ -355,7 +357,7 @@ public: class FixedVectorType : public VectorType { public: - static FixedVectorType *get(Type *ElementType, unsigned NumElts); + LLVM_ABI static FixedVectorType *get(Type *ElementType, unsigned NumElts); static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) { return get(ElementType, FVTy->getNumElements()); @@ -399,7 +401,8 @@ public: class ScalableVectorType : public VectorType { public: - static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts); + LLVM_ABI static ScalableVectorType *get(Type *ElementType, + unsigned MinNumElts); static ScalableVectorType *get(Type *ElementType, const ScalableVectorType *SVTy) { @@ -462,7 +465,7 @@ public: /// Integer representation type class IntegerType : public Type { public: - static IntegerType *get(Context &C, unsigned NumBits); + LLVM_ABI static IntegerType *get(Context &C, unsigned NumBits); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h index c4a774aa3a89..5c02c4f2b349 100644 --- a/llvm/include/llvm/SandboxIR/Use.h +++ b/llvm/include/llvm/SandboxIR/Use.h @@ -14,6 +14,7 @@ #define LLVM_SANDBOXIR_USE_H #include "llvm/IR/Use.h" +#include 
"llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" namespace llvm::sandboxir { @@ -49,11 +50,11 @@ class Use { public: operator Value *() const { return get(); } - Value *get() const; - void set(Value *V); + LLVM_ABI Value *get() const; + LLVM_ABI void set(Value *V); class User *getUser() const { return Usr; } - unsigned getOperandNo() const; - void swap(Use &OtherUse); + LLVM_ABI unsigned getOperandNo() const; + LLVM_ABI void swap(Use &OtherUse); Context *getContext() const { return Ctx; } bool operator==(const Use &Other) const { assert(Ctx == Other.Ctx && "Contexts differ!"); diff --git a/llvm/include/llvm/SandboxIR/User.h b/llvm/include/llvm/SandboxIR/User.h index 80e672de3490..c552e2e3378b 100644 --- a/llvm/include/llvm/SandboxIR/User.h +++ b/llvm/include/llvm/SandboxIR/User.h @@ -13,6 +13,7 @@ #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Use.h" #include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -36,8 +37,8 @@ public: using iterator_category = std::input_iterator_tag; OperandUseIterator() = default; - value_type operator*() const; - OperandUseIterator &operator++(); + LLVM_ABI value_type operator*() const; + LLVM_ABI OperandUseIterator &operator++(); OperandUseIterator operator++(int) { auto Copy = *this; this->operator++(); @@ -49,13 +50,13 @@ public: bool operator!=(const OperandUseIterator &Other) const { return !(*this == Other); } - OperandUseIterator operator+(unsigned Num) const; - OperandUseIterator operator-(unsigned Num) const; - int operator-(const OperandUseIterator &Other) const; + LLVM_ABI OperandUseIterator operator+(unsigned Num) const; + LLVM_ABI OperandUseIterator operator-(unsigned Num) const; + LLVM_ABI int operator-(const OperandUseIterator &Other) const; }; /// A sandboxir::User has operands. 
-class User : public Value { +class LLVM_ABI User : public Value { protected: User(ClassID ID, llvm::Value *V, Context &Ctx) : Value(ID, V, Ctx) {} diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index dbd0208b4f3f..dd0bc76db3e3 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -12,6 +12,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Use.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -50,7 +51,7 @@ public: UserUseIterator() = default; value_type operator*() const { return Use; } - UserUseIterator &operator++(); + LLVM_ABI UserUseIterator &operator++(); bool operator==(const UserUseIterator &Other) const { return Use == Other.Use; } @@ -179,7 +180,7 @@ protected: void clearValue() { Val = nullptr; } template friend class LLVMOpUserItToSBTy; - Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx); + LLVM_ABI Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx); /// Disable copies. Value(const Value &) = delete; Value &operator=(const Value &) = delete; @@ -191,7 +192,7 @@ public: using use_iterator = UserUseIterator; using const_use_iterator = UserUseIterator; - use_iterator use_begin(); + LLVM_ABI use_iterator use_begin(); const_use_iterator use_begin() const { return const_cast(this)->use_begin(); } @@ -215,7 +216,7 @@ public: using user_iterator = mapped_iterator; using const_user_iterator = user_iterator; - user_iterator user_begin(); + LLVM_ABI user_iterator user_begin(); user_iterator user_end() { return user_iterator(Use(nullptr, nullptr, Ctx), UseToUser()); } @@ -234,7 +235,7 @@ public: } /// \Returns the number of user edges (not necessarily to unique users). /// WARNING: This is a linear-time operation. - unsigned getNumUses() const; + LLVM_ABI unsigned getNumUses() const; /// Return true if this value has N uses or more. /// This is logically equivalent to getNumUses() >= N. 
/// WARNING: This can be expensive, as it is linear to the number of users. @@ -256,13 +257,14 @@ public: return Cnt == Num; } - Type *getType() const; + LLVM_ABI Type *getType() const; Context &getContext() const { return Ctx; } - void replaceUsesWithIf(Value *OtherV, - llvm::function_ref ShouldReplace); - void replaceAllUsesWith(Value *Other); + LLVM_ABI void + replaceUsesWithIf(Value *OtherV, + llvm::function_ref ShouldReplace); + LLVM_ABI void replaceAllUsesWith(Value *Other); /// \Returns the LLVM IR name of the bottom-most LLVM value. StringRef getName() const { return Val->getName(); } diff --git a/llvm/lib/SandboxIR/Constant.cpp b/llvm/lib/SandboxIR/Constant.cpp index 82cf0876d580..9de88ef2cf0a 100644 --- a/llvm/lib/SandboxIR/Constant.cpp +++ b/llvm/lib/SandboxIR/Constant.cpp @@ -305,35 +305,14 @@ GlobalT &GlobalWithNodeAPI:: } // Explicit instantiations. -template class GlobalWithNodeAPI; -template class GlobalWithNodeAPI; -template class GlobalWithNodeAPI; -template class GlobalWithNodeAPI; - -#if defined(_MSC_VER) && !defined(__clang__) -// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB -template LLVM_EXPORT_TEMPLATE GlobalIFunc & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalIFunc - &LLVMGV) - const; -template LLVM_EXPORT_TEMPLATE Function & -GlobalWithNodeAPI:: - LLVMGVToGV::operator()(llvm::Function &LLVMGV) const; - -template LLVM_EXPORT_TEMPLATE GlobalVariable &GlobalWithNodeAPI< - GlobalVariable, llvm::GlobalVariable, GlobalObject, - llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV) - const; -template LLVM_EXPORT_TEMPLATE GlobalAlias & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalAlias - &LLVMGV) const; -#endif +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>; +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + Function, llvm::Function, GlobalObject, llvm::GlobalObject>; +template class 
LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>; +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>; void GlobalIFunc::setResolver(Constant *Resolver) { Ctx.getTracker() From 2652d1b2fd65950a66f37ed6d5ed9c4ffabacbee Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Wed, 11 Jun 2025 09:19:47 -0700 Subject: [PATCH 0007/1322] [llvm] annotate interfaces in llvm/TextAPI for DLL export (#143447) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose This patch is one in a series of code-mods that annotate LLVM’s public interface for export. This patch annotates the `llvm/TextAPI` library. These annotations currently have no meaningful impact on the LLVM build; however, they are a prerequisite to support an LLVM Windows DLL (shared library) build. ## Background This effort is tracked in #109483. Additional context is provided in [this discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307), and documentation for `LLVM_ABI` and related annotations is found in the LLVM repo [here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst). These changes were generated automatically using the [Interface Definition Scanner (IDS)](https://github.com/compnerd/ids) tool, followed by formatting with `git clang-format`. ## Validation Local builds and tests to validate cross-platform compatibility.
This included llvm, clang, and lldb on the following configurations: - Windows with MSVC - Windows with Clang - Linux with GCC - Linux with Clang - Darwin with Clang --- llvm/include/llvm/TextAPI/Architecture.h | 17 ++++--- llvm/include/llvm/TextAPI/ArchitectureSet.h | 13 +++--- llvm/include/llvm/TextAPI/DylibReader.h | 10 ++-- llvm/include/llvm/TextAPI/InterfaceFile.h | 34 +++++++------- llvm/include/llvm/TextAPI/PackedVersion.h | 9 ++-- llvm/include/llvm/TextAPI/Platform.h | 17 +++---- llvm/include/llvm/TextAPI/Record.h | 18 +++---- llvm/include/llvm/TextAPI/RecordVisitor.h | 5 +- llvm/include/llvm/TextAPI/RecordsSlice.h | 52 ++++++++++++--------- llvm/include/llvm/TextAPI/Symbol.h | 8 ++-- llvm/include/llvm/TextAPI/SymbolSet.h | 11 +++-- llvm/include/llvm/TextAPI/Target.h | 15 +++--- llvm/include/llvm/TextAPI/TextAPIError.h | 3 +- llvm/include/llvm/TextAPI/TextAPIReader.h | 5 +- llvm/include/llvm/TextAPI/TextAPIWriter.h | 8 ++-- llvm/include/llvm/TextAPI/Utils.h | 21 +++++---- 16 files changed, 138 insertions(+), 108 deletions(-) diff --git a/llvm/include/llvm/TextAPI/Architecture.h b/llvm/include/llvm/TextAPI/Architecture.h index 978359995074..7a7f5416fe7c 100644 --- a/llvm/include/llvm/TextAPI/Architecture.h +++ b/llvm/include/llvm/TextAPI/Architecture.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_ARCHITECTURE_H #define LLVM_TEXTAPI_ARCHITECTURE_H +#include "llvm/Support/Compiler.h" #include #include @@ -32,24 +33,26 @@ enum Architecture : uint8_t { }; /// Convert a CPU Type and Subtype pair to an architecture slice. -Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType); +LLVM_ABI Architecture getArchitectureFromCpuType(uint32_t CPUType, + uint32_t CPUSubType); /// Convert a name to an architecture slice. -Architecture getArchitectureFromName(StringRef Name); +LLVM_ABI Architecture getArchitectureFromName(StringRef Name); /// Convert an architecture slice to a string. 
-StringRef getArchitectureName(Architecture Arch); +LLVM_ABI StringRef getArchitectureName(Architecture Arch); /// Convert an architecture slice to a CPU Type and Subtype pair. -std::pair getCPUTypeFromArchitecture(Architecture Arch); +LLVM_ABI std::pair +getCPUTypeFromArchitecture(Architecture Arch); /// Convert a target to an architecture slice. -Architecture mapToArchitecture(const llvm::Triple &Target); +LLVM_ABI Architecture mapToArchitecture(const llvm::Triple &Target); /// Check if architecture is 64 bit. -bool is64Bit(Architecture); +LLVM_ABI bool is64Bit(Architecture); -raw_ostream &operator<<(raw_ostream &OS, Architecture Arch); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, Architecture Arch); } // end namespace MachO. } // end namespace llvm. diff --git a/llvm/include/llvm/TextAPI/ArchitectureSet.h b/llvm/include/llvm/TextAPI/ArchitectureSet.h index 2cce9dbf0d80..a7d3394c9982 100644 --- a/llvm/include/llvm/TextAPI/ArchitectureSet.h +++ b/llvm/include/llvm/TextAPI/ArchitectureSet.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_ARCHITECTURESET_H #define LLVM_TEXTAPI_ARCHITECTURESET_H +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Architecture.h" #include #include @@ -38,7 +39,7 @@ public: constexpr ArchitectureSet() = default; constexpr ArchitectureSet(ArchSetType Raw) : ArchSet(Raw) {} ArchitectureSet(Architecture Arch) : ArchitectureSet() { set(Arch); } - ArchitectureSet(const std::vector &Archs); + LLVM_ABI ArchitectureSet(const std::vector &Archs); static ArchitectureSet All() { return ArchitectureSet(EndIndexVal); } @@ -61,7 +62,7 @@ public: return (ArchSet & Archs.ArchSet) == Archs.ArchSet; } - size_t count() const; + LLVM_ABI size_t count() const; bool empty() const { return ArchSet == 0; } @@ -158,9 +159,9 @@ public: const_iterator begin() const { return {&ArchSet}; } const_iterator end() const { return {&ArchSet, EndIndexVal}; } - operator std::string() const; - operator std::vector() const; - void print(raw_ostream &OS) const; + 
LLVM_ABI operator std::string() const; + LLVM_ABI operator std::vector() const; + LLVM_ABI void print(raw_ostream &OS) const; }; inline ArchitectureSet operator|(const Architecture &lhs, @@ -168,7 +169,7 @@ inline ArchitectureSet operator|(const Architecture &lhs, return ArchitectureSet(lhs) | ArchitectureSet(rhs); } -raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set); } // end namespace MachO. } // end namespace llvm. diff --git a/llvm/include/llvm/TextAPI/DylibReader.h b/llvm/include/llvm/TextAPI/DylibReader.h index 6861d3cb1591..f3a806d78df7 100644 --- a/llvm/include/llvm/TextAPI/DylibReader.h +++ b/llvm/include/llvm/TextAPI/DylibReader.h @@ -14,6 +14,7 @@ #define LLVM_TEXTAPI_DYLIBREADER_H #include "llvm/ADT/StringMap.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/TextAPI/ArchitectureSet.h" @@ -37,20 +38,21 @@ struct ParseOption { /// \param Buffer Data that points to dylib. /// \param Options Determines which attributes to extract. /// \return List of record slices. -Expected readFile(MemoryBufferRef Buffer, const ParseOption &Opt); +LLVM_ABI Expected readFile(MemoryBufferRef Buffer, + const ParseOption &Opt); /// Get TAPI file representation of binary dylib. /// /// \param Buffer Data that points to dylib. -Expected> get(MemoryBufferRef Buffer); +LLVM_ABI Expected> get(MemoryBufferRef Buffer); using SymbolToSourceLocMap = llvm::StringMap; /// Get the source location for each symbol from dylib. /// /// \param DSYM Path to DSYM file. /// \param T Requested target slice for dylib. 
-SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM, - const Target &T); +LLVM_ABI SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM, + const Target &T); } // namespace llvm::MachO::DylibReader diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h index 23c27cb0f474..747c8d0a208c 100644 --- a/llvm/include/llvm/TextAPI/InterfaceFile.h +++ b/llvm/include/llvm/TextAPI/InterfaceFile.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/FileTypes.h" #include "llvm/TextAPI/PackedVersion.h" @@ -60,7 +61,7 @@ public: StringRef getInstallName() const { return InstallName; }; - void addTarget(const Target &Target); + LLVM_ABI void addTarget(const Target &Target); template void addTargets(RangeT &&Targets) { for (const auto &Target : Targets) addTarget(Target(Target)); @@ -146,7 +147,7 @@ public: /// Set and add target. /// /// \param Target the target to add into. - void addTarget(const Target &Target); + LLVM_ABI void addTarget(const Target &Target); /// Determine if target triple slice exists in file. /// @@ -174,7 +175,7 @@ public: std::function>; using const_filtered_target_range = llvm::iterator_range; - const_filtered_target_range targets(ArchitectureSet Archs) const; + LLVM_ABI const_filtered_target_range targets(ArchitectureSet Archs) const; /// Set the install name of the library. void setInstallName(StringRef InstallName_) { @@ -241,7 +242,7 @@ public: /// Set the parent umbrella frameworks. /// \param Target_ The target applicable to Parent /// \param Parent The name of Parent - void addParentUmbrella(const Target &Target_, StringRef Parent); + LLVM_ABI void addParentUmbrella(const Target &Target_, StringRef Parent); /// Get the list of Parent Umbrella frameworks. 
/// @@ -261,7 +262,7 @@ public: /// \param InstallName The name of the client that is allowed to link this /// library. /// \param Target The target triple for which this applies. - void addAllowableClient(StringRef InstallName, const Target &Target); + LLVM_ABI void addAllowableClient(StringRef InstallName, const Target &Target); /// Get the list of allowable clients. /// @@ -274,7 +275,8 @@ public: /// /// \param InstallName The name of the library to re-export. /// \param Target The target triple for which this applies. - void addReexportedLibrary(StringRef InstallName, const Target &Target); + LLVM_ABI void addReexportedLibrary(StringRef InstallName, + const Target &Target); /// Get the list of re-exported libraries. /// @@ -286,7 +288,7 @@ public: /// Add a library for inlining to top level library. /// ///\param Document The library to inline with top level library. - void addDocument(std::shared_ptr &&Document); + LLVM_ABI void addDocument(std::shared_ptr &&Document); /// Returns the pointer to parent document if exists or nullptr otherwise. InterfaceFile *getParent() const { return Parent; } @@ -301,7 +303,7 @@ public: /// Set the runpath search paths. /// \param RPath The name of runpath. /// \param InputTarget The target applicable to runpath search path. - void addRPath(StringRef RPath, const Target &InputTarget); + LLVM_ABI void addRPath(StringRef RPath, const Target &InputTarget); /// Get the list of runpath search paths. /// @@ -373,14 +375,14 @@ public: /// /// \param Arch architecture to extract from. /// \return New InterfaceFile with extracted architecture slice. - llvm::Expected> + LLVM_ABI llvm::Expected> extract(Architecture Arch) const; /// Remove architecture slice from Interface. /// /// \param Arch architecture to remove. /// \return New Interface File with removed architecture slice. - llvm::Expected> + LLVM_ABI llvm::Expected> remove(Architecture Arch) const; /// Merge Interfaces for the same library. 
The following library attributes @@ -390,29 +392,29 @@ public: /// /// \param O The Interface to merge. /// \return New Interface File that was merged. - llvm::Expected> + LLVM_ABI llvm::Expected> merge(const InterfaceFile *O) const; /// Inline reexported library into Interface. /// /// \param Library Interface of reexported library. /// \param Overwrite Whether to overwrite preexisting inlined library. - void inlineLibrary(std::shared_ptr Library, - bool Overwrite = false); + LLVM_ABI void inlineLibrary(std::shared_ptr Library, + bool Overwrite = false); /// Set InterfaceFile properties from pre-gathered binary attributes, /// if they are not set already. /// /// \param BA Attributes typically represented in load commands. /// \param Targ MachO Target slice to add attributes to. - void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA, - const Target &Targ); + LLVM_ABI void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA, + const Target &Targ); /// The equality is determined by attributes that impact linking /// compatibilities. Path, & FileKind are irrelevant since these by /// itself should not impact linking. /// This is an expensive operation. - bool operator==(const InterfaceFile &O) const; + LLVM_ABI bool operator==(const InterfaceFile &O) const; bool operator!=(const InterfaceFile &O) const { return !(*this == O); } diff --git a/llvm/include/llvm/TextAPI/PackedVersion.h b/llvm/include/llvm/TextAPI/PackedVersion.h index e680d40c7104..cabe365e6d97 100644 --- a/llvm/include/llvm/TextAPI/PackedVersion.h +++ b/llvm/include/llvm/TextAPI/PackedVersion.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_PACKEDVERSION_H #define LLVM_TEXTAPI_PACKEDVERSION_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" #include #include @@ -53,8 +54,8 @@ public: /// Retrieve the subminor version number, if provided. 
unsigned getSubminor() const { return Version & 0xff; } - bool parse32(StringRef Str); - std::pair parse64(StringRef Str); + LLVM_ABI bool parse32(StringRef Str); + LLVM_ABI std::pair parse64(StringRef Str); bool operator<(const PackedVersion &O) const { return Version < O.Version; } @@ -64,9 +65,9 @@ public: uint32_t rawValue() const { return Version; } - operator std::string() const; + LLVM_ABI operator std::string() const; - void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const PackedVersion &Version) { diff --git a/llvm/include/llvm/TextAPI/Platform.h b/llvm/include/llvm/TextAPI/Platform.h index d828d9ac49f6..8ea187acc02f 100644 --- a/llvm/include/llvm/TextAPI/Platform.h +++ b/llvm/include/llvm/TextAPI/Platform.h @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" namespace llvm { @@ -22,14 +23,14 @@ namespace MachO { using PlatformSet = SmallSet; using PlatformVersionSet = SmallSet, 3>; -PlatformType mapToPlatformType(PlatformType Platform, bool WantSim); -PlatformType mapToPlatformType(const Triple &Target); -PlatformSet mapToPlatformSet(ArrayRef Targets); -StringRef getPlatformName(PlatformType Platform); -PlatformType getPlatformFromName(StringRef Name); -std::string getOSAndEnvironmentName(PlatformType Platform, - std::string Version = ""); -VersionTuple mapToSupportedOSVersion(const Triple &Triple); +LLVM_ABI PlatformType mapToPlatformType(PlatformType Platform, bool WantSim); +LLVM_ABI PlatformType mapToPlatformType(const Triple &Target); +LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef Targets); +LLVM_ABI StringRef getPlatformName(PlatformType Platform); +LLVM_ABI PlatformType getPlatformFromName(StringRef Name); +LLVM_ABI std::string getOSAndEnvironmentName(PlatformType Platform, + std::string Version = ""); +LLVM_ABI VersionTuple mapToSupportedOSVersion(const 
Triple &Triple); } // end namespace MachO. } // end namespace llvm. diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h index 7d721988ec3d..6e470d97325f 100644 --- a/llvm/include/llvm/TextAPI/Record.h +++ b/llvm/include/llvm/TextAPI/Record.h @@ -17,6 +17,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Symbol.h" #include @@ -104,7 +105,7 @@ public: SymbolFlags getFlags() const { return Flags; } private: - SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage); + LLVM_ABI SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage); protected: StringRef Name; @@ -164,9 +165,9 @@ public: ObjCContainerRecord(StringRef Name, RecordLinkage Linkage) : Record({Name, Linkage, SymbolFlags::Data}) {} - ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage); - ObjCIVarRecord *findObjCIVar(StringRef IVar) const; - std::vector getObjCIVars() const; + LLVM_ABI ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage); + LLVM_ABI ObjCIVarRecord *findObjCIVar(StringRef IVar) const; + LLVM_ABI std::vector getObjCIVars() const; RecordLinkage getLinkage() const { return Linkage; } private: @@ -207,11 +208,12 @@ public: return getLinkageForSymbol(CurrType) >= RecordLinkage::Rexported; } - RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const; - void updateLinkageForSymbols(ObjCIFSymbolKind SymType, RecordLinkage Link); + LLVM_ABI RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const; + LLVM_ABI void updateLinkageForSymbols(ObjCIFSymbolKind SymType, + RecordLinkage Link); - bool addObjCCategory(ObjCCategoryRecord *Record); - std::vector getObjCCategories() const; + LLVM_ABI bool addObjCCategory(ObjCCategoryRecord *Record); + LLVM_ABI std::vector getObjCCategories() const; private: /// Linkage level for each symbol represented in ObjCInterfaceRecord. 
diff --git a/llvm/include/llvm/TextAPI/RecordVisitor.h b/llvm/include/llvm/TextAPI/RecordVisitor.h index 34e43f5b0027..65bc96df244d 100644 --- a/llvm/include/llvm/TextAPI/RecordVisitor.h +++ b/llvm/include/llvm/TextAPI/RecordVisitor.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_RECORDVISITOR_H #define LLVM_TEXTAPI_RECORDVISITOR_H +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Record.h" #include "llvm/TextAPI/SymbolSet.h" @@ -20,7 +21,7 @@ namespace llvm { namespace MachO { /// Base class for any usage of traversing over collected Records. -class RecordVisitor { +class LLVM_ABI RecordVisitor { public: virtual ~RecordVisitor(); @@ -32,7 +33,7 @@ public: /// Specialized RecordVisitor for collecting exported symbols /// and undefined symbols if RecordSlice being visited represents a /// flat-namespaced library. -class SymbolConverter : public RecordVisitor { +class LLVM_ABI SymbolConverter : public RecordVisitor { public: SymbolConverter(SymbolSet *Symbols, const Target &T, const bool RecordUndefs = false) diff --git a/llvm/include/llvm/TextAPI/RecordsSlice.h b/llvm/include/llvm/TextAPI/RecordsSlice.h index f934cf7607f1..6ecb79a115ae 100644 --- a/llvm/include/llvm/TextAPI/RecordsSlice.h +++ b/llvm/include/llvm/TextAPI/RecordsSlice.h @@ -15,6 +15,7 @@ #define LLVM_TEXTAPI_RECORDSLICE_H #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/FileTypes.h" #include "llvm/TextAPI/PackedVersion.h" #include "llvm/TextAPI/Record.h" @@ -43,9 +44,10 @@ public: /// symbol. /// \param Linkage The linkage of symbol. /// \return The non-owning pointer to added record in slice. - Record *addRecord(StringRef Name, SymbolFlags Flags, - GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown, - RecordLinkage Linkage = RecordLinkage::Unknown); + LLVM_ABI Record * + addRecord(StringRef Name, SymbolFlags Flags, + GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown, + RecordLinkage Linkage = RecordLinkage::Unknown); /// Add non-ObjC global record. 
/// @@ -56,10 +58,10 @@ public: /// \param Inlined Whether declaration is inlined, only applicable to /// functions. /// \return The non-owning pointer to added record in slice. - GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage, - GlobalRecord::Kind GV, - SymbolFlags Flags = SymbolFlags::None, - bool Inlined = false); + LLVM_ABI GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage, + GlobalRecord::Kind GV, + SymbolFlags Flags = SymbolFlags::None, + bool Inlined = false); /// Add ObjC Class record. /// @@ -67,8 +69,9 @@ public: /// \param Linkage The linkage of symbol. /// \param SymType The symbols this class represents. /// \return The non-owning pointer to added record in slice. - ObjCInterfaceRecord *addObjCInterface(StringRef Name, RecordLinkage Linkage, - ObjCIFSymbolKind SymType); + LLVM_ABI ObjCInterfaceRecord *addObjCInterface(StringRef Name, + RecordLinkage Linkage, + ObjCIFSymbolKind SymType); /// Add ObjC IVar record. /// @@ -76,8 +79,8 @@ public: /// \param Name The name of ivar, not symbol. /// \param Linkage The linkage of symbol. /// \return The non-owning pointer to added record in slice. - ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container, StringRef Name, - RecordLinkage Linkage); + LLVM_ABI ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container, + StringRef Name, RecordLinkage Linkage); /// Add ObjC Category record. /// @@ -85,22 +88,22 @@ public: /// category, not symbol. /// \param Category The name of category. /// \return The non-owning pointer to added record in slice. - ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend, - StringRef Category); + LLVM_ABI ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend, + StringRef Category); /// Find ObjC Class. /// /// \param Name name of class, not full symbol name. /// \return The non-owning pointer to record in slice. 
- ObjCInterfaceRecord *findObjCInterface(StringRef Name) const; + LLVM_ABI ObjCInterfaceRecord *findObjCInterface(StringRef Name) const; /// Find ObjC Category. /// /// \param ClassToExtend The name of class, not full symbol name. /// \param Category The name of category. /// \return The non-owning pointer to record in slice. - ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend, - StringRef Category) const; + LLVM_ABI ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend, + StringRef Category) const; /// Find ObjC Container. This is commonly used for assigning for looking up /// instance variables that are assigned to either a category or class. @@ -110,21 +113,23 @@ public: /// \param Name Either the name of ivar or name of container. /// \return The non-owning pointer to record in /// slice. - ObjCContainerRecord *findContainer(bool IsIVar, StringRef Name) const; + LLVM_ABI ObjCContainerRecord *findContainer(bool IsIVar, + StringRef Name) const; /// Find ObjC instance variable. /// /// \param IsScopedName This is used to determine how to parse the name. /// \param Name Either the full name of the symbol or just the ivar. /// \return The non-owning pointer to record in slice. - ObjCIVarRecord *findObjCIVar(bool IsScopedName, StringRef Name) const; + LLVM_ABI ObjCIVarRecord *findObjCIVar(bool IsScopedName, + StringRef Name) const; /// Find non-objc global. /// /// \param Name The name of symbol. /// \param GV The Kind of global to find. /// \return The non-owning pointer to record in slice. - GlobalRecord * + LLVM_ABI GlobalRecord * findGlobal(StringRef Name, GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown) const; @@ -138,7 +143,7 @@ public: } // Visit all records known to RecordsSlice. - void visit(RecordVisitor &V) const; + LLVM_ABI void visit(RecordVisitor &V) const; struct BinaryAttrs { std::vector AllowableClients; @@ -158,11 +163,11 @@ public: }; /// Return reference to BinaryAttrs. 
- BinaryAttrs &getBinaryAttrs(); + LLVM_ABI BinaryAttrs &getBinaryAttrs(); /// Store any strings owned by RecordSlice into allocator and return back /// reference to that. - StringRef copyString(StringRef String); + LLVM_ABI StringRef copyString(StringRef String); private: const llvm::Triple TargetTriple; @@ -196,7 +201,8 @@ private: using Records = llvm::SmallVector, 4>; class InterfaceFile; -std::unique_ptr convertToInterfaceFile(const Records &Slices); +LLVM_ABI std::unique_ptr +convertToInterfaceFile(const Records &Slices); } // namespace MachO } // namespace llvm diff --git a/llvm/include/llvm/TextAPI/Symbol.h b/llvm/include/llvm/TextAPI/Symbol.h index 5a5eb0eb4832..92ff0746f799 100644 --- a/llvm/include/llvm/TextAPI/Symbol.h +++ b/llvm/include/llvm/TextAPI/Symbol.h @@ -11,6 +11,7 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/Target.h" @@ -152,14 +153,15 @@ public: std::function>; using const_filtered_target_range = llvm::iterator_range; - const_filtered_target_range targets(ArchitectureSet architectures) const; + LLVM_ABI const_filtered_target_range + targets(ArchitectureSet architectures) const; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(raw_ostream &OS) const; void dump() const { dump(llvm::errs()); } #endif - bool operator==(const Symbol &O) const; + LLVM_ABI bool operator==(const Symbol &O) const; bool operator!=(const Symbol &O) const { return !(*this == O); } @@ -189,7 +191,7 @@ struct SimpleSymbol { /// Get symbol classification by parsing the name of a symbol. /// /// \param SymName The name of symbol. -SimpleSymbol parseSymbol(StringRef SymName); +LLVM_ABI SimpleSymbol parseSymbol(StringRef SymName); } // end namespace MachO. } // end namespace llvm. 
diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h index 6ccabb907720..cd3066317f3a 100644 --- a/llvm/include/llvm/TextAPI/SymbolSet.h +++ b/llvm/include/llvm/TextAPI/SymbolSet.h @@ -15,6 +15,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Architecture.h" #include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/Symbol.h" @@ -87,12 +88,12 @@ private: using SymbolsMapType = llvm::DenseMap; SymbolsMapType Symbols; - Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags); + LLVM_ABI Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags); public: SymbolSet() = default; - Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags, - const Target &Targ); + LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags, + const Target &Targ); size_t size() const { return Symbols.size(); } template create(StringRef Target); + LLVM_ABI static llvm::Expected create(StringRef Target); - operator std::string() const; + LLVM_ABI operator std::string() const; Architecture Arch; PlatformType Platform; @@ -66,13 +67,13 @@ inline bool operator!=(const Target &LHS, const Architecture &RHS) { return LHS.Arch != RHS; } -PlatformVersionSet mapToPlatformVersionSet(ArrayRef Targets); -PlatformSet mapToPlatformSet(ArrayRef Targets); -ArchitectureSet mapToArchitectureSet(ArrayRef Targets); +LLVM_ABI PlatformVersionSet mapToPlatformVersionSet(ArrayRef Targets); +LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef Targets); +LLVM_ABI ArchitectureSet mapToArchitectureSet(ArrayRef Targets); -std::string getTargetTripleName(const Target &Targ); +LLVM_ABI std::string getTargetTripleName(const Target &Targ); -raw_ostream &operator<<(raw_ostream &OS, const Target &Target); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const Target &Target); } // namespace MachO } // namespace llvm 
diff --git a/llvm/include/llvm/TextAPI/TextAPIError.h b/llvm/include/llvm/TextAPI/TextAPIError.h index f0578654697b..7b2182edd621 100644 --- a/llvm/include/llvm/TextAPI/TextAPIError.h +++ b/llvm/include/llvm/TextAPI/TextAPIError.h @@ -14,6 +14,7 @@ #ifndef LLVM_TEXTAPI_TEXTAPIERROR_H #define LLVM_TEXTAPI_TEXTAPIERROR_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" namespace llvm::MachO { @@ -25,7 +26,7 @@ enum class TextAPIErrorCode { UnsupportedTarget }; -class TextAPIError : public llvm::ErrorInfo { +class LLVM_ABI TextAPIError : public llvm::ErrorInfo { public: static char ID; TextAPIErrorCode EC; diff --git a/llvm/include/llvm/TextAPI/TextAPIReader.h b/llvm/include/llvm/TextAPI/TextAPIReader.h index 32af0e3601f1..603b24b47283 100644 --- a/llvm/include/llvm/TextAPI/TextAPIReader.h +++ b/llvm/include/llvm/TextAPI/TextAPIReader.h @@ -9,6 +9,7 @@ #ifndef LLVM_TEXTAPI_TEXTAPIREADER_H #define LLVM_TEXTAPI_TEXTAPIREADER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" namespace llvm { @@ -29,13 +30,13 @@ public: /// /// \param InputBuffer Buffer holding contents of TAPI text file. /// \return The file format version of TAPI text file. - static Expected canRead(MemoryBufferRef InputBuffer); + LLVM_ABI static Expected canRead(MemoryBufferRef InputBuffer); /// Parse and get an InterfaceFile that represents the full /// library. /// /// \param InputBuffer Buffer holding contents of TAPI text file. 
- static Expected> + LLVM_ABI static Expected> get(MemoryBufferRef InputBuffer); TextAPIReader() = delete; diff --git a/llvm/include/llvm/TextAPI/TextAPIWriter.h b/llvm/include/llvm/TextAPI/TextAPIWriter.h index 7fd32c6fe2a9..5f06c372fe85 100644 --- a/llvm/include/llvm/TextAPI/TextAPIWriter.h +++ b/llvm/include/llvm/TextAPI/TextAPIWriter.h @@ -10,6 +10,7 @@ #define LLVM_TEXTAPI_TEXTAPIWRITER_H #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/InterfaceFile.h" namespace llvm { @@ -30,9 +31,10 @@ public: /// \param FileKind File format to write text file as. If not specified, it /// will read from File. /// \param Compact Whether to limit whitespace in text file. - static Error writeToStream(raw_ostream &OS, const InterfaceFile &File, - const FileType FileKind = FileType::Invalid, - bool Compact = false); + LLVM_ABI static Error + writeToStream(raw_ostream &OS, const InterfaceFile &File, + const FileType FileKind = FileType::Invalid, + bool Compact = false); /// Get TAPI FileType from the input string. /// diff --git a/llvm/include/llvm/TextAPI/Utils.h b/llvm/include/llvm/TextAPI/Utils.h index 00dfd63e14f9..27db717f5a63 100644 --- a/llvm/include/llvm/TextAPI/Utils.h +++ b/llvm/include/llvm/TextAPI/Utils.h @@ -14,6 +14,7 @@ #define LLVM_TEXTAPI_UTILS_H #include "llvm/ADT/Twine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" @@ -51,34 +52,35 @@ struct SymLink { /// /// \param Path Location of file. /// \param Extension File extension to update with. -void replace_extension(SmallVectorImpl &Path, const Twine &Extension); +LLVM_ABI void replace_extension(SmallVectorImpl &Path, + const Twine &Extension); /// Determine whether to skip over symlink due to either too many symlink levels /// or is cyclic. /// /// \param Path Location to symlink. /// \param Result Holds whether to skip over Path. 
-std::error_code shouldSkipSymLink(const Twine &Path, bool &Result); +LLVM_ABI std::error_code shouldSkipSymLink(const Twine &Path, bool &Result); /// Turn absolute symlink into relative. /// /// \param From The symlink. /// \param To What the symlink points to. /// \param RelativePath Path location to update what the symlink points to. -std::error_code make_relative(StringRef From, StringRef To, - SmallVectorImpl &RelativePath); +LLVM_ABI std::error_code make_relative(StringRef From, StringRef To, + SmallVectorImpl &RelativePath); /// Determine if library is private by parsing file path. /// It does not touch the file system. /// /// \param Path File path for library. /// \param IsSymLink Whether path points to a symlink. -bool isPrivateLibrary(StringRef Path, bool IsSymLink = false); +LLVM_ABI bool isPrivateLibrary(StringRef Path, bool IsSymLink = false); /// Create a regex rule from provided glob string. /// \param Glob String that represents glob input. /// \return The equivalent regex rule. -llvm::Expected createRegexFromGlob(llvm::StringRef Glob); +LLVM_ABI llvm::Expected createRegexFromGlob(llvm::StringRef Glob); using AliasEntry = std::pair; using AliasMap = std::map; @@ -87,14 +89,15 @@ using AliasMap = std::map; /// /// \param Buffer Data contents of file for the alias list. /// \return Lookup table of alias to their base symbol. -Expected parseAliasList(std::unique_ptr &Buffer); +LLVM_ABI Expected +parseAliasList(std::unique_ptr &Buffer); /// Pickup active paths for a given platform. /// /// \param Paths File or search paths to pick up. /// \param Platform Platform to collect paths for. 
-PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths, - PlatformType Platform); +LLVM_ABI PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths, + PlatformType Platform); } // namespace llvm::MachO #endif // LLVM_TEXTAPI_UTILS_H From 78765bb856bd6cdc3b1db48e80f74b8de5181f3f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Jun 2025 17:23:04 +0100 Subject: [PATCH 0008/1322] [TableGen] Simplify computeUberWeights. NFC. (#143716) Using RegUnitIterator made the code more complicated than having two nested loops over each register and each register's regunits. --- .../TableGen/Common/CodeGenRegisters.cpp | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 5ec9b35379fa..4d24eb3de1ed 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -1849,26 +1849,21 @@ static void computeUberWeights(MutableArrayRef UberSets, // Skip the first unallocatable set. for (UberRegSet &S : UberSets.drop_front()) { // Initialize all unit weights in this set, and remember the max units/reg. 
- const CodeGenRegister *Reg = nullptr; - unsigned MaxWeight = 0, Weight = 0; - for (RegUnitIterator UnitI(S.Regs); UnitI.isValid(); ++UnitI) { - if (Reg != UnitI.getReg()) { - if (Weight > MaxWeight) - MaxWeight = Weight; - Reg = UnitI.getReg(); - Weight = 0; - } - if (!RegBank.getRegUnit(*UnitI).Artificial) { - unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight; - if (!UWeight) { - UWeight = 1; - RegBank.increaseRegUnitWeight(*UnitI, UWeight); + unsigned MaxWeight = 0; + for (const CodeGenRegister *R : S.Regs) { + unsigned Weight = 0; + for (unsigned U : R->getRegUnits()) { + if (!RegBank.getRegUnit(U).Artificial) { + unsigned UWeight = RegBank.getRegUnit(U).Weight; + if (!UWeight) { + UWeight = 1; + RegBank.increaseRegUnitWeight(U, UWeight); + } + Weight += UWeight; } - Weight += UWeight; } + MaxWeight = std::max(MaxWeight, Weight); } - if (Weight > MaxWeight) - MaxWeight = Weight; if (S.Weight != MaxWeight) { LLVM_DEBUG({ dbgs() << "UberSet " << &S - UberSets.begin() << " Weight " From 8e4f0d8614dcd48cfe2d885a021e2927c1bc8616 Mon Sep 17 00:00:00 2001 From: Morris Hafner Date: Wed, 11 Jun 2025 18:24:46 +0200 Subject: [PATCH 0009/1322] [CIR] Upstream minimal builtin function call support (#142981) This patch adds all bits required to implement builtin function calls to ClangIR. It doesn't actually implement any of the builtins except those that fold to a constant ahead of CodeGen (`__builtin_is_constant_evaluated()` being one example). 
--- clang/include/clang/CIR/MissingFeatures.h | 3 +- clang/lib/CIR/CodeGen/CIRGenBuilder.cpp | 28 ++++++++ clang/lib/CIR/CodeGen/CIRGenBuilder.h | 11 ++++ clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 55 ++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenCall.h | 30 ++++++++- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 53 +++++++++++++-- clang/lib/CIR/CodeGen/CIRGenFunction.h | 5 ++ clang/lib/CIR/CodeGen/CMakeLists.txt | 1 + clang/test/CIR/CodeGen/builtin_call.cpp | 78 +++++++++++++++++++++++ 9 files changed, 255 insertions(+), 9 deletions(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp create mode 100644 clang/test/CIR/CodeGen/builtin_call.cpp diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index f89d386378e5..87908e2ec08a 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -83,7 +83,6 @@ struct MissingFeatures { static bool opFuncSetComdat() { return false; } // CallOp handling - static bool opCallBuiltinFunc() { return false; } static bool opCallPseudoDtor() { return false; } static bool opCallAggregateArgs() { return false; } static bool opCallPaddingArgs() { return false; } @@ -225,6 +224,8 @@ struct MissingFeatures { static bool isMemcpyEquivalentSpecialMember() { return false; } static bool isTrivialCtorOrDtor() { return false; } static bool implicitConstructorArgs() { return false; } + static bool intrinsics() { return false; } + static bool attributeNoBuiltin() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp index 4c8c6ed289c3..9cec17bcb2fd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp @@ -39,6 +39,34 @@ mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin, return create(arrayLocEnd, flatPtrTy, basePtr, idx); } +cir::ConstantOp 
CIRGenBuilderTy::getConstInt(mlir::Location loc, + llvm::APSInt intVal) { + bool isSigned = intVal.isSigned(); + unsigned width = intVal.getBitWidth(); + cir::IntType t = isSigned ? getSIntNTy(width) : getUIntNTy(width); + return getConstInt(loc, t, + isSigned ? intVal.getSExtValue() : intVal.getZExtValue()); +} + +cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, + llvm::APInt intVal) { + return getConstInt(loc, llvm::APSInt(intVal)); +} + +cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, mlir::Type t, + uint64_t c) { + assert(mlir::isa(t) && "expected cir::IntType"); + return create(loc, cir::IntAttr::get(t, c)); +} + +cir::ConstantOp +clang::CIRGen::CIRGenBuilderTy::getConstFP(mlir::Location loc, mlir::Type t, + llvm::APFloat fpVal) { + assert(mlir::isa(t) && + "expected floating point type"); + return create(loc, getAttr(t, fpVal)); +} + // This can't be defined in Address.h because that file is included by // CIRGenBuilder.h Address Address::withElementType(CIRGenBuilderTy &builder, diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 03077ee062a6..fb1a290c18fa 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -11,10 +11,12 @@ #include "Address.h" #include "CIRGenTypeCache.h" +#include "clang/CIR/Interfaces/CIRFPTypeInterface.h" #include "clang/CIR/MissingFeatures.h" #include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h" #include "clang/CIR/MissingFeatures.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" namespace clang::CIRGen { @@ -229,6 +231,15 @@ public: cir::IntType getUInt32Ty() { return typeCache.UInt32Ty; } cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; } + cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal); + + cir::ConstantOp getConstInt(mlir::Location loc, llvm::APInt intVal); + + cir::ConstantOp getConstInt(mlir::Location loc, mlir::Type t, uint64_t c); + + cir::ConstantOp 
getConstFP(mlir::Location loc, mlir::Type t, + llvm::APFloat fpVal); + bool isInt8Ty(mlir::Type i) { return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp new file mode 100644 index 000000000000..c59ac78210f8 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit Builtin calls as CIR or a function call to be +// later resolved. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenCall.h" +#include "CIRGenFunction.h" +#include "CIRGenModule.h" +#include "CIRGenValue.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LLVM.h" +#include "clang/AST/Expr.h" +#include "clang/AST/GlobalDecl.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace clang; +using namespace clang::CIRGen; + +RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, + const CallExpr *e, + ReturnValueSlot returnValue) { + // See if we can constant fold this builtin. If so, don't emit it at all. + // TODO: Extend this handling to all builtin calls that we can constant-fold. + Expr::EvalResult result; + if (e->isPRValue() && e->EvaluateAsRValue(result, cgm.getASTContext()) && + !result.hasSideEffects()) { + if (result.Val.isInt()) { + return RValue::get(builder.getConstInt(getLoc(e->getSourceRange()), + result.Val.getInt())); + } + if (result.Val.isFloat()) { + // Note: we are using result type of CallExpr to determine the type of + // the constant. 
Classic codegen uses the result value to determine the + // type. We feel it should be Ok to use expression type because it is + // hard to imagine a builtin function evaluates to a value that + // over/underflows its own defined type. + mlir::Type type = convertType(e->getType()); + return RValue::get(builder.getConstFP(getLoc(e->getExprLoc()), type, + result.Val.getFloat())); + } + } + + mlir::Location loc = getLoc(e->getExprLoc()); + cgm.errorNYI(loc, "non constant foldable builtin calls"); + return getUndefRValue(e->getType()); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h index 605625705a75..15c9080448c8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.h +++ b/clang/lib/CIR/CodeGen/CIRGenCall.h @@ -44,16 +44,25 @@ public: class CIRGenCallee { enum class SpecialKind : uintptr_t { Invalid, + Builtin, - Last = Invalid, + Last = Builtin, + }; + + struct BuiltinInfoStorage { + const clang::FunctionDecl *decl; + unsigned id; }; SpecialKind kindOrFunctionPtr; union { CIRGenCalleeInfo abstractInfo; + BuiltinInfoStorage builtinInfo; }; + explicit CIRGenCallee(SpecialKind kind) : kindOrFunctionPtr(kind) {} + public: CIRGenCallee() : kindOrFunctionPtr(SpecialKind::Invalid) {} @@ -69,6 +78,25 @@ public: return CIRGenCallee(abstractInfo, funcPtr); } + bool isBuiltin() const { return kindOrFunctionPtr == SpecialKind::Builtin; } + + const clang::FunctionDecl *getBuiltinDecl() const { + assert(isBuiltin()); + return builtinInfo.decl; + } + unsigned getBuiltinID() const { + assert(isBuiltin()); + return builtinInfo.id; + } + + static CIRGenCallee forBuiltin(unsigned builtinID, + const clang::FunctionDecl *builtinDecl) { + CIRGenCallee result(SpecialKind::Builtin); + result.builtinInfo.decl = builtinDecl; + result.builtinInfo.id = builtinID; + return result; + } + bool isOrdinary() const { return uintptr_t(kindOrFunctionPtr) > uintptr_t(SpecialKind::Last); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp 
b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index f2c2de7a4f59..f1f86509c9a9 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1029,8 +1029,48 @@ static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) { return cgm.getAddrOfFunction(gd); } -static CIRGenCallee emitDirectCallee(CIRGenModule &cgm, GlobalDecl gd) { - assert(!cir::MissingFeatures::opCallBuiltinFunc()); +// Detect the unusual situation where an inline version is shadowed by a +// non-inline version. In that case we should pick the external one +// everywhere. That's GCC behavior too. +static bool onlyHasInlineBuiltinDeclaration(const FunctionDecl *fd) { + for (const FunctionDecl *pd = fd; pd; pd = pd->getPreviousDecl()) + if (!pd->isInlineBuiltinDeclaration()) + return false; + return true; +} + +CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { + const auto *fd = cast(gd.getDecl()); + + if (unsigned builtinID = fd->getBuiltinID()) { + if (fd->getAttr()) { + cgm.errorNYI("AsmLabelAttr"); + } + + StringRef ident = fd->getName(); + std::string fdInlineName = (ident + ".inline").str(); + + bool isPredefinedLibFunction = + cgm.getASTContext().BuiltinInfo.isPredefinedLibFunction(builtinID); + bool hasAttributeNoBuiltin = false; + assert(!cir::MissingFeatures::attributeNoBuiltin()); + + // When directing calling an inline builtin, call it through it's mangled + // name to make it clear it's not the actual builtin. + auto fn = cast(curFn); + if (fn.getName() != fdInlineName && onlyHasInlineBuiltinDeclaration(fd)) { + cgm.errorNYI("Inline only builtin function calls"); + } + + // Replaceable builtins provide their own implementation of a builtin. If we + // are in an inline builtin implementation, avoid trivial infinite + // recursion. 
Honor __attribute__((no_builtin("foo"))) or + // __attribute__((no_builtin)) on the current function unless foo is + // not a predefined library function which means we must generate the + // builtin no matter what. + else if (!isPredefinedLibFunction || !hasAttributeNoBuiltin) + return CIRGenCallee::forBuiltin(builtinID, fd); + } cir::FuncOp callee = emitFunctionDeclPointer(cgm, gd); @@ -1106,7 +1146,7 @@ CIRGenCallee CIRGenFunction::emitCallee(const clang::Expr *e) { } else if (const auto *declRef = dyn_cast(e)) { // Resolve direct calls. const auto *funcDecl = cast(declRef->getDecl()); - return emitDirectCallee(cgm, funcDecl); + return emitDirectCallee(funcDecl); } else if (isa(e)) { cgm.errorNYI(e->getSourceRange(), "emitCallee: call to member function is NYI"); @@ -1162,10 +1202,9 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *e, CIRGenCallee callee = emitCallee(e->getCallee()); - if (e->getBuiltinCallee()) { - cgm.errorNYI(e->getSourceRange(), "call to builtin functions"); - } - assert(!cir::MissingFeatures::opCallBuiltinFunc()); + if (callee.isBuiltin()) + return emitBuiltinExpr(callee.getBuiltinDecl(), callee.getBuiltinID(), e, + returnValue); if (isa(e->getCallee())) { cgm.errorNYI(e->getSourceRange(), "call to pseudo destructor"); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 7db7f6928fd8..b08dd540e628 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -665,6 +665,8 @@ private: void emitAndUpdateRetAlloca(clang::QualType type, mlir::Location loc, clang::CharUnits alignment); + CIRGenCallee emitDirectCallee(const GlobalDecl &gd); + public: Address emitAddrOfFieldStorage(Address base, const FieldDecl *field, llvm::StringRef fieldName, @@ -711,6 +713,9 @@ public: mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s); + RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID, + const clang::CallExpr *e, ReturnValueSlot 
returnValue); + RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 8bfcd2773d07..beaa9afb31f9 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -13,6 +13,7 @@ add_clang_library(clangCIR CIRGenClass.cpp CIRGenCXXABI.cpp CIRGenCXXExpr.cpp + CIRGenBuiltin.cpp CIRGenDecl.cpp CIRGenDeclOpenACC.cpp CIRGenExpr.cpp diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp new file mode 100644 index 000000000000..2706ea7f8f85 --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin_call.cpp @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +constexpr extern int cx_var = __builtin_is_constant_evaluated(); + +// CIR: cir.global {{.*}} @cx_var = #cir.int<1> : !s32i +// LLVM: @cx_var = {{.*}} i32 1 +// OGCG: @cx_var = {{.*}} i32 1 + +constexpr extern float cx_var_single = __builtin_huge_valf(); + +// CIR: cir.global {{.*}} @cx_var_single = #cir.fp<0x7F800000> : !cir.float +// LLVM: @cx_var_single = {{.*}} float 0x7FF0000000000000 +// OGCG: @cx_var_single = {{.*}} float 0x7FF0000000000000 + +constexpr extern long double cx_var_ld = __builtin_huge_vall(); + +// CIR: cir.global {{.*}} @cx_var_ld = #cir.fp<0x7FFF8000000000000000> : !cir.long_double +// LLVM: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000 +// 
OGCG: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000 + +int is_constant_evaluated() { + return __builtin_is_constant_evaluated(); +} + +// CIR: cir.func @_Z21is_constant_evaluatedv() -> !s32i +// CIR: %[[ZERO:.+]] = cir.const #cir.int<0> + +// LLVM: define {{.*}}i32 @_Z21is_constant_evaluatedv() +// LLVM: %[[MEM:.+]] = alloca i32 +// LLVM: store i32 0, ptr %[[MEM]] +// LLVM: %[[RETVAL:.+]] = load i32, ptr %[[MEM]] +// LLVM: ret i32 %[[RETVAL]] +// LLVM: } + +// OGCG: define {{.*}}i32 @_Z21is_constant_evaluatedv() +// OGCG: ret i32 0 +// OGCG: } + +long double constant_fp_builtin_ld() { + return __builtin_fabsl(-0.1L); +} + +// CIR: cir.func @_Z22constant_fp_builtin_ldv() -> !cir.long_double +// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.long_double + +// LLVM: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv() +// LLVM: %[[MEM:.+]] = alloca x86_fp80 +// LLVM: store x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD, ptr %[[MEM]] +// LLVM: %[[RETVAL:.+]] = load x86_fp80, ptr %[[MEM]] +// LLVM: ret x86_fp80 %[[RETVAL]] +// LLVM: } + +// OGCG: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv() +// OGCG: ret x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD +// OGCG: } + +float constant_fp_builtin_single() { + return __builtin_fabsf(-0.1f); +} + +// CIR: cir.func @_Z26constant_fp_builtin_singlev() -> !cir.float +// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.float + +// LLVM: define {{.*}}float @_Z26constant_fp_builtin_singlev() +// LLVM: %[[MEM:.+]] = alloca float +// LLVM: store float 0x3FB99999A0000000, ptr %[[MEM]] +// LLVM: %[[RETVAL:.+]] = load float, ptr %[[MEM]] +// LLVM: ret float %[[RETVAL]] +// LLVM: } + +// OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev() +// OGCG: ret float 0x3FB99999A0000000 +// OGCG: } From ec8d68b59f82423e5a6bf452e33ee8c5f64b0edc Mon Sep 17 00:00:00 2001 From: vabridgers <58314289+vabridgers@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:25:24 -0500 Subject: [PATCH 0010/1322] [clang][analyzer] Correct SMT Layer for 
_BitInt cases refutations (#143310) Since _BitInt was added later, ASTContext did not comprehend getting a type by bitwidth that's not a power of 2, and the SMT layer also did not comprehend this. This led to unexpected crashes using Z3 refutation during randomized testing. The assertion and the redacted, summarized crash stack are shown here. clang: ../../clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h:103: static llvm::SMTExprRef clang::ento::SMTConv::fromBinOp(llvm::SMTSolverRef &, const llvm::SMTExprRef &, const BinaryOperator::Opcode, const llvm::SMTExprRef &, bool): Assertion `*Solver->getSort(LHS) == *Solver->getSort(RHS) && "AST's must have the same sort!"' failed. ...
clang::ento::SMTConv::fromBinOp(std::shared_ptr&, llvm::SMTExpr const* const&, clang::BinaryOperatorKind, llvm::SMTExpr const* const&, bool) SMTConstraintManager.cpp clang::ASTContext&, llvm::SMTExpr const* const&, clang::QualType, clang::BinaryOperatorKind, llvm::SMTExpr const* const&, clang::QualType, clang::QualType*) SMTConstraintManager.cpp clang::ASTContext&, clang::ento::SymExpr const*, llvm::APSInt const&, llvm::APSInt const&, bool) SMTConstraintManager.cpp clang::ento::ExplodedNode const*, clang::ento::PathSensitiveBugReport&) --------- Co-authored-by: Vince Bridgers --- .../Core/PathSensitive/SMTConv.h | 28 ++++++++++++++----- clang/test/Analysis/bitint-z3.c | 22 +++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 clang/test/Analysis/bitint-z3.c diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h index 580b49a38dc7..70a7953918ac 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h @@ -18,6 +18,8 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h" #include "llvm/Support/SMTAPI.h" +#include + namespace clang { namespace ento { @@ -570,23 +572,35 @@ public: // TODO: Refactor to put elsewhere static inline QualType getAPSIntType(ASTContext &Ctx, const llvm::APSInt &Int) { - return Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned()); + const QualType Ty = + Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned()); + if (!Ty.isNull()) + return Ty; + // If Ty is Null, could be because the original type was a _BitInt. + // Get the size of the _BitInt type (expressed in bits) and round it up to + // the next power of 2 that is at least the bit size of 'char' (usually 8). 
+ unsigned CharTypeSize = Ctx.getTypeSize(Ctx.CharTy); + unsigned Pow2DestWidth = + std::max(llvm::bit_ceil(Int.getBitWidth()), CharTypeSize); + return Ctx.getIntTypeForBitwidth(Pow2DestWidth, Int.isSigned()); } // Get the QualTy for the input APSInt, and fix it if it has a bitwidth of 1. static inline std::pair fixAPSInt(ASTContext &Ctx, const llvm::APSInt &Int) { llvm::APSInt NewInt; + unsigned APSIntBitwidth = Int.getBitWidth(); + QualType Ty = getAPSIntType(Ctx, Int); // FIXME: This should be a cast from a 1-bit integer type to a boolean type, // but the former is not available in Clang. Instead, extend the APSInt // directly. - if (Int.getBitWidth() == 1 && getAPSIntType(Ctx, Int).isNull()) { - NewInt = Int.extend(Ctx.getTypeSize(Ctx.BoolTy)); - } else - NewInt = Int; - - return std::make_pair(NewInt, getAPSIntType(Ctx, NewInt)); + if (APSIntBitwidth == 1 && Ty.isNull()) + return {Int.extend(Ctx.getTypeSize(Ctx.BoolTy)), + getAPSIntType(Ctx, NewInt)}; + if (llvm::isPowerOf2_32(APSIntBitwidth) || Ty.isNull()) + return {Int, Ty}; + return {Int.extend(Ctx.getTypeSize(Ty)), Ty}; } // Perform implicit type conversion on binary symbolic expressions. diff --git a/clang/test/Analysis/bitint-z3.c b/clang/test/Analysis/bitint-z3.c new file mode 100644 index 000000000000..4cb97f9de829 --- /dev/null +++ b/clang/test/Analysis/bitint-z3.c @@ -0,0 +1,22 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -w \ +// RUN: -analyzer-config crosscheck-with-z3=true -verify %s +// REQUIRES: z3 + +// Previously these tests were crashing because the SMTConv layer did not +// comprehend the _BitInt types. 
+ +void clang_analyzer_warnIfReached(); + +void c(int b, _BitInt(35) a) { + int d = 0; + if (a) + b = d; + clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}} +} + +void f(int *d, _BitInt(3) e) { + int g; + d = &g; + e ?: 0; + clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}} +} From fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2 Mon Sep 17 00:00:00 2001 From: Rolf Morel <854835+rolfmorel@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:33:55 +0100 Subject: [PATCH 0011/1322] [MLIR][Transform] apply_registered_pass op's options as a dict (#143159) Improve ApplyRegisteredPassOp's support for taking options by taking them as a dict (vs a list of string-valued key-value pairs). Values of options are provided as either static attributes or as params (which pass in attributes at interpreter runtime). In either case, the keys and value attributes are converted to strings and a single options-string, in the format used on the commandline, is constructed to pass to the `addToPipeline`-pass API. 
--- .../mlir/Dialect/Transform/IR/CMakeLists.txt | 4 + .../Dialect/Transform/IR/TransformAttrs.h | 3 + .../Dialect/Transform/IR/TransformAttrs.td | 19 ++ .../Dialect/Transform/IR/TransformDialect.td | 1 + .../mlir/Dialect/Transform/IR/TransformOps.td | 23 +- .../Dialect/Transform/IR/TransformDialect.cpp | 9 + .../lib/Dialect/Transform/IR/TransformOps.cpp | 219 +++++++++++------- .../mlir/dialects/transform/__init__.py | 82 ++++++- .../Transform/test-pass-application.mlir | 169 ++++++++++++-- mlir/test/python/dialects/transform.py | 52 +++++ 10 files changed, 467 insertions(+), 114 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt index df5af7ae710d..9acab9228f10 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt @@ -20,6 +20,10 @@ mlir_tablegen(TransformDialectEnums.h.inc -gen-enum-decls) mlir_tablegen(TransformDialectEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(MLIRTransformDialectEnumIncGen) add_dependencies(mlir-headers MLIRTransformDialectEnumIncGen) +mlir_tablegen(TransformAttrs.h.inc -gen-attrdef-decls) +mlir_tablegen(TransformAttrs.cpp.inc -gen-attrdef-defs) +add_public_tablegen_target(MLIRTransformDialectAttributesIncGen) +add_dependencies(mlir-headers MLIRTransformDialectAttributesIncGen) add_mlir_dialect(TransformOps transform) add_mlir_doc(TransformOps TransformOps Dialects/ -gen-op-doc -dialect=transform) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h index 3cb935003b4c..379af932ca48 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h @@ -17,4 +17,7 @@ #include "mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/Transform/IR/TransformAttrs.h.inc" + #endif // 
MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS_H diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td index ebad2994880e..e67a9444c24a 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td @@ -10,6 +10,14 @@ #define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS include "mlir/IR/EnumAttr.td" +include "mlir/Dialect/Transform/IR/TransformDialect.td" + +class Transform_Attr traits = [], + string baseCppClass = "::mlir::Attribute"> + : AttrDef { + let mnemonic = attrMnemonic; +} def PropagateFailuresCase : I32EnumAttrCase<"Propagate", 1, "propagate">; def SuppressFailuresCase : I32EnumAttrCase<"Suppress", 2, "suppress">; @@ -33,4 +41,15 @@ def MatchCmpIPredicateAttr : I32EnumAttr< let cppNamespace = "::mlir::transform"; } +def ParamOperandAttr : Transform_Attr<"ParamOperand", "param_operand"> { + let description = [{ + Used to refer to a specific param-operand (via its index) from within an + attribute on a transform operation. + }]; + let parameters = (ins + "IntegerAttr":$index + ); + let assemblyFormat = "`<` `index` `=` $index `>`"; +} + #endif // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td index d03049e186f9..c7ea5ade72ac 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td @@ -19,6 +19,7 @@ def Transform_Dialect : Dialect { let cppNamespace = "::mlir::transform"; let hasOperationAttrVerify = 1; + let useDefaultAttributePrinterParser = 1; let extraClassDeclaration = [{ /// Symbol name for the default entry point "named sequence". 
constexpr const static ::llvm::StringLiteral diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index e864a65f8cea..f75ba27e58e7 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -405,10 +405,23 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", let description = [{ This transform applies the specified pass or pass pipeline to the targeted ops. The name of the pass/pipeline is specified as a string attribute, as - set during pass/pipeline registration. Optionally, pass options may be - specified as (space-separated) string attributes with the option to pass - these attributes via params. The pass options syntax is identical to the one - used with "mlir-opt". + set during pass/pipeline registration. + + Optionally, pass options may be specified via a DictionaryAttr. This + dictionary is converted to a string -- formatted `key=value ...` -- which + is expected to be in the exact format used by the pass on the commandline. + Values are either attributes or (SSA-values of) Transform Dialect params. + For example: + + ```mlir + transform.apply_registered_pass "canonicalize" + with options = { "top-down" = false, + "max-iterations" = %max_iter, + "test-convergence" = true, + "max-num-rewrites" = %max_rewrites } + to %module + : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + ``` This op first looks for a pass pipeline with the specified name. If no such pipeline exists, it looks for a pass with the specified name. 
If no such @@ -422,7 +435,7 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", }]; let arguments = (ins StrAttr:$pass_name, - DefaultValuedAttr:$options, + DefaultValuedAttr:$options, Variadic:$dynamic_options, TransformHandleTypeInterface:$target); let results = (outs TransformHandleTypeInterface:$result); diff --git a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp index 497ceb19f1a2..4a95fe7459e8 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp @@ -8,17 +8,22 @@ #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Analysis/CallGraph.h" +#include "mlir/Dialect/Transform/IR/TransformAttrs.h" #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/IR/TransformTypes.h" #include "mlir/Dialect/Transform/IR/Utils.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/TypeSwitch.h" using namespace mlir; #include "mlir/Dialect/Transform/IR/TransformDialect.cpp.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc" + #ifndef NDEBUG void transform::detail::checkImplementsTransformOpInterface( StringRef name, MLIRContext *context) { @@ -66,6 +71,10 @@ void transform::TransformDialect::initialize() { #include "mlir/Dialect/Transform/IR/TransformOps.cpp.inc" >(); initializeTypes(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc" + >(); initializeLibraryModule(); } diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index a0f9518e3d12..582d082153be 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -54,10 +54,11 @@ using namespace mlir; static ParseResult 
parseApplyRegisteredPassOptions( - OpAsmParser &parser, ArrayAttr &options, + OpAsmParser &parser, DictionaryAttr &options, SmallVectorImpl &dynamicOptions); static void printApplyRegisteredPassOptions(OpAsmPrinter &printer, - Operation *op, ArrayAttr options, + Operation *op, + DictionaryAttr options, ValueRange dynamicOptions); static ParseResult parseSequenceOpOperands( OpAsmParser &parser, std::optional &root, @@ -784,41 +785,50 @@ DiagnosedSilenceableFailure transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter, transform::TransformResults &results, transform::TransformState &state) { - // Obtain a single options-string from options passed statically as - // string attributes as well as "dynamically" through params. - std::string options; - OperandRange dynamicOptions = getDynamicOptions(); - size_t dynamicOptionsIdx = 0; - for (auto [idx, optionAttr] : llvm::enumerate(getOptions())) { - if (idx > 0) - options += " "; // Interleave options seperator. + // Obtain a single options-string to pass to the pass(-pipeline) from options + // passed in as a dictionary of keys mapping to values which are either + // attributes or param-operands pointing to attributes. - if (auto strAttr = dyn_cast(optionAttr)) { - options += strAttr.getValue(); - } else if (isa(optionAttr)) { - assert(dynamicOptionsIdx < dynamicOptions.size() && + std::string options; + llvm::raw_string_ostream optionsStream(options); // For "printing" attrs. + + OperandRange dynamicOptions = getDynamicOptions(); + for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) { + if (idx > 0) + optionsStream << " "; // Interleave options separator. + optionsStream << namedAttribute.getName().str(); // Append the key. + optionsStream << "="; // And the key-value separator. + + Attribute valueAttrToAppend; + if (auto paramOperandIndex = + dyn_cast(namedAttribute.getValue())) { + // The corresponding value attribute is passed in via a param. 
+ // Obtain the param-operand via its specified index. + size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt(); + assert(dynamicOptionIdx < dynamicOptions.size() && "number of dynamic option markers (UnitAttr) in options ArrayAttr " "should be the same as the number of options passed as params"); ArrayRef dynamicOption = - state.getParams(dynamicOptions[dynamicOptionsIdx++]); + state.getParams(dynamicOptions[dynamicOptionIdx]); if (dynamicOption.size() != 1) - return emitSilenceableError() << "options passed as a param must have " - "a single value associated, param " - << dynamicOptionsIdx - 1 << " associates " - << dynamicOption.size(); - - if (auto dynamicOptionStr = dyn_cast(dynamicOption[0])) { - options += dynamicOptionStr.getValue(); - } else { return emitSilenceableError() - << "options passed as a param must be a string, got " - << dynamicOption[0]; - } + << "options passed as a param must have " + "a single value associated, param " + << dynamicOptionIdx << " associates " << dynamicOption.size(); + valueAttrToAppend = dynamicOption[0]; } else { - llvm_unreachable( - "expected options element to be either StringAttr or UnitAttr"); + // Value is a static attribute. + valueAttrToAppend = namedAttribute.getValue(); + } + + // Append string representation of value attribute. + if (auto strAttr = dyn_cast(valueAttrToAppend)) { + optionsStream << strAttr.getValue().str(); + } else { + valueAttrToAppend.print(optionsStream, /*elideType=*/true); } } + optionsStream.flush(); // Get pass or pass pipeline from registry. 
const PassRegistryEntry *info = PassPipelineInfo::lookup(getPassName()); @@ -864,84 +874,121 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter, } static ParseResult parseApplyRegisteredPassOptions( - OpAsmParser &parser, ArrayAttr &options, + OpAsmParser &parser, DictionaryAttr &options, SmallVectorImpl &dynamicOptions) { - auto dynamicOptionMarker = UnitAttr::get(parser.getContext()); - SmallVector optionsArray; + // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax. + SmallVector keyValuePairs; - auto parseOperandOrString = [&]() -> OptionalParseResult { - OpAsmParser::UnresolvedOperand operand; - OptionalParseResult parsedOperand = parser.parseOptionalOperand(operand); - if (parsedOperand.has_value()) { - if (failed(parsedOperand.value())) - return failure(); + size_t dynamicOptionsIdx = 0; + auto parseKeyValuePair = [&]() -> ParseResult { + // Parse items of the form `key = value` where `key` is a bare identifier or + // a string and `value` is either an attribute or an operand. + std::string key; + Attribute valueAttr; + if (parser.parseOptionalKeywordOrString(&key)) + return parser.emitError(parser.getCurrentLocation()) + << "expected key to either be an identifier or a string"; + if (key.empty()) + return failure(); + + if (parser.parseEqual()) + return parser.emitError(parser.getCurrentLocation()) + << "expected '=' after key in key-value pair"; + + // Parse the value, which can be either an attribute or an operand. + OptionalParseResult parsedValueAttr = + parser.parseOptionalAttribute(valueAttr); + if (!parsedValueAttr.has_value()) { + OpAsmParser::UnresolvedOperand operand; + ParseResult parsedOperand = parser.parseOperand(operand); + if (failed(parsedOperand)) + return parser.emitError(parser.getCurrentLocation()) + << "expected a valid attribute or operand as value associated " + << "to key '" << key << "'"; + // To make use of the operand, we need to store it in the options dict. 
+ // As SSA-values cannot occur in attributes, what we do instead is store + // an attribute in its place that contains the index of the param-operand, + // so that an attr-value associated to the param can be resolved later on. dynamicOptions.push_back(operand); - optionsArray.push_back( - dynamicOptionMarker); // Placeholder for knowing where to - // inject the dynamic option-as-param. - return success(); + auto wrappedIndex = IntegerAttr::get( + IntegerType::get(parser.getContext(), 64), dynamicOptionsIdx++); + valueAttr = + transform::ParamOperandAttr::get(parser.getContext(), wrappedIndex); + } else if (failed(parsedValueAttr.value())) { + return failure(); // NB: Attempted parse should have output error message. + } else if (isa(valueAttr)) { + return parser.emitError(parser.getCurrentLocation()) + << "the param_operand attribute is a marker reserved for " + << "indicating a value will be passed via params and is only used " + << "in the generic print format"; } - StringAttr stringAttr; - OptionalParseResult parsedStringAttr = - parser.parseOptionalAttribute(stringAttr); - if (parsedStringAttr.has_value()) { - if (failed(parsedStringAttr.value())) - return failure(); - optionsArray.push_back(stringAttr); - return success(); - } - - return std::nullopt; + keyValuePairs.push_back(NamedAttribute(key, valueAttr)); + return success(); }; - OptionalParseResult parsedOptionsElement = parseOperandOrString(); - while (parsedOptionsElement.has_value()) { - if (failed(parsedOptionsElement.value())) - return failure(); - parsedOptionsElement = parseOperandOrString(); - } + if (parser.parseCommaSeparatedList(AsmParser::Delimiter::Braces, + parseKeyValuePair, + " in options dictionary")) + return failure(); // NB: Attempted parse should have output error message. - if (optionsArray.empty()) { + if (DictionaryAttr::findDuplicate( + keyValuePairs, /*isSorted=*/false) // Also sorts the keyValuePairs. 
+ .has_value()) return parser.emitError(parser.getCurrentLocation()) - << "expected at least one option (either a string or a param)"; - } - options = parser.getBuilder().getArrayAttr(optionsArray); + << "duplicate keys found in options dictionary"; + + options = DictionaryAttr::getWithSorted(parser.getContext(), keyValuePairs); + return success(); } static void printApplyRegisteredPassOptions(OpAsmPrinter &printer, - Operation *op, ArrayAttr options, + Operation *op, + DictionaryAttr options, ValueRange dynamicOptions) { - size_t currentDynamicOptionIdx = 0; - for (auto [idx, optionAttr] : llvm::enumerate(options)) { - if (idx > 0) - printer << " "; // Interleave options separator. + if (options.empty()) + return; - if (isa(optionAttr)) - printer.printOperand(dynamicOptions[currentDynamicOptionIdx++]); - else if (auto strAttr = dyn_cast(optionAttr)) - printer.printAttribute(strAttr); - else - llvm_unreachable("each option should be either a StringAttr or UnitAttr"); - } + printer << "{"; + llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) { + printer << namedAttribute.getName() << " = "; + Attribute value = namedAttribute.getValue(); + if (auto indexAttr = dyn_cast(value)) { + // Resolve index of param-operand to its actual SSA-value and print that. + printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]); + } else { + printer.printAttribute(value); + } + }); + printer << "}"; } LogicalResult transform::ApplyRegisteredPassOp::verify() { - size_t numUnitsInOptions = 0; - for (Attribute optionsElement : getOptions()) { - if (isa(optionsElement)) - numUnitsInOptions++; - else if (!isa(optionsElement)) - return emitOpError() << "expected each option to be either a StringAttr " - << "or a UnitAttr, got " << optionsElement; - } + // Check that there is a one-to-one correspondence between param operands + // and references to dynamic options in the options dictionary. 
- if (getDynamicOptions().size() != numUnitsInOptions) - return emitOpError() - << "expected the same number of options passed as params as " - << "UnitAttr elements in options ArrayAttr"; + auto dynamicOptions = SmallVector(getDynamicOptions()); + for (NamedAttribute namedAttr : getOptions()) + if (auto paramOperand = + dyn_cast(namedAttr.getValue())) { + size_t dynamicOptionIdx = paramOperand.getIndex().getInt(); + if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size()) + return emitOpError() + << "dynamic option index " << dynamicOptionIdx + << " is out of bounds for the number of dynamic options: " + << dynamicOptions.size(); + if (dynamicOptions[dynamicOptionIdx] == nullptr) + return emitOpError() << "dynamic option index " << dynamicOptionIdx + << " is already used in options"; + dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used. + } + + for (Value dynamicOption : dynamicOptions) + if (dynamicOption) + return emitOpError() << "a param operand does not have a corresponding " + << "param_operand attr in the options dict"; return success(); } diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index 5b158ec6b65f..10a04b0cc14e 100644 --- a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -18,7 +18,12 @@ try: except ImportError as e: raise RuntimeError("Error loading imports from extension module") from e -from typing import Optional, Sequence, Union, NewType +from typing import Dict, Optional, Sequence, Union, NewType + + +@register_attribute_builder("ParamOperandAttr") +def _paramOperandAttr(x: int, context) -> Attribute: + return Attribute.parse(f"#transform.param_operand", context=context) @_ods_cext.register_operation(_Dialect, replace=True) @@ -214,6 +219,81 @@ class YieldOp(YieldOp): super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip) +@_ods_cext.register_operation(_Dialect, replace=True) 
+class ApplyRegisteredPassOp(ApplyRegisteredPassOp): + def __init__( + self, + result: Type, + pass_name: Union[str, StringAttr], + target: Union[Operation, Value, OpView], + *, + options: Optional[ + Dict[ + Union[str, StringAttr], + Union[Attribute, Value, Operation, OpView], + ] + ] = None, + loc=None, + ip=None, + ): + options_dict = {} + dynamic_options = [] + + ParamOperandAttr = AttrBuilder.get("ParamOperandAttr") + context = (loc and loc.context) or Context.current + + cur_param_operand_idx = 0 + for key, value in options.items() if options is not None else {}: + if isinstance(key, StringAttr): + key = key.value + + if isinstance(value, (Value, Operation, OpView)): + dynamic_options.append(_get_op_result_or_value(value)) + options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context) + cur_param_operand_idx += 1 + elif isinstance(value, Attribute): + options_dict[key] = value + elif isinstance(value, str): + options_dict[key] = StringAttr.get(value) + else: + raise TypeError(f"Unsupported option type: {type(value)}") + if len(options_dict) > 0: + print(options_dict, cur_param_operand_idx) + super().__init__( + result, + pass_name, + dynamic_options, + target=_get_op_result_or_value(target), + options=DictAttr.get(options_dict), + loc=loc, + ip=ip, + ) + + +def apply_registered_pass( + result: Type, + pass_name: Union[str, StringAttr], + target: Union[Operation, Value, OpView], + *, + options: Optional[ + Dict[ + Union[str, StringAttr], + Union[Attribute, Value, Operation, OpView], + ] + ] = None, + loc=None, + ip=None, +) -> Value: + return ApplyRegisteredPassOp( + result=result, + pass_name=pass_name, + target=target, + options=options, + loc=loc, + ip=ip, + ).result + + AnyOpTypeT = NewType("AnyOpType", AnyOpType) diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir index 463fd98afa65..6e6d4eb7e249 100644 --- a/mlir/test/Dialect/Transform/test-pass-application.mlir +++ 
b/mlir/test/Dialect/Transform/test-pass-application.mlir @@ -80,7 +80,7 @@ module attributes {transform.with_named_sequence} { // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}} // expected-error @below {{: no such option invalid-option}} transform.apply_registered_pass "canonicalize" - with options = "invalid-option=1" to %1 + with options = { "invalid-option" = 1 } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -97,7 +97,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op transform.apply_registered_pass "canonicalize" - with options = "top-down=false" to %1 + with options = { "top-down" = false } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -115,7 +115,7 @@ module attributes {transform.with_named_sequence} { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op //transform.apply_registered_pass "canonicalize" with options = "top-down=false,max-iterations=10" to %1 : (!transform.any_op) -> !transform.any_op transform.apply_registered_pass "canonicalize" - with options = "top-down=false test-convergence=true" to %1 + with options = { "top-down" = false, "test-convergence" =true } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -132,7 +132,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op transform.apply_registered_pass "canonicalize" - with options = "top-down=false" "max-iterations=0" to %1 + with options = { "top-down" = false, "max-iterations" = 0 } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -148,10 +148,15 @@ func.func 
@valid_dynamic_pass_options() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param - %max_rewrites = transform.param.constant "max-num-rewrites=1" -> !transform.any_param - %2 = transform.apply_registered_pass "canonicalize" - with options = "top-down=false" %max_iter "test-convergence=true" %max_rewrites to %1 + %max_iter = transform.param.constant 10 -> !transform.any_param + %max_rewrites = transform.param.constant 1 -> !transform.any_param + %2 = transform.apply_registered_pass + "canonicalize" + with options = { "top-down" = false, + "max-iterations" = %max_iter, + "test-convergence" = true, + "max-num-rewrites" = %max_rewrites } + to %1 : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op transform.yield } @@ -159,7 +164,7 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @invalid_dynamic_options_as_array() { +func.func @invalid_options_as_str() { return } @@ -167,34 +172,80 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param - // expected-error @+2 {{expected at least one option (either a string or a param)}} + // expected-error @+2 {{expected '{' in options dictionary}} %2 = transform.apply_registered_pass "canonicalize" - with options = ["top-down=false" %max_iter] to %1 - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op transform.yield } } // ----- -func.func 
@invalid_options_as_pairs() { +func.func @invalid_options_as_pairs_without_braces() { return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @+2 {{expected 'to'}} + // expected-error @+2 {{expected '{' in options dictionary}} %2 = transform.apply_registered_pass "canonicalize" - with options = "top-down=" false to %1 - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + with options = "top-down"=false to %1 : (!transform.any_op) -> !transform.any_op transform.yield } } // ----- -func.func @invalid_pass_option_param() { +func.func @invalid_options_due_to_reserved_attr() { + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}} + %2 = transform.apply_registered_pass "canonicalize" + with options = { "top-down" = #transform.param_operand } to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @invalid_options_due_duplicated_key() { + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @+2 {{duplicate keys found in options dictionary}} + %2 = transform.apply_registered_pass "canonicalize" + with options = {"top-down"=false,"top-down"=true} to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + 
+func.func @invalid_options_due_invalid_key() { + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @+2 {{expected key to either be an identifier or a string}} + %2 = transform.apply_registered_pass "canonicalize" + with options = { @label = 0 } to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @invalid_pass_option_bare_param() { return } @@ -202,7 +253,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op %pass_options = transform.param.constant 42 -> !transform.any_param - // expected-error @below {{options passed as a param must be a string, got 42}} + // expected-error @+2 {{expected '{' in options dictionary}} transform.apply_registered_pass "canonicalize" with options = %pass_options to %1 : (!transform.any_param, !transform.any_op) -> !transform.any_op @@ -219,12 +270,12 @@ func.func @too_many_pass_option_params() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %x = transform.param.constant "x" -> !transform.any_param - %y = transform.param.constant "y" -> !transform.any_param - %pass_options = transform.merge_handles %x, %y : !transform.any_param + %x = transform.param.constant true -> !transform.any_param + %y = transform.param.constant false -> !transform.any_param + %topdown_options = transform.merge_handles %x, %y : !transform.any_param // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 
2}} transform.apply_registered_pass "canonicalize" - with options = %pass_options to %1 + with options = { "top-down" = %topdown_options } to %1 : (!transform.any_param, !transform.any_op) -> !transform.any_op transform.yield } @@ -248,3 +299,77 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +///////////////////////////////////////////////////////////////////// +// Check that the following cases are caugh in the generic format. // +///////////////////////////////////////////////////////////////////// + +// Invalid due to param_operand occurences in options dict not being +// one-to-one with the dynamic options provided as params: +// param_operand_index out of bounds w.r.t. the number of options provided via params. + +"builtin.module"() ({ + "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ + ^bb0(%arg0: !transform.any_op): + %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op + %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param + // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}} + %2 = "transform.apply_registered_pass"(%1, %0) <{ + options = {"max-iterations" = #transform.param_operand, + "test-convergence" = true, + "top-down" = false}, + pass_name = "canonicalize"}> + : (!transform.any_param, !transform.any_op) -> !transform.any_op + "transform.yield"() : () -> () + }) : () -> () +}) {transform.with_named_sequence} : () -> () + +// ----- + +// Invalid due to param_operand occurences in options dict not being +// one-to-one with the dynamic options provided as params: +// the first option-param is referred to twice and the second one not at all. +// (In the pretty-printed format, if you want to refer to a param SSA-value twice, it counts as two param arguments.) 
+ +"builtin.module"() ({ + "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ + ^bb0(%arg0: !transform.any_op): + %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op + %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param + %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param + // expected-error @below {{dynamic option index 0 is already used in options}} + %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + options = {"max-iterations" = #transform.param_operand, + "max-num-rewrites" = #transform.param_operand, + "test-convergence" = true, + "top-down" = false}, + pass_name = "canonicalize"}> + : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + "transform.yield"() : () -> () + }) : () -> () +}) {transform.with_named_sequence} : () -> () + +// ----- + +// Invalid due to param_operand occurences in options dict not being +// one-to-one with the dynamic options provided as params: +// two option-params are provide though only the first one is referred to from the options-dict. 
+ +"builtin.module"() ({ + "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ + ^bb0(%arg0: !transform.any_op): + %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op + %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param + %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param + // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}} + %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + options = {"max-iterations" = #transform.param_operand, + "test-convergence" = true, + "top-down" = false}, + pass_name = "canonicalize"}> + : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + "transform.yield"() : () -> () + }) : () -> () +}) {transform.with_named_sequence} : () -> () diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py index 6ed4818fc9d2..48bc9bad37a1 100644 --- a/mlir/test/python/dialects/transform.py +++ b/mlir/test/python/dialects/transform.py @@ -254,3 +254,55 @@ def testReplicateOp(module: Module): # CHECK: %[[FIRST:.+]] = pdl_match # CHECK: %[[SECOND:.+]] = pdl_match # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]] + + +@run +def testApplyRegisteredPassOp(module: Module): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get() + ) + with InsertionPoint(sequence.body): + mod = transform.ApplyRegisteredPassOp( + transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget + ) + mod = transform.ApplyRegisteredPassOp( + transform.AnyOpType.get(), + "canonicalize", + mod.result, + options={"top-down": BoolAttr.get(False)}, + ) + max_iter = transform.param_constant( + transform.AnyParamType.get(), + IntegerAttr.get(IntegerType.get_signless(64), 10), + ) + max_rewrites = 
transform.param_constant( + transform.AnyParamType.get(), + IntegerAttr.get(IntegerType.get_signless(64), 1), + ) + transform.apply_registered_pass( + transform.AnyOpType.get(), + "canonicalize", + mod, + options={ + "top-down": BoolAttr.get(False), + "max-iterations": max_iter, + "test-convergence": BoolAttr.get(True), + "max-rewrites": max_rewrites, + }, + ) + transform.YieldOp() + # CHECK-LABEL: TEST: testApplyRegisteredPassOp + # CHECK: transform.sequence + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" + # CHECK-SAME: with options = {"top-down" = false} + # CHECK-SAME: to {{.*}} : (!transform.any_op) -> !transform.any_op + # CHECK: %[[MAX_ITER:.+]] = transform.param.constant + # CHECK: %[[MAX_REWRITE:.+]] = transform.param.constant + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" + # NB: MLIR has sorted the dict lexicographically by key: + # CHECK-SAME: with options = {"max-iterations" = %[[MAX_ITER]], + # CHECK-SAME: "max-rewrites" = %[[MAX_REWRITE]], + # CHECK-SAME: "test-convergence" = true, + # CHECK-SAME: "top-down" = false} + # CHECK-SAME: to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op From 459475020aeff15d0f886ab99c59d66b744d3e17 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 16:35:55 +0100 Subject: [PATCH 0012/1322] Reapply 76197ea6f91f after removing an assertion Specifically this is the assertion in BasicBlock.cpp. Now that we're not examining or setting that flag consistently (because it'll be deleted in about an hour) there's no need to keep this assertion. 
Original commit title: [DebugInfo][RemoveDIs] Remove some debug intrinsic-only codepaths (#143451) --- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 - llvm/lib/IR/AutoUpgrade.cpp | 25 ++---- llvm/lib/IR/BasicBlock.cpp | 1 - llvm/lib/IR/DIBuilder.cpp | 97 +++++----------------- llvm/lib/IR/DebugInfo.cpp | 19 +---- llvm/lib/Transforms/Utils/LoopUtils.cpp | 36 +++----- llvm/unittests/IR/IRBuilderTest.cpp | 10 --- 7 files changed, 40 insertions(+), 151 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 59cd0dc8dd34..e8a3df3366b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1671,9 +1671,6 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, const DebugLoc &DbgLoc) { const BasicBlock *BB = FuncInfo.MBB->getBasicBlock(); bool BlockHasMultipleInstrs = &BB->front() != &BB->back(); - // Handle legacy case of debug intrinsics - if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat) - BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1; if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only non-debug // instruction in the block then emit it, otherwise we have the diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cb90af36f3d9..a0886776ff93 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -4490,7 +4490,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); if (!NewFn) { - bool FallthroughToDefaultUpgrade = false; // Get the Function's name. 
StringRef Name = F->getName(); @@ -4518,29 +4517,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else if (IsAMDGCN) { Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); } else if (IsDbg) { - // We might have decided we don't want the new format after all between - // first requesting the upgrade and now; skip the conversion if that is - // the case, and check here to see if the intrinsic needs to be upgraded - // normally. - if (!CI->getModule()->IsNewDbgInfoFormat) { - bool NeedsUpgrade = - upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false); - if (!NeedsUpgrade) - return; - FallthroughToDefaultUpgrade = true; - } else { - upgradeDbgIntrinsicToDbgRecord(Name, CI); - } + upgradeDbgIntrinsicToDbgRecord(Name, CI); } else { llvm_unreachable("Unknown function for CallBase upgrade."); } - if (!FallthroughToDefaultUpgrade) { - if (Rep) - CI->replaceAllUsesWith(Rep); - CI->eraseFromParent(); - return; - } + if (Rep) + CI->replaceAllUsesWith(Rep); + CI->eraseFromParent(); + return; } const auto &DefaultCase = [&]() -> void { diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index f716e9970b84..62a75313bb17 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -60,7 +60,6 @@ void BasicBlock::convertToNewDbgValues() { // instruction. SmallVector DbgVarRecs; for (Instruction &I : make_early_inc_range(InstList)) { - assert(!I.DebugMarker && "DebugMarker already set on old-format instrs?"); if (DbgVariableIntrinsic *DVI = dyn_cast(&I)) { // Convert this dbg.value to a DbgVariableRecord. 
DbgVariableRecord *Value = new DbgVariableRecord(DVI); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 5e5ff22132e9..1484c549dd58 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1047,36 +1047,13 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID)); assert(Link && "Linked instruction must have DIAssign metadata attached"); - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( - Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); - // Insert after LinkedInstr. - BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); - NextIt.setHeadBit(true); - insertDbgVariableRecord(DVR, NextIt); - return DVR; - } - - LLVMContext &Ctx = LinkedInstr->getContext(); - Module *M = LinkedInstr->getModule(); - if (!AssignFn) - AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); - - std::array Args = { - MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), - MetadataAsValue::get(Ctx, SrcVar), - MetadataAsValue::get(Ctx, ValExpr), - MetadataAsValue::get(Ctx, Link), - MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)), - MetadataAsValue::get(Ctx, AddrExpr), - }; - - IRBuilder<> B(Ctx); - B.SetCurrentDebugLocation(DL); - - auto *DVI = cast(B.CreateCall(AssignFn, Args)); - DVI->insertAfter(LinkedInstr->getIterator()); - return DVI; + DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( + Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); + // Insert after LinkedInstr. + BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); + NextIt.setHeadBit(true); + insertDbgVariableRecord(DVR, NextIt); + return DVR; } /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics. 
@@ -1101,18 +1078,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, DIExpression *Expr, const DILocation *DL, InsertPosition InsertPt) { - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; - } - - if (!ValueFn) - ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); - auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt); - cast(DVI)->setTailCall(); - return DVI; + DbgVariableRecord *DVR = + DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; } DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, @@ -1124,25 +1093,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, VarInfo->getScope()->getSubprogram() && "Expected matching subprograms"); - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; - } - - if (!DeclareFn) - DeclareFn = getDeclareIntrin(M); - - trackIfUnresolved(VarInfo); - trackIfUnresolved(Expr); - Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage), - MetadataAsValue::get(VMContext, VarInfo), - MetadataAsValue::get(VMContext, Expr)}; - - IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertPt); - return B.CreateCall(DeclareFn, Args); + DbgVariableRecord *DVR = + DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; } void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR, @@ -1191,23 +1145,12 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, "Expected matching subprograms"); trackIfUnresolved(LabelInfo); - if (M.IsNewDbgInfoFormat) { - DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); - if 
(InsertPt.isValid()) { - auto *BB = InsertPt.getBasicBlock(); - BB->insertDbgRecordBefore(DLR, InsertPt); - } - return DLR; + DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); + if (InsertPt.isValid()) { + auto *BB = InsertPt.getBasicBlock(); + BB->insertDbgRecordBefore(DLR, InsertPt); } - - if (!LabelFn) - LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); - - Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; - - IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertPt); - return B.CreateCall(LabelFn, Args); + return DLR; } void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 7db9891fdbd7..2a84e7bae0f1 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2123,22 +2123,11 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, Expr = *R; } DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {}); - if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) { - auto *Assign = DbgVariableRecord::createLinkedDVRAssign( - &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); - (void)Assign; - LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); - return; - } - auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest, - AddrExpr, VarRec.DL); + auto *Assign = DbgVariableRecord::createLinkedDVRAssign( + &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); (void)Assign; - LLVM_DEBUG(if (!Assign.isNull()) { - if (const auto *Record = dyn_cast(Assign)) - errs() << " > INSERT: " << *Record << "\n"; - else - errs() << " > INSERT: " << *cast(Assign) << "\n"; - }); + LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); + return; } #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h). 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 0681ebc111cb..ff69fa9f70c4 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -606,7 +606,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, // Use a map to unique and a vector to guarantee deterministic ordering. llvm::SmallDenseSet DeadDebugSet; - llvm::SmallVector DeadDebugInst; llvm::SmallVector DeadDbgVariableRecords; if (ExitBlock) { @@ -633,29 +632,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, U.set(Poison); } - // RemoveDIs: do the same as below for DbgVariableRecords. - if (Block->IsNewDbgInfoFormat) { - for (DbgVariableRecord &DVR : llvm::make_early_inc_range( - filterDbgVars(I.getDbgRecordRange()))) { - DebugVariable Key(DVR.getVariable(), DVR.getExpression(), - DVR.getDebugLoc().get()); - if (!DeadDebugSet.insert(Key).second) - continue; - // Unlinks the DVR from it's container, for later insertion. - DVR.removeFromParent(); - DeadDbgVariableRecords.push_back(&DVR); - } - } - - // For one of each variable encountered, preserve a debug intrinsic (set + // For one of each variable encountered, preserve a debug record (set // to Poison) and transfer it to the loop exit. This terminates any // variable locations that were set during the loop. - auto *DVI = dyn_cast(&I); - if (!DVI) - continue; - if (!DeadDebugSet.insert(DebugVariable(DVI)).second) - continue; - DeadDebugInst.push_back(DVI); + for (DbgVariableRecord &DVR : + llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) { + DebugVariable Key(DVR.getVariable(), DVR.getExpression(), + DVR.getDebugLoc().get()); + if (!DeadDebugSet.insert(Key).second) + continue; + // Unlinks the DVR from it's container, for later insertion. 
+ DVR.removeFromParent(); + DeadDbgVariableRecords.push_back(&DVR); + } } // After the loop has been deleted all the values defined and modified @@ -671,9 +660,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, "There should be a non-PHI instruction in exit block, else these " "instructions will have no parent."); - for (auto *DVI : DeadDebugInst) - DVI->moveBefore(*ExitBlock, InsertDbgValueBefore); - // Due to the "head" bit in BasicBlock::iterator, we're going to insert // each DbgVariableRecord right at the start of the block, wheras dbg.values // would be repeatedly inserted before the first instruction. To replicate diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index 3a7ba924792e..aadae5287c38 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -1003,18 +1003,8 @@ TEST_F(IRBuilderTest, DIBuilder) { EXPECT_TRUE(verifyModule(*M)); }; - // Test in new-debug mode. - EXPECT_TRUE(M->IsNewDbgInfoFormat); RunTest(); - - // Test in old-debug mode. - // Reset the test then call convertFromNewDbgValues to flip the flag - // on the test's Module, Function and BasicBlock. 
TearDown(); - SetUp(); - M->convertFromNewDbgValues(); - EXPECT_FALSE(M->IsNewDbgInfoFormat); - RunTest(); } TEST_F(IRBuilderTest, createArtificialSubprogram) { From f1575de4c5de9268f92eea1641af755a477e4ee4 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 11 Jun 2025 11:37:12 -0500 Subject: [PATCH 0013/1322] [libc][NFC] Remove template from GPU allocator reference counter Summary: We don't need this to be generic, precommit for https://github.com/llvm/llvm-project/pull/143607 --- libc/src/__support/GPU/allocator.cpp | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp index 135ced3df704..ecc0de1cb6ec 100644 --- a/libc/src/__support/GPU/allocator.cpp +++ b/libc/src/__support/GPU/allocator.cpp @@ -283,7 +283,7 @@ struct Slab { /// A wait-free guard around a pointer resource to be created dynamically if /// space is available and freed once there are no more users. -template struct GuardPtr { +struct GuardPtr { private: struct RefCounter { // Indicates that the object is in its deallocation phase and thus invalid. @@ -339,22 +339,22 @@ private: cpp::Atomic counter{0}; }; - cpp::Atomic ptr{nullptr}; + cpp::Atomic ptr{nullptr}; RefCounter ref{}; // Should be called be a single lane for each different pointer. 
template - T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) { - T *expected = ptr.load(cpp::MemoryOrder::RELAXED); + Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) { + Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED); if (!expected && - ptr.compare_exchange_strong(expected, reinterpret_cast(SENTINEL), - cpp::MemoryOrder::RELAXED, - cpp::MemoryOrder::RELAXED)) { + ptr.compare_exchange_strong( + expected, reinterpret_cast(SENTINEL), + cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) { count = cpp::numeric_limits::max(); - void *raw = impl::rpc_allocate(sizeof(T)); + void *raw = impl::rpc_allocate(sizeof(Slab)); if (!raw) return nullptr; - T *mem = new (raw) T(cpp::forward(args)...); + Slab *mem = new (raw) Slab(cpp::forward(args)...); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); ptr.store(mem, cpp::MemoryOrder::RELAXED); @@ -364,7 +364,7 @@ private: return mem; } - if (!expected || expected == reinterpret_cast(SENTINEL)) + if (!expected || expected == reinterpret_cast(SENTINEL)) return nullptr; if (!ref.acquire(n, count)) @@ -379,10 +379,10 @@ public: // The uniform mask represents which lanes share the same pointer. For each // uniform value we elect a leader to handle it on behalf of the other lanes. 
template - T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count, - Args &&...args) { + Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count, + Args &&...args) { count = 0; - T *result = nullptr; + Slab *result = nullptr; if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) result = try_lock_impl(cpp::popcount(uniform), count, cpp::forward(args)...); @@ -403,8 +403,8 @@ public: cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) && ref.release(cpp::popcount(mask))) { - T *p = ptr.load(cpp::MemoryOrder::RELAXED); - p->~T(); + Slab *p = ptr.load(cpp::MemoryOrder::RELAXED); + p->~Slab(); impl::rpc_free(p); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); ptr.store(nullptr, cpp::MemoryOrder::RELAXED); @@ -417,7 +417,7 @@ public: }; // The global array used to search for a valid slab to allocate from. -static GuardPtr slots[ARRAY_SIZE] = {}; +static GuardPtr slots[ARRAY_SIZE] = {}; // Tries to find a slab in the table that can support the given chunk size. static Slab *find_slab(uint32_t chunk_size) { From aa8a1fa6f515f45db55365b9c1f8453ded24ed32 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Wed, 11 Jun 2025 18:42:10 +0200 Subject: [PATCH 0014/1322] [DLCov][NFC] Annotate intentionally-blank DebugLocs in existing code (#136192) Following the work in PR #107279, this patch applies the annotative DebugLocs, which indicate that a particular instruction is intentionally missing a location for a given reason, to existing sites in the compiler where their conditions apply. This is NFC in ordinary LLVM builds (each function `DebugLoc::getFoo()` is inlined as `DebugLoc()`), but marks the instruction in coverage-tracking builds so that it will be ignored by Debugify, allowing only real errors to be reported. From a developer standpoint, it also communicates the intentionality and reason for a missing DebugLoc. 
Some notes for reviewers: - The difference between `I->dropLocation()` and `I->setDebugLoc(DebugLoc::getDropped())` is that the former _may_ decide to keep some debug info alive, while the latter will always be empty; in this patch, I always used the latter (even if the former could technically be correct), because the former could result in some (barely) different output, and I'd prefer to keep this patch purely NFC. - I've generally documented the uses of `DebugLoc::getUnknown()`, with the exception of the vectorizers - in summary, they are a huge cause of dropped source locations, and I don't have the time or the domain knowledge currently to solve that, so I've plastered it all over them as a form of "fixme". --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 10 ++++-- llvm/lib/Transforms/IPO/IROutliner.cpp | 4 +-- .../Transforms/InstCombine/InstCombinePHI.cpp | 9 ++++- .../Scalar/CorrelatedValuePropagation.cpp | 3 +- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 3 ++ llvm/lib/Transforms/Scalar/JumpThreading.cpp | 4 ++- llvm/lib/Transforms/Scalar/LICM.cpp | 4 ++- .../Transforms/Scalar/LoopLoadElimination.cpp | 3 +- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 3 ++ .../Scalar/TailRecursionElimination.cpp | 4 ++- llvm/lib/Transforms/Utils/InlineFunction.cpp | 9 +++++ llvm/lib/Transforms/Utils/Local.cpp | 3 +- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 4 ++- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 5 +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 10 +++--- .../Vectorize/LoopVectorizationPlanner.h | 34 ++++++++++++------- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +++-- .../Transforms/Vectorize/SLPVectorizer.cpp | 12 +++++-- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++-- 19 files changed, 101 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b3fe0ab8b5cb..7db058638650 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1494,8 +1494,14 @@ 
processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // FIXME: Pass Global's alignment when globals have alignment AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr, GV->getName(), FirstI); - if (!isa(GV->getInitializer())) - new StoreInst(GV->getInitializer(), Alloca, FirstI); + Alloca->setDebugLoc(DebugLoc::getCompilerGenerated()); + if (!isa(GV->getInitializer())) { + auto *SI = new StoreInst(GV->getInitializer(), Alloca, FirstI); + // FIXME: We're localizing a global and creating a store instruction for + // the initial value of that global. Could we logically use the global + // variable's (if one exists) line for this? + SI->setDebugLoc(DebugLoc::getCompilerGenerated()); + } GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index ff66a518be75..cb18b55ae218 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -730,7 +730,7 @@ static void moveFunctionData(Function &Old, Function &New, // other outlined instructions. if (!isa(&Val)) { // Remove the debug information for outlined functions. - Val.setDebugLoc(DebugLoc()); + Val.setDebugLoc(DebugLoc::getDropped()); // Loop info metadata may contain line locations. 
Update them to have no // value in the new subprogram since the outlined code could be from @@ -1864,7 +1864,7 @@ replaceArgumentUses(OutlinableRegion &Region, Value *ValueOperand = SI->getValueOperand(); StoreInst *NewI = cast(I->clone()); - NewI->setDebugLoc(DebugLoc()); + NewI->setDebugLoc(DebugLoc::getDropped()); BasicBlock *OutputBB = VBBIt->second; NewI->insertInto(OutputBB, OutputBB->end()); LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index a842a5edcb8a..6477141ab095 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -870,7 +870,14 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) { NewPhi->addIncoming(NewIncoming[I], Phi.getIncomingBlock(I)); InsertNewInstBefore(NewPhi, Phi.getIterator()); - return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); + auto *CI = CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); + + // We use a dropped location here because the new ZExt is necessarily a merge + // of ZExtInsts and at least one constant from incoming branches; the presence + // of the constant means we have no viable DebugLoc from that branch, and + // therefore we must use a dropped location. 
+ CI->setDebugLoc(DebugLoc::getDropped()); + return CI; } /// If all operands to a PHI node are the same "unary" operator and they all are diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index b95a851c99b4..4627f537dc16 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -432,7 +432,8 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, BasicBlock *NewUnreachableBB = BasicBlock::Create(BB->getContext(), "default.unreachable", BB->getParent(), DefaultDest); - new UnreachableInst(BB->getContext(), NewUnreachableBB); + auto *UI = new UnreachableInst(BB->getContext(), NewUnreachableBB); + UI->setDebugLoc(DebugLoc::getTemporary()); DefaultDest->removePredecessor(BB); SI->setDefaultDest(NewUnreachableBB); diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 95d52b9b4e18..334c911191cb 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1506,6 +1506,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { auto *NewRHS = CastInst::Create( Instruction::Trunc, RHS, LHSOp->getType(), "", L->getLoopPreheader()->getTerminator()->getIterator()); + // NewRHS is an operation that has been hoisted out of the loop, and + // therefore should have a dropped location. + NewRHS->setDebugLoc(DebugLoc::getDropped()); ICmp->setOperand(Swapped ? 1 : 0, LHSOp); ICmp->setOperand(Swapped ? 0 : 1, NewRHS); // Samesign flag cannot be preserved after narrowing the compare. 
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9449b4cb35b9..37b85bf9de81 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -3001,8 +3001,10 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { continue; // Expand the select. Value *Cond = SI->getCondition(); - if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) + if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) { Cond = new FreezeInst(Cond, "cond.fr", SI->getIterator()); + cast(Cond)->setDebugLoc(DebugLoc::getTemporary()); + } MDNode *BranchWeights = getBranchWeightMDNode(*SI); Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false, BranchWeights); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 9773ef778b69..3024ccb330b1 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2248,7 +2248,7 @@ bool llvm::promoteLoopAccessesToScalars( if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); - PreheaderLoad->setDebugLoc(DebugLoc()); + PreheaderLoad->setDebugLoc(DebugLoc::getDropped()); if (AATags && LoadIsGuaranteedToExecute) PreheaderLoad->setAAMetadata(AATags); @@ -2808,6 +2808,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, Ins->getName() + ".reass", Ins->getIterator()); + NewBO->setDebugLoc(DebugLoc::getDropped()); NewBO->copyIRFlags(Ins); if (VariantOp == Ins) VariantOp = NewBO; @@ -2864,6 +2865,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, auto *NewBO = BinaryOperator::Create( Opcode, LV, Inv, BO->getName() + ".reass", BO->getIterator()); + NewBO->setDebugLoc(DebugLoc::getDropped()); if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul) { // Intersect FMF flags for FADD and FMUL. 
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 39e8d702a692..6bdf76f789a4 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -442,7 +442,7 @@ public: assert(PH && "Preheader should exist!"); Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), PH->getTerminator()); - Value *Initial = + Instruction *Initial = new LoadInst(Cand.Load->getType(), InitialPtr, "load_initial", /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator()->getIterator()); @@ -450,6 +450,7 @@ public: // into the loop's preheader. A debug location inside the loop will cause // a misleading stepping when debugging. The test update-debugloc-store // -forwarded.ll checks this. + Initial->setDebugLoc(DebugLoc::getDropped()); PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded"); PHI->insertBefore(L->getHeader()->begin()); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 0bf90036b8b8..9b40fc03da6b 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -274,6 +274,7 @@ static void buildPartialUnswitchConditionalBranch( BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { IRBuilder<> IRB(&BB); + IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); SmallVector FrozenInvariants; for (Value *Inv : Invariants) { @@ -330,6 +331,7 @@ static void buildPartialInvariantUnswitchConditionalBranch( } IRBuilder<> IRB(&BB); + IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? 
&NormalSucc : &UnswitchedSucc); @@ -2369,6 +2371,7 @@ static void unswitchNontrivialInvariants( // BI (`dyn_cast(TI)`) is an in-loop instruction hoisted // out of the loop. Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator()); + cast(Cond)->setDebugLoc(DebugLoc::getDropped()); } BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 7dd6c60370ed..c71c5a70a12f 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -515,7 +515,8 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB); NewEntry->takeName(HeaderBB); HeaderBB->setName("tailrecurse"); - BranchInst::Create(HeaderBB, NewEntry); + auto *BI = BranchInst::Create(HeaderBB, NewEntry); + BI->setDebugLoc(DebugLoc::getCompilerGenerated()); // If the new branch preserves the debug location of CI, it could result in // misleading stepping, if CI is located in a conditional branch. // So, here we don't give any debug location to the new branch. 
@@ -801,6 +802,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { SelectInst *SI = SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI->getIterator()); + SI->setDebugLoc(DebugLoc::getCompilerGenerated()); RetSelects.push_back(SI); RI->setOperand(0, SI); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 7a9605bf5f8d..f47c467d1514 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1775,6 +1775,7 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg, AllocaInst *NewAlloca = new AllocaInst(ByValType, Arg->getType()->getPointerAddressSpace(), nullptr, Alignment, Arg->getName()); + NewAlloca->setDebugLoc(DebugLoc::getCompilerGenerated()); NewAlloca->insertBefore(Caller->begin()->begin()); IFI.StaticAllocas.push_back(NewAlloca); @@ -3258,6 +3259,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Add an unconditional branch to make this look like the CallInst case... CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator()); + // We intend to replace this DebugLoc with another later. + CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getTemporary()); // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more @@ -3359,6 +3362,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, Returns[0]->eraseFromParent(); ReturnBB->eraseFromParent(); } else if (!CB.use_empty()) { + // In this case there are no returns to use, so there is no clear source + // location for the "return". + // FIXME: It may be correct to use the scope end line of the function here, + // since this likely means we are falling out of the function. 
+ if (CreatedBranchToNormalDest) + CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getUnknown()); // No returns, but something is using the return value of the call. Just // nuke the result. CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2630a1a7a6af..a3252a69874d 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3127,7 +3127,8 @@ static bool markAliveBlocks(Function &F, BasicBlock *UnreachableNormalDest = BasicBlock::Create( Ctx, OrigNormalDest->getName() + ".unreachable", II->getFunction(), OrigNormalDest); - new UnreachableInst(Ctx, UnreachableNormalDest); + auto *UI = new UnreachableInst(Ctx, UnreachableNormalDest); + UI->setDebugLoc(DebugLoc::getTemporary()); II->setNormalDest(UnreachableNormalDest); if (DTU) DTU->applyUpdates( diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 1a2e42235627..f4b378b82dae 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -348,7 +348,9 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU, NewUnreachableBB = BasicBlock::Create(DefaultDest->getContext(), "default.unreachable", DefaultDest->getParent(), DefaultDest); - new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + auto *UI = + new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + UI->setDebugLoc(DebugLoc::getTemporary()); } DefaultDest->removePredecessor(BB); diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 48d9528f0c3d..5db7fc956c49 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -318,6 +318,11 @@ public: SSAUpdater *Updater) { PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds, Updater->ProtoName); + // FIXME: Ordinarily we don't care about or try to 
assign DebugLocs to PHI + // nodes, but loop optimizations may try to use a PHI node as a DebugLoc + // source (e.g. if this is an induction variable), and it's not clear what + // location we could attach here, so mark this unknown for now. + PHI->setDebugLoc(DebugLoc::getUnknown()); PHI->insertBefore(BB->begin()); return PHI; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index e221022bb836..975ce3bef517 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1137,7 +1137,7 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( // branch, drop it. When we fold the bonus instructions we want to make // sure we reset their debug locations in order to avoid stepping on // dead code caused by folding dead branches. - NewBonusInst->setDebugLoc(DebugLoc()); + NewBonusInst->setDebugLoc(DebugLoc::getDropped()); } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) { mapAtomInstance(DL, VMap); } @@ -2821,7 +2821,8 @@ static void mergeCompatibleInvokesImpl(ArrayRef Invokes, // so just form a new block with unreachable terminator. BasicBlock *MergedNormalDest = BasicBlock::Create( Ctx, II0BB->getName() + ".cont", Func, InsertBeforeBlock); - new UnreachableInst(Ctx, MergedNormalDest); + auto *UI = new UnreachableInst(Ctx, MergedNormalDest); + UI->setDebugLoc(DebugLoc::getTemporary()); MergedInvoke->setNormalDest(MergedNormalDest); } @@ -3389,7 +3390,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, if (!SpeculatedStoreValue || &I != SpeculatedStore) { // Don't update the DILocation of dbg.assign intrinsics. 
if (!isa(&I)) - I.setDebugLoc(DebugLoc()); + I.setDebugLoc(DebugLoc::getDropped()); } I.dropUBImplyingAttrsAndMetadata(); @@ -5707,7 +5708,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch, BasicBlock *NewDefaultBlock = BasicBlock::Create( BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), OrigDefaultBlock); - new UnreachableInst(Switch->getContext(), NewDefaultBlock); + auto *UI = new UnreachableInst(Switch->getContext(), NewDefaultBlock); + UI->setDebugLoc(DebugLoc::getTemporary()); Switch->setDefaultDest(&*NewDefaultBlock); if (DTU) { SmallVector Updates; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index b81d582f07e8..70f541d64b30 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -153,7 +153,7 @@ public: VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, Instruction *Inst = nullptr, const Twine &Name = "") { - DebugLoc DL; + DebugLoc DL = DebugLoc::getUnknown(); if (Inst) DL = Inst->getDebugLoc(); VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); @@ -165,7 +165,8 @@ public: return createInstruction(Opcode, Operands, DL, Name); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, DebugLoc DL = {}, + const VPIRFlags &Flags, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(Opcode, Operands, Flags, DL, Name)); @@ -174,7 +175,8 @@ public: VPInstruction *createNaryOp(unsigned Opcode, std::initializer_list Operands, Type *ResultTy, const VPIRFlags &Flags = {}, - DebugLoc DL = {}, const Twine &Name = "") { + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { return tryInsertInstruction( new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name)); } @@ -182,22 +184,25 @@ public: VPInstruction 
*createOverflowingOp(unsigned Opcode, std::initializer_list Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, - DebugLoc DL = {}, const Twine &Name = "") { + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); } - VPValue *createNot(VPValue *Operand, DebugLoc DL = {}, + VPValue *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + VPValue *createAnd(VPValue *LHS, VPValue *RHS, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); } - VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + VPValue *createOr(VPValue *LHS, VPValue *RHS, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( @@ -205,14 +210,16 @@ public: VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); } - VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); } VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL = {}, const Twine &Name = "", + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "", std::optional FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, @@ -226,20 +233,23 @@ public: /// and \p B. /// TODO: add createFCmp when needed. 
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, - DebugLoc DL = {}, const Twine &Name = "") { + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); } - VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {}, + VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, GEPNoWrapFlags::none(), DL, Name)); } - VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {}, + VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 427c1460fcfc..2a237f42e404 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -772,7 +772,7 @@ protected: /// Look for a meaningful debug location on the instruction or its operands. 
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { if (!I) - return DebugLoc(); + return DebugLoc::getUnknown(); DebugLoc Empty; if (I->getDebugLoc() != Empty) @@ -1881,13 +1881,15 @@ public: if (SCEVCheckBlock) { SCEVCheckBlock->getTerminator()->moveBefore( Preheader->getTerminator()->getIterator()); - new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); + auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); + UI->setDebugLoc(DebugLoc::getTemporary()); Preheader->getTerminator()->eraseFromParent(); } if (MemCheckBlock) { MemCheckBlock->getTerminator()->moveBefore( Preheader->getTerminator()->getIterator()); - new UnreachableInst(Preheader->getContext(), MemCheckBlock); + auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock); + UI->setDebugLoc(DebugLoc::getTemporary()); Preheader->getTerminator()->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ec40124c57a6..c3ca22dce0cc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17434,6 +17434,12 @@ static Instruction *propagateMetadata(Instruction *Inst, ArrayRef VL) { return llvm::propagateMetadata(Inst, Insts); } +static DebugLoc getDebugLocFromPHI(PHINode &PN) { + if (DebugLoc DL = PN.getDebugLoc()) + return DL; + return DebugLoc::getUnknown(); +} + Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilderBase::InsertPointGuard Guard(Builder); @@ -17599,14 +17605,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent(), PH->getParent()->getFirstNonPHIIt()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH)); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; // Adjust insertion point once all PHI's have been generated. 
Builder.SetInsertPoint(PH->getParent(), PH->getParent()->getFirstInsertionPt()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH)); V = FinalShuffle(V, E); @@ -17638,7 +17644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Builder.SetInsertPoint(IBB->getTerminator()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH)); Value *Vec = vectorizeOperand(E, I); if (VecTy != Vec->getType()) { assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bbcbfee4e471..acc861b99197 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1816,9 +1816,9 @@ public: class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors { protected: VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, - VPValue *Start, DebugLoc DL = {}) - : VPSingleDefRecipe(VPDefID, ArrayRef({Start}), UnderlyingInstr, DL) { - } + VPValue *Start, DebugLoc DL = DebugLoc::getUnknown()) + : VPSingleDefRecipe(VPDefID, ArrayRef({Start}), + UnderlyingInstr, DL) {} const VPRecipeBase *getAsRecipe() const override { return this; } From 117e78fe5012087c1ee535b91936bf4d8e3c7785 Mon Sep 17 00:00:00 2001 From: William <113542065+saturn691@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:51:34 +0100 Subject: [PATCH 0015/1322] [libc] Add NULL macro definitions to header files (#142764) By the C standard, , , , , , and require NULL to be defined. 
--- libc/include/CMakeLists.txt | 5 +++++ libc/include/locale.yaml | 3 +++ libc/include/stdio.yaml | 2 ++ libc/include/stdlib.yaml | 4 +++- libc/include/string.h.def | 2 -- libc/include/string.yaml | 4 +++- libc/include/time.yaml | 4 +++- libc/include/wchar.yaml | 4 +++- 8 files changed, 22 insertions(+), 6 deletions(-) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 7209e10c68b8..55268d19529c 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -255,6 +255,7 @@ add_header_macro( time.h DEPENDS .llvm_libc_common_h + .llvm-libc-macros.null_macro .llvm-libc-macros.time_macros .llvm-libc-types.clock_t .llvm-libc-types.time_t @@ -329,6 +330,7 @@ add_header_macro( stdio.h DEPENDS .llvm-libc-macros.file_seek_macros + .llvm-libc-macros.null_macro .llvm-libc-macros.stdio_macros .llvm-libc-types.FILE .llvm-libc-types.cookie_io_functions_t @@ -343,6 +345,7 @@ add_header_macro( ../libc/include/stdlib.yaml stdlib.h DEPENDS + .llvm-libc-macros.null_macro .llvm-libc-macros.stdlib_macros .llvm-libc-types.__atexithandler_t .llvm-libc-types.__qsortcompare_t @@ -709,6 +712,7 @@ add_header_macro( wchar.h DEPENDS .llvm_libc_common_h + .llvm-libc-macros.null_macro .llvm-libc-macros.wchar_macros .llvm-libc-types.mbstate_t .llvm-libc-types.size_t @@ -723,6 +727,7 @@ add_header_macro( DEPENDS .llvm_libc_common_h .llvm-libc-macros.locale_macros + .llvm-libc-macros.null_macro .llvm-libc-types.locale_t .llvm-libc-types.struct_lconv ) diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml index 6c71b70e59f0..4566984ad83a 100644 --- a/libc/include/locale.yaml +++ b/libc/include/locale.yaml @@ -1,5 +1,8 @@ header: locale.h header_template: locale.h.def +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: locale_t - type_name: struct_lconv diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml index 2619984cca26..3d5164fa10ff 100644 --- a/libc/include/stdio.yaml +++ b/libc/include/stdio.yaml @@ -1,6 
+1,8 @@ header: stdio.h header_template: stdio.h.def macros: + - macro_name: NULL + macro_header: null-macro.h - macro_name: stdout macro_value: stdout - macro_name: stdin diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml index f7155ba27a16..3b2ff13c684b 100644 --- a/libc/include/stdlib.yaml +++ b/libc/include/stdlib.yaml @@ -4,7 +4,9 @@ standards: - stdc merge_yaml_files: - stdlib-malloc.yaml -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: __atexithandler_t - type_name: __qsortcompare_t diff --git a/libc/include/string.h.def b/libc/include/string.h.def index 1bd2687db2be..339d005e43a4 100644 --- a/libc/include/string.h.def +++ b/libc/include/string.h.def @@ -11,8 +11,6 @@ #include "__llvm-libc-common.h" -#include "llvm-libc-macros/null-macro.h" - %%public_api() #endif // LLVM_LIBC_STRING_H diff --git a/libc/include/string.yaml b/libc/include/string.yaml index 9f72b8db6c1e..736deceb453d 100644 --- a/libc/include/string.yaml +++ b/libc/include/string.yaml @@ -1,6 +1,8 @@ header: string.h header_template: string.h.def -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: locale_t - type_name: size_t diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 7bb25dbe85ac..3b9d77c0aaae 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -1,6 +1,8 @@ header: time.h header_template: time.h.def -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: struct_timeval - type_name: clockid_t diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 877be48b6a10..57f4f6660827 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -1,6 +1,8 @@ header: wchar.h header_template: wchar.h.def -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: size_t - type_name: wint_t From 469922f7c40a1733fba98e29fa2bd09a9565ddd6 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: 
Wed, 11 Jun 2025 16:57:23 +0000 Subject: [PATCH 0016/1322] [X86] Don't emit ENDBR for asm goto branch targets (#143439) Similarly to #141562, which disabled BTI generation for ARM asm goto branch targets, drop unnecessary ENDBRs from IsInlineAsmBrIndirectTarget machine basic blocks. --- .../Target/X86/X86IndirectBranchTracking.cpp | 2 +- llvm/test/CodeGen/X86/callbr-asm-endbr.ll | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/callbr-asm-endbr.ll diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 7740a174af4f..52be14228e55 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -147,7 +147,7 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB : MF) { // Find all basic blocks that their address was taken (for example // in the case of indirect jump) and add ENDBR instruction. 
- if (MBB.hasAddressTaken()) + if (MBB.isMachineBlockAddressTaken() || MBB.isIRBlockAddressTaken()) Changed |= addENDBR(MBB, MBB.begin()); for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { diff --git a/llvm/test/CodeGen/X86/callbr-asm-endbr.ll b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll new file mode 100644 index 000000000000..133de89d5f3a --- /dev/null +++ b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define i32 @test1(i32 %a) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: endbr64 +; CHECK-NEXT: addl $4, %edi +; CHECK-NEXT: #APP +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: jmp .LBB0_2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # %bb.1: # %normal +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: # Inline asm indirect target +; CHECK-NEXT: # %fail +; CHECK-NEXT: # Label of block must be emitted +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +entry: + %0 = add i32 %a, 4 + callbr void asm "xorl $0, $0; jmp ${1:l}", "r,!i,~{dirflag},~{fpsr},~{flags}"(i32 %0) to label %normal [label %fail] + +normal: + ret i32 0 + +fail: + ret i32 1 +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 8, !"cf-protection-branch", i32 1} From 145b1b0f103e61cfc8a47ed37080e955630a1390 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 11 Jun 2025 09:57:42 -0700 Subject: [PATCH 0017/1322] [lldb][nfc] Factor out code checking if Variable is in scope (#143572) This is useful for checking whether a variable is in scope inside a specific block. 
--- lldb/include/lldb/Symbol/Variable.h | 3 ++ lldb/source/Symbol/Variable.cpp | 46 +++++++++++++++-------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/lldb/include/lldb/Symbol/Variable.h b/lldb/include/lldb/Symbol/Variable.h index c437624d1ea6..5b9c709c8b86 100644 --- a/lldb/include/lldb/Symbol/Variable.h +++ b/lldb/include/lldb/Symbol/Variable.h @@ -89,6 +89,9 @@ public: bool IsInScope(StackFrame *frame); + /// Returns true if this variable is in scope at `addr` inside `block`. + bool IsInScope(const Block &block, const Address &addr); + bool LocationIsValidForFrame(StackFrame *frame); bool LocationIsValidForAddress(const Address &address); diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index 8244725aba54..af32e0e958e5 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -290,28 +290,9 @@ bool Variable::IsInScope(StackFrame *frame) { // this variable was defined in is currently Block *deepest_frame_block = frame->GetSymbolContext(eSymbolContextBlock).block; - if (deepest_frame_block) { - SymbolContext variable_sc; - CalculateSymbolContext(&variable_sc); - - // Check for static or global variable defined at the compile unit - // level that wasn't defined in a block - if (variable_sc.block == nullptr) - return true; - - // Check if the variable is valid in the current block - if (variable_sc.block != deepest_frame_block && - !variable_sc.block->Contains(deepest_frame_block)) - return false; - - // If no scope range is specified then it means that the scope is the - // same as the scope of the enclosing lexical block. 
- if (m_scope_range.IsEmpty()) - return true; - - addr_t file_address = frame->GetFrameCodeAddress().GetFileAddress(); - return m_scope_range.FindEntryThatContains(file_address) != nullptr; - } + Address frame_addr = frame->GetFrameCodeAddress(); + if (deepest_frame_block) + return IsInScope(*deepest_frame_block, frame_addr); } break; @@ -321,6 +302,27 @@ bool Variable::IsInScope(StackFrame *frame) { return false; } +bool Variable::IsInScope(const Block &block, const Address &addr) { + SymbolContext variable_sc; + CalculateSymbolContext(&variable_sc); + + // Check for static or global variable defined at the compile unit + // level that wasn't defined in a block + if (variable_sc.block == nullptr) + return true; + + // Check if the variable is valid in the current block + if (variable_sc.block != &block && !variable_sc.block->Contains(&block)) + return false; + + // If no scope range is specified then it means that the scope is the + // same as the scope of the enclosing lexical block. + if (m_scope_range.IsEmpty()) + return true; + + return m_scope_range.FindEntryThatContains(addr.GetFileAddress()) != nullptr; +} + Status Variable::GetValuesForVariableExpressionPath( llvm::StringRef variable_expr_path, ExecutionContextScope *scope, GetVariableCallback callback, void *baton, VariableList &variable_list, From 370e54d03a5bb11f3f283ad5ab479501c74069c7 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Wed, 11 Jun 2025 19:02:36 +0200 Subject: [PATCH 0018/1322] [CIR] Upstream splat op for VectorType (#139827) This change adds support for splat op for VectorType Issue https://github.com/llvm/llvm-project/issues/136487 --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 32 ++++++++++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 8 +++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 51 +++++++++++++++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 10 +++ clang/test/CIR/CodeGen/vector-ext.cpp | 64 +++++++++++++++++++ clang/test/CIR/CodeGen/vector.cpp | 63 ++++++++++++++++++ 
clang/test/CIR/IR/vector.cir | 33 ++++++++++ 7 files changed, 261 insertions(+) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 565c0676773e..634f0dd554c7 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2277,6 +2277,38 @@ def VecTernaryOp : CIR_Op<"vec.ternary", let hasFolder = 1; } +//===----------------------------------------------------------------------===// +// VecSplatOp +//===----------------------------------------------------------------------===// + +def VecSplatOp : CIR_Op<"vec.splat", [Pure, + TypesMatchWith<"type of 'value' matches element type of 'result'", "result", + "value", "cast($_self).getElementType()">]> { + + let summary = "Convert a scalar into a vector"; + let description = [{ + The `cir.vec.splat` operation creates a vector value from a scalar value. + All elements of the vector have the same value, that of the given scalar. + + It's a separate operation from `cir.vec.create` because more + efficient LLVM IR can be generated for it, and because some optimization and + analysis passes can benefit from knowing that all elements of the vector + have the same value. 
+ + ```mlir + %value = cir.const #cir.int<3> : !s32i + %value_vec = cir.vec.splat %value : !s32i, !cir.vector<4 x !s32i> + ``` + }]; + + let arguments = (ins CIR_VectorElementType:$value); + let results = (outs CIR_VectorType:$result); + + let assemblyFormat = [{ + $value `:` type($value) `,` qualified(type($result)) attr-dict + }]; +} + //===----------------------------------------------------------------------===// // BaseClassAddrOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 481eb492d187..30d231e2c61d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1780,6 +1780,14 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { cgf.convertType(destTy)); } + case CK_VectorSplat: { + // Create a vector object and fill all elements with the same scalar value. + assert(destTy->isVectorType() && "CK_VectorSplat to non-vector type"); + return builder.create( + cgf.getLoc(subExpr->getSourceRange()), cgf.convertType(destTy), + Visit(subExpr)); + } + default: cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(), "CastExpr: ", ce->getCastKindName()); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 4fdf8f9ec269..1642d10d427b 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1803,6 +1803,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMVecExtractOpLowering, CIRToLLVMVecInsertOpLowering, CIRToLLVMVecCmpOpLowering, + CIRToLLVMVecSplatOpLowering, CIRToLLVMVecShuffleOpLowering, CIRToLLVMVecShuffleDynamicOpLowering, CIRToLLVMVecTernaryOpLowering @@ -1956,6 +1957,56 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult 
CIRToLLVMVecSplatOpLowering::matchAndRewrite( + cir::VecSplatOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + // Vector splat can be implemented with an `insertelement` and a + // `shufflevector`, which is better than an `insertelement` for each + // element in the vector. Start with an undef vector. Insert the value into + // the first element. Then use a `shufflevector` with a mask of all 0 to + // fill out the entire vector with that value. + cir::VectorType vecTy = op.getType(); + mlir::Type llvmTy = typeConverter->convertType(vecTy); + mlir::Location loc = op.getLoc(); + mlir::Value poison = rewriter.create(loc, llvmTy); + + mlir::Value elementValue = adaptor.getValue(); + if (mlir::isa(elementValue.getDefiningOp())) { + // If the splat value is poison, then we can just use poison value + // for the entire vector. + rewriter.replaceOp(op, poison); + return mlir::success(); + } + + if (auto constValue = + dyn_cast(elementValue.getDefiningOp())) { + if (auto intAttr = dyn_cast(constValue.getValue())) { + mlir::DenseIntElementsAttr denseVec = mlir::DenseIntElementsAttr::get( + mlir::cast(llvmTy), intAttr.getValue()); + rewriter.replaceOpWithNewOp( + op, denseVec.getType(), denseVec); + return mlir::success(); + } + + if (auto fpAttr = dyn_cast(constValue.getValue())) { + mlir::DenseFPElementsAttr denseVec = mlir::DenseFPElementsAttr::get( + mlir::cast(llvmTy), fpAttr.getValue()); + rewriter.replaceOpWithNewOp( + op, denseVec.getType(), denseVec); + return mlir::success(); + } + } + + mlir::Value indexValue = + rewriter.create(loc, rewriter.getI64Type(), 0); + mlir::Value oneElement = rewriter.create( + loc, poison, elementValue, indexValue); + SmallVector zeroValues(vecTy.getSize(), 0); + rewriter.replaceOpWithNewOp(op, oneElement, + poison, zeroValues); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMVecShuffleOpLowering::matchAndRewrite( cir::VecShuffleOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter 
&rewriter) const { diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 22d8a1e7c22e..2eda568c84bd 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -367,6 +367,16 @@ public: mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMVecSplatOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::VecSplatOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMVecShuffleOpLowering : public mlir::OpConversionPattern { public: diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp index e1814f216f6b..965c44c9461a 100644 --- a/clang/test/CIR/CodeGen/vector-ext.cpp +++ b/clang/test/CIR/CodeGen/vector-ext.cpp @@ -990,6 +990,7 @@ void foo14() { // OGCG: %[[TMP_B:.*]] = load <4 x float>, ptr %[[VEC_B]], align 16 // OGCG: %[[GE:.*]] = fcmp oge <4 x float> %[[TMP_A]], %[[TMP_B]] // OGCG: %[[RES:.*]] = sext <4 x i1> %[[GE]] to <4 x i32> +// OGCG: store <4 x i32> %[[RES]], ptr {{.*}}, align 16 void foo15() { vi4 a; @@ -1092,6 +1093,69 @@ void foo17() { // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16> +void foo18() { + vi4 a = {1, 2, 3, 4}; + vi4 shl = a << 3; + + uvi4 b = {1u, 2u, 3u, 4u}; + uvi4 shr = b >> 3u; +} + +// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] +// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["b", init] +// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["shr", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: 
%[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i +// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i> +// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i +// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr> +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr>, !cir.vector<4 x !u32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i> +// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr> + +// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHR_RES:.*]] = alloca <4 x 
i32>, i64 1, align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_A]], align 16 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_B]], align 16 +// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + +// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_A]], align 16 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_B]], align 16 +// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + void foo19() { vi4 a; vi4 b; diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp index 4f116faa7a1a..23e91724dc0f 100644 --- a/clang/test/CIR/CodeGen/vector.cpp +++ b/clang/test/CIR/CodeGen/vector.cpp @@ -1071,6 +1071,69 @@ void foo17() { // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16> +void foo18() { + vi4 a = {1, 2, 3, 4}; + vi4 shl = a << 3; + + uvi4 b = {1u, 2u, 3u, 4u}; + uvi4 shr = b >> 3u; +} + +// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] +// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["b", 
init] +// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["shr", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i +// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i> +// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i +// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr> +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr>, !cir.vector<4 x !u32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i> +// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr> + +// LLVM: 
%[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_A]], align 16 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_B]], align 16 +// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + +// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_A]], align 16 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_B]], align 16 +// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + void foo19() { vi4 a; vi4 b; diff --git a/clang/test/CIR/IR/vector.cir b/clang/test/CIR/IR/vector.cir index a455acf92ab6..f23f5de9692d 100644 --- a/clang/test/CIR/IR/vector.cir +++ b/clang/test/CIR/IR/vector.cir @@ -187,4 +187,37 @@ cir.func @vector_shuffle_dynamic_test() { // CHECK: cir.return // CHECK: } +cir.func @vector_splat_test() { + %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] + %1 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] + %2 = cir.const #cir.int<1> : !s32i + %3 = cir.const #cir.int<2> : !s32i + %4 = 
cir.const #cir.int<3> : !s32i + %5 = cir.const #cir.int<4> : !s32i + %6 = cir.vec.create(%2, %3, %4, %5 : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> + cir.store %6, %0 : !cir.vector<4 x !s32i>, !cir.ptr> + %7 = cir.load %0 : !cir.ptr>, !cir.vector<4 x !s32i> + %8 = cir.const #cir.int<3> : !s32i + %9 = cir.vec.splat %8 : !s32i, !cir.vector<4 x !s32i> + %10 = cir.shift(left, %7 : !cir.vector<4 x !s32i>, %9 : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> + cir.store %10, %1 : !cir.vector<4 x !s32i>, !cir.ptr> + cir.return +} + +// CHECK: cir.func @vector_splat_test() { +// CHECK-NEXT: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CHECK-NEXT: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] +// CHECK-NEXT: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CHECK-NEXT: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CHECK-NEXT: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// CHECK-NEXT: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i +// CHECK-NEXT: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> +// CHECK-NEXT: cir.store %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CHECK-NEXT: %[[TMP:.*]] = cir.load %[[VEC]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CHECK-NEXT: %[[SPLAT_VAL:.*]] = cir.const #cir.int<3> : !s32i +// CHECK-NEXT: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SPLAT_VAL]] : !s32i, !cir.vector<4 x !s32i> +// CHECK-NEXT: %[[SHL:.*]] = cir.shift(left, %[[TMP]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> +// CHECK-NEXT: cir.store %[[SHL]], %[[SHL_RES:.*]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CHECK-NEXT: cir.return + } From 621a7d0f66f3da27e687dd7dd832450334ee81da Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 11 Jun 2025 19:02:47 +0200 Subject: [PATCH 0019/1322] [flang] silence bogus error with BIND(C) variable in hermetic 
module (#143737) The global name semantic check was firing in a bogus way when BIND(C) variables are in a hermetic module. Do not raise the error if one of the symbols with the conflicting global name is a "hermetic variant" of the other. --- flang/lib/Semantics/check-declarations.cpp | 10 +++++++++ flang/test/Semantics/modfile76.F90 | 24 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 flang/test/Semantics/modfile76.F90 diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 46a5b970fdf0..f9d64485f140 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -2958,6 +2958,14 @@ static std::optional DefinesGlobalName(const Symbol &symbol) { return std::nullopt; } +static bool IsSameSymbolFromHermeticModule( + const Symbol &symbol, const Symbol &other) { + return symbol.name() == other.name() && symbol.owner().IsModule() && + other.owner().IsModule() && symbol.owner() != other.owner() && + symbol.owner().GetName() && + symbol.owner().GetName() == other.owner().GetName(); +} + // 19.2 p2 void CheckHelper::CheckGlobalName(const Symbol &symbol) { if (auto global{DefinesGlobalName(symbol)}) { @@ -2975,6 +2983,8 @@ void CheckHelper::CheckGlobalName(const Symbol &symbol) { (!IsExternalProcedureDefinition(symbol) || !IsExternalProcedureDefinition(other))) { // both are procedures/BLOCK DATA, not both definitions + } else if (IsSameSymbolFromHermeticModule(symbol, other)) { + // Both symbols are the same thing. 
} else if (symbol.has()) { Warn(common::LanguageFeature::BenignNameClash, symbol.name(), "Module '%s' conflicts with a global name"_port_en_US, diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90 new file mode 100644 index 000000000000..50ee9a088e11 --- /dev/null +++ b/flang/test/Semantics/modfile76.F90 @@ -0,0 +1,24 @@ +!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s +!RUN: %flang_fc1 -fsyntax-only %s + +! Tests that a BIND(C) variable in a module A captured in a hermetic module +! file USE'd in a module B is not creating bogus complaints about BIND(C) name +! conflict when both module A and B are later accessed. + +#if STEP == 1 +module modfile75a + integer, bind(c) :: x +end + +module modfile75b + use modfile75a ! capture hermetically +end + +#else +subroutine test + use modfile75a + use modfile75b + implicit none + print *, x +end subroutine +#endif From 7414d88b5f8af1bdf8da6bf2493b485ba5d079f2 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 18:13:56 +0100 Subject: [PATCH 0020/1322] Squelch an unused-function warning After removing some debug-intrinsic creation code, this function is now unused (and un-necessary) --- llvm/lib/IR/DIBuilder.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 1484c549dd58..c56dd7a1d382 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1069,10 +1069,6 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) { return MetadataAsValue::get(VMContext, ValueAsMetadata::get(V)); } -static Function *getDeclareIntrin(Module &M) { - return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare); -} - DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, From 3e24dadee0d7ecc5f95fe0760afb7abdeb9a2dc5 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Wed, 11 Jun 2025 10:24:19 -0700 Subject: [PATCH 
0021/1322] [Clang][Tooling][NFC] Use move to avoid copies of large objects (#143603) Static analysis flagged these cases in which we can use std::move and avoid copies of large objects. --- clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 44a270d5f7b3..b1495163ccc2 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -657,7 +657,7 @@ void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc, P1689ModuleInfo RequiredModule; RequiredModule.ModuleName = Path[0].getIdentifierInfo()->getName().str(); RequiredModule.Type = P1689ModuleInfo::ModuleType::NamedCXXModule; - MDC.RequiredStdCXXModules.push_back(RequiredModule); + MDC.RequiredStdCXXModules.push_back(std::move(RequiredModule)); return; } @@ -920,7 +920,7 @@ void ModuleDepCollectorPP::addAllSubmoduleDeps( void ModuleDepCollectorPP::addOneModuleDep(const Module *M, const ModuleID ID, ModuleDeps &MD) { - MD.ClangModuleDeps.push_back(ID); + MD.ClangModuleDeps.push_back(std::move(ID)); if (MD.IsInStableDirectories) MD.IsInStableDirectories = MDC.ModularDeps[M]->IsInStableDirectories; } From 66f533e7e34d6f6d0e293a67dd54be9e4c240ddd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 10:39:02 -0700 Subject: [PATCH 0022/1322] [IR] Fix warnings (#143752) This patch fixes: llvm/lib/IR/DIBuilder.cpp:1072:18: error: unused function 'getDeclareIntrin' [-Werror,-Wunused-function] llvm/include/llvm/IR/DIBuilder.h:51:15: error: private field 'DeclareFn' is not used [-Werror,-Wunused-private-field] llvm/include/llvm/IR/DIBuilder.h:52:15: error: private field 'ValueFn' is not used [-Werror,-Wunused-private-field] llvm/include/llvm/IR/DIBuilder.h:53:15: error: private field 'LabelFn' is not used 
[-Werror,-Wunused-private-field] llvm/include/llvm/IR/DIBuilder.h:54:15: error: private field 'AssignFn' is not used [-Werror,-Wunused-private-field] --- llvm/include/llvm/IR/DIBuilder.h | 6 +----- llvm/lib/IR/DIBuilder.cpp | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index ebfe41dd59af..43fca571ee6d 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -47,11 +47,7 @@ namespace llvm { Module &M; LLVMContext &VMContext; - DICompileUnit *CUNode; ///< The one compile unit created by this DIBuiler. - Function *DeclareFn; ///< llvm.dbg.declare - Function *ValueFn; ///< llvm.dbg.value - Function *LabelFn; ///< llvm.dbg.label - Function *AssignFn; ///< llvm.dbg.assign + DICompileUnit *CUNode; ///< The one compile unit created by this DIBuiler. SmallVector AllEnumTypes; /// Track the RetainTypes, since they can be updated later on. diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index c56dd7a1d382..fd8c2d7bb5cc 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -25,8 +25,7 @@ using namespace llvm; using namespace llvm::dwarf; DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU) - : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr), - ValueFn(nullptr), LabelFn(nullptr), AssignFn(nullptr), + : M(m), VMContext(M.getContext()), CUNode(CU), AllowUnresolvedNodes(AllowUnresolvedNodes) { if (CUNode) { if (const auto &ETs = CUNode->getEnumTypes()) From c2f0af514beb7618660cf8d145fa9e49fb78869c Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 11 Jun 2025 10:47:17 -0700 Subject: [PATCH 0023/1322] [GISelValueTracking] Add test case for G_PTRTOINT While we can only reason about the index/address, the G_PTRTOINT operation returns all representation bits, so we can't assume the remaining ones are all zeroes. 
This behaviour was clarified as part of the discussion in https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54. The LangRef semantics of ptrtoint being a full representation bitcast were documented in https://github.com/llvm/llvm-project/pull/139349. Prior to 77c8d214131e951e3d3a07b45a7436f54988d6f3 we were incorrectly assuming known zeroes beyond the index size even if the input was completely unknown. This commit adds a test case for G_PTRTOINT which was omitted from that change. See https://github.com/llvm/llvm-project/issues/139598 Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/139608 --- .../AMDGPU/GlobalISel/knownbits-ptrtoint.mir | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir new file mode 100644 index 000000000000..4073568fd421 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir @@ -0,0 +1,110 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -passes="print" %s -filetype=null 2>&1 | FileCheck %s +## Check that we don't incorrectly assume known zeroes for and extend of a truncated ptrtoint +## Test case for https://github.com/llvm/llvm-project/issues/139598 +--- +## We should see 128 unknown bits. +name: PtrToInt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToInt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? 
SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s128) = G_PTRTOINT %4(p8) +... +--- +## We should see 128 high zeroes followed by 128 unknown bits for extending ptrtoint. +name: PtrToIntExt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntExt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:128 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s256) = G_PTRTOINT %4(p8) +... +--- +## We should see 48 unknown bits for truncating ptrtoint. +name: PtrToIntTrunc +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntTrunc + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? 
SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s48) = G_PTRTOINT %4(p8) +... +--- +## This is the test for issue 139598: Truncating and then extending the +## G_PTRTOINT result was filling all bits above the index bitwidth with known +## zeroes even though the incoming value is completely unknown and G_PTRTOINT. +## is lowered to a bitwise copy. +## We should see all zero high bits with 48 unknown bits. +name: PtrToIntTruncExplicitExt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntTruncExplicitExt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %6:_ KnownBits:???????????????????????????????????????????????? 
SignBits:1 + ; CHECK-NEXT: %7:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s128) = G_PTRTOINT %4(p8) + %6:_(s48) = G_TRUNC %5(s128) + %7:_(s256) = G_ZEXT %6(s48) +... +--- +## Same test again but this time have the G_PTRTOINT do the truncation. +## We should see all zero high bits with 48 unknown bits. +name: PtrToIntTruncImplicitExt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntTruncImplicitExt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %6:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s48) = G_PTRTOINT %4(p8) + %6:_(s256) = G_ZEXT %5(s48) +... 
From bbe59e19b60b0efa8cc200fb3260fe572e188b26 Mon Sep 17 00:00:00 2001 From: Kewen12 Date: Wed, 11 Jun 2025 11:12:54 -0700 Subject: [PATCH 0024/1322] [OpenMP][Offload] Update the Logic for Configuring Auto Zero-Copy (#143638) Summary: Currently the Auto Zero-Copy is enabled by checking every initialized device to ensure that no dGPU is attached to an APU. However, an APU is designed to comprise a homogeneous set of GPUs, therefore, it should be sufficient to check any device for configuring Auto Zero-Copy. In this PR, it checks the first initialized device in the list. The changes in this PR are to clearly reflect the design and logic of enabling the feature to further improve readability. --- offload/libomptarget/PluginManager.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp index 93589960a426..c4d99dfa9f10 100644 --- a/offload/libomptarget/PluginManager.cpp +++ b/offload/libomptarget/PluginManager.cpp @@ -286,16 +286,16 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { } PM->RTLsMtx.unlock(); - bool UseAutoZeroCopy = Plugins.size() > 0; + bool UseAutoZeroCopy = false; auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor(); - for (const auto &Device : *ExclusiveDevicesAccessor) - UseAutoZeroCopy &= Device->useAutoZeroCopy(); + // APUs are homogeneous set of GPUs. Check the first device for + // configuring Auto Zero-Copy. + if (ExclusiveDevicesAccessor->size() > 0) { + auto &Device = *(*ExclusiveDevicesAccessor)[0]; + UseAutoZeroCopy = Device.useAutoZeroCopy(); + } - // Auto Zero-Copy can only be currently triggered when the system is an - // homogeneous APU architecture without attached discrete GPUs. - // If all devices suggest to use it, change requirement flags to trigger - // zero-copy behavior when mapping memory. 
if (UseAutoZeroCopy) addRequirements(OMPX_REQ_AUTO_ZERO_COPY); From fad1972d74aead159a5e91b068cbf736e83836b5 Mon Sep 17 00:00:00 2001 From: VISHAKH PRAKASH Date: Wed, 11 Jun 2025 23:43:01 +0530 Subject: [PATCH 0025/1322] [SPIRV] FIX print the symbolic operand for opcode for the operation OpSpecConstantOp (#135756) Current implementation outputs opcode is an immediate but spirv-tools requires that the name of the operation without "Op" is needed for the instruction OpSpecConstantOp that is if the opcode is OpBitcast the instruction must be `%1 = OpSpecConstantOp %6 Bitcast %17` instead of `%1 = OpBitcast %6 124 %17` [refer this commit for more info](https://github.com/KhronosGroup/SPIRV-Tools/commit/0f166be68d4b6624a10d6bf312679505d391ec22) --------- Co-authored-by: Dmitry Sidorov Co-authored-by: Ebin-McW --- .../SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp | 3 +- .../Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h | 5 ++ llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 2 +- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 90 +++++++++++++++++++ llvm/test/CodeGen/SPIRV/const-nested-vecs.ll | 4 +- .../fun-ptr-addrcast.ll | 2 +- .../opencl/basic/progvar_prog_scope_init.ll | 2 +- .../CodeGen/SPIRV/opt-gepoperator-of-gvar.ll | 2 +- .../pointers/PtrCast-in-OpSpecConstantOp.ll | 12 +-- .../CodeGen/SPIRV/pointers/global-ptrtoint.ll | 4 +- .../pointers/irtrans-added-int-const-32-64.ll | 2 +- 11 files changed, 112 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 342456757409..0ed97f5b41c5 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -68,7 +68,8 @@ getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category, Category != SPIRV::OperandCategory::FunctionControlOperand && Category != SPIRV::OperandCategory::MemorySemanticsOperand && Category != SPIRV::OperandCategory::MemoryOperandOperand && - 
Category != SPIRV::OperandCategory::KernelProfilingInfoOperand) + Category != SPIRV::OperandCategory::KernelProfilingInfoOperand && + Category != SPIRV::OperandCategory::SpecConstantOpOperandsOperand) return "UNKNOWN"; // Value that encodes many enum values (one bit per enum value). std::string Name; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index 083c7f8460bf..b8c467fef8e8 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -222,6 +222,11 @@ namespace CooperativeMatrixOperands { #include "SPIRVGenTables.inc" } // namespace CooperativeMatrixOperands +namespace SpecConstantOpOperands { +#define GET_SpecConstantOpOperands_DECL +#include "SPIRVGenTables.inc" +} // namespace SpecConstantOpOperands + struct ExtendedBuiltin { StringRef Name; InstructionSet::InstructionSet Set; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 338f6809a3e4..049ba0275f22 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -245,7 +245,7 @@ def OpSpecConstantComposite: Op<51, (outs ID:$res), (ins TYPE:$type, variable_op "$res = OpSpecConstantComposite $type">; def OpSpecConstantCompositeContinuedINTEL: Op<6092, (outs), (ins variable_ops), "OpSpecConstantCompositeContinuedINTEL">; -def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, i32imm:$c, ID:$o, variable_ops), +def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, SpecConstantOpOperands:$c, ID:$o, variable_ops), "$res = OpSpecConstantOp $t $c $o">; // 3.42.8 Memory Instructions diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index ca8a9a9997a8..f1aae42ea2be 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -172,6 +172,7 @@ def KernelProfilingInfoOperand : 
OperandCategory; def OpcodeOperand : OperandCategory; def CooperativeMatrixLayoutOperand : OperandCategory; def CooperativeMatrixOperandsOperand : OperandCategory; +def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; //===----------------------------------------------------------------------===// @@ -1755,6 +1756,95 @@ defm MatrixAAndBBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x40, defm MatrixCBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x80, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>; defm MatrixResultBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x100, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>; +//===----------------------------------------------------------------------===// +// Multiclass used to define SpecConstant Operands enum values and at the +// same time SymbolicOperand. +//===----------------------------------------------------------------------===// + +def SpecConstantOpOperands : GenericEnum, Operand { + let FilterClass = "SpecConstantOpOperands"; + let NameField = "Name"; + let ValueField = "Value"; + let PrintMethod = !strconcat("printSymbolicOperand"); +} + +class SpecConstantOpOperands value> { + string Name = name; + bits<32> Value = value; +} + +multiclass SpecConstantOpOperandsOperand value, list reqExtensions, list reqCapabilities> { + def : SpecConstantOpOperands; + defm : SymbolicOperandWithRequirements; +} + +// Conversion +defm SConvert : SpecConstantOpOperandsOperand<114, [], []>; +defm FConvert : SpecConstantOpOperandsOperand<115, [], []>; +defm ConvertFToS : SpecConstantOpOperandsOperand<110, [], [Kernel]>; +defm ConvertSToF : SpecConstantOpOperandsOperand<111, [], [Kernel]>; +defm ConvertFToU : SpecConstantOpOperandsOperand<109, [], [Kernel]>; +defm ConvertUToF : SpecConstantOpOperandsOperand<112, [], [Kernel]>; +defm UConvert : SpecConstantOpOperandsOperand<113, [], 
[Kernel]>; +defm ConvertPtrToU : SpecConstantOpOperandsOperand<117, [], [Kernel]>; +defm ConvertUToPtr : SpecConstantOpOperandsOperand<120, [], [Kernel]>; +defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; +defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; +defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; +defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; +// Arithmetic +defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; +defm Not : SpecConstantOpOperandsOperand<200, [], []>; +defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; +defm ISub : SpecConstantOpOperandsOperand<130, [], []>; +defm IMul : SpecConstantOpOperandsOperand<132, [], []>; +defm UDiv : SpecConstantOpOperandsOperand<134, [], []>; +defm SDiv : SpecConstantOpOperandsOperand<135, [], []>; +defm UMod : SpecConstantOpOperandsOperand<137, [], []>; +defm SRem : SpecConstantOpOperandsOperand<138, [], []>; +defm SMod : SpecConstantOpOperandsOperand<139, [], []>; +defm ShiftRightLogical : SpecConstantOpOperandsOperand<194, [], []>; +defm ShiftRightArithmetic : SpecConstantOpOperandsOperand<195, [], []>; +defm ShiftLeftLogical : SpecConstantOpOperandsOperand<196, [], []>; +defm BitwiseOr : SpecConstantOpOperandsOperand<197, [], []>; +defm BitwiseAnd : SpecConstantOpOperandsOperand<199, [], []>; +defm BitwiseXor : SpecConstantOpOperandsOperand<198, [], []>; +defm FNegate : SpecConstantOpOperandsOperand<127, [], [Kernel]>; +defm FAdd : SpecConstantOpOperandsOperand<129, [], [Kernel]>; +defm FSub : SpecConstantOpOperandsOperand<131, [], [Kernel]>; +defm FMul : SpecConstantOpOperandsOperand<133, [], [Kernel]>; +defm FDiv : SpecConstantOpOperandsOperand<136, [], [Kernel]>; +defm FRem : SpecConstantOpOperandsOperand<140, [], [Kernel]>; +defm FMod : SpecConstantOpOperandsOperand<141, [], [Kernel]>; +// Composite; +defm VectorShuffle : SpecConstantOpOperandsOperand<79, [], []>; +defm CompositeExtract : 
SpecConstantOpOperandsOperand<81, [], []>; +defm CompositeInsert : SpecConstantOpOperandsOperand<82, [], []>; +// Logical; +defm LogicalOr : SpecConstantOpOperandsOperand<166, [], []>; +defm LogicalAnd : SpecConstantOpOperandsOperand<167, [], []>; +defm LogicalNot : SpecConstantOpOperandsOperand<168, [], []>; +defm LogicalEqual : SpecConstantOpOperandsOperand<164, [], []>; +defm LogicalNotEqual : SpecConstantOpOperandsOperand<165, [], []>; +defm Select : SpecConstantOpOperandsOperand<169, [], []>; +// Comparison; +defm IEqual : SpecConstantOpOperandsOperand<170, [], []>; +defm INotEqual : SpecConstantOpOperandsOperand<171, [], []>; +defm ULessThan : SpecConstantOpOperandsOperand<176, [], []>; +defm SLessThan : SpecConstantOpOperandsOperand<177, [], []>; +defm UGreaterThan : SpecConstantOpOperandsOperand<172, [], []>; +defm SGreaterThan : SpecConstantOpOperandsOperand<173, [], []>; +defm ULessThanEqual : SpecConstantOpOperandsOperand<178, [], []>; +defm SLessThanEqual : SpecConstantOpOperandsOperand<179, [], []>; +defm UGreaterThanEqual : SpecConstantOpOperandsOperand<174, [], []>; +defm SGreaterThanEqual : SpecConstantOpOperandsOperand<175, [], []>; +// Memory +defm AccessChain : SpecConstantOpOperandsOperand<65, [], [Kernel]>; +defm InBoundsAccessChain : SpecConstantOpOperandsOperand<66, [], [Kernel]>; +defm PtrAccessChain : SpecConstantOpOperandsOperand<67, [], [Kernel]>; +defm InBoundsPtrAccessChain : SpecConstantOpOperandsOperand<70, [], [Kernel]>; +defm CooperativeMatrixLengthKHR : SpecConstantOpOperandsOperand<4460, [], []>; + //===----------------------------------------------------------------------===// // Multiclass used to define Matrix Multiply Accumulate Operands enum values and at the same time // SymbolicOperand entries with string mnemonics and capabilities. 
diff --git a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll index 9234106e5fcd..266b46e65f31 100644 --- a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll +++ b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll @@ -25,8 +25,8 @@ ; CHECK-SPIRV-DAG: %[[#IntZero:]] = OpConstantNull %[[#IntTy]] ; CHECK-SPIRV-DAG: %[[#LongZero:]] = OpConstantNull %[[#LongTy]] ; CHECK-SPIRV64-DAG: %[[#ConstLong2:]] = OpConstant %[[#LongTy]] 2 -; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]] -; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]] +; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]] +; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]] ; CHECK-SPIRV-DAG: %[[#PtrPtrCharTy:]] = OpTypePointer CrossWorkgroup %[[#PtrCharTy]] ; CHECK-SPIRV-DAG: %[[#AVar]] = OpVariable %[[#PtrArr2V2CharTy]] CrossWorkgroup %[[#Arr2V2Char]] ; CHECK-SPIRV-DAG: %[[#PVar]] = OpVariable %[[#PtrPtrCharTy]] CrossWorkgroup %[[#PvarInit]] diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll index 8edecc1329d0..e5736b88b63a 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll @@ -5,7 +5,7 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]] +; CHECK-COUNT-3: 
%[[#]] = OpSpecConstantOp %[[#]] PtrCastToGeneric %[[#]] ; CHECK-COUNT-3: OpPtrCastToGeneric @G1 = addrspace(1) constant { [3 x ptr addrspace(4)] } { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr @bar to ptr addrspace(4))] } diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll index 9d759a1cf47d..fbc83c7a1e04 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll @@ -10,7 +10,7 @@ ; CHECK-DAG: %[[#pt2:]] = OpTypePointer CrossWorkgroup %[[#arr2]] ; CHECK-DAG: %[[#pt3:]] = OpTypePointer CrossWorkgroup %[[#pt1]] ; CHECK-DAG: %[[#a_var]] = OpVariable %[[#pt2]] CrossWorkgroup -; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] 70 %[[#a_var]] +; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] InBoundsPtrAccessChain %[[#a_var]] ; CHECK-DAG: %[[#p_var]] = OpVariable %[[#pt3]] CrossWorkgroup %[[#const]] @var = addrspace(1) global i8 0, align 1 @g_var = addrspace(1) global i8 1, align 1 diff --git a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll index 5f9229f5a5bd..447dfa701b65 100644 --- a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll +++ b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll @@ -14,7 +14,7 @@ ; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]] ; CHECK-DAG: %[[#Var:]] = OpVariable %[[#PtrStruct]] CrossWorkgroup %[[#VarInit]] ; CHECK-DAG: %[[#Bytes:]] = OpVariable %[[#PtrChar]] CrossWorkgroup %[[#]] -; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] 70 %[[#Bytes]] %[[#C648]] +; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] InBoundsPtrAccessChain %[[#Bytes]] %[[#C648]] ; CHECK: OpFunction ; CHECK: %[[#]] = OpFunctionParameter %[[#]] diff --git 
a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll index 55d638f80cc5..ca7ca06fbdc8 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll @@ -23,20 +23,20 @@ ; CHECK-DAG: %[[WPtr:.*]] = OpTypePointer Workgroup %[[Int]] ; CHECK-DAG: %[[F]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]] -; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[F]] +; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[F]] ; CHECK-DAG: %[[B]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]] -; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[B]] +; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[B]] ; CHECK-DAG: %[[GenFB:.*]] = OpConstantComposite %[[Arr2]] %[[GenF]] %[[GenB]] ; CHECK-DAG: %[[GenBF:.*]] = OpConstantComposite %[[Arr2]] %[[GenB]] %[[GenF]] ; CHECK-DAG: %[[CG1:.*]] = OpConstantComposite %[[Struct2]] %[[GenFB]] ; CHECK-DAG: %[[CG2:.*]] = OpConstantComposite %[[Struct2]] %[[GenBF]] ; CHECK-DAG: %[[X]] = OpVariable %[[WPtr]] Workgroup %[[#]] -; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[X]] -; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenX]] +; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[X]] +; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenX]] ; CHECK-DAG: %[[Y]] = OpVariable %[[WPtr]] Workgroup %[[#]] -; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[Y]] -; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenY]] +; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[Y]] +; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenY]] ; CHECK-DAG: %[[CWXY:.*]] = OpConstantComposite %[[Arr1]] %[[CWX]] %[[CWY]] ; CHECK-DAG: %[[CWYX:.*]] = 
OpConstantComposite %[[Arr1]] %[[CWY]] %[[CWX]] ; CHECK-DAG: %[[CG3:.*]] = OpConstantComposite %[[Struct1]] %[[CWXY]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll index 16c20f9067e6..0fd2f622dc84 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll @@ -11,9 +11,9 @@ ; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyI64]] %[[TyI64]] ; CHECK-DAG: %[[Const128:.*]] = OpConstant %[[TyI64]] 128 ; CHECK-DAG: %[[GlobalValue]] = OpVariable -; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] 117 %[[GlobalValue]] +; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] ConvertPtrToU %[[GlobalValue]] ; TODO: The following bitcast line looks unneeded and we may expect it to be removed in future -; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] 124 %[[PtrToInt]] +; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] Bitcast %[[PtrToInt]] ; CHECK-DAG: %[[ConstComposite:.*]] = OpConstantComposite %[[TyStruct]] %[[Const128]] %[[UseGlobalValue]] ; CHECK-DAG: %[[TyPtrStruct:.*]] = OpTypePointer CrossWorkgroup %[[TyStruct]] ; CHECK: OpVariable %[[TyPtrStruct]] CrossWorkgroup %[[ConstComposite]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll index c2738229aa4d..f5abcd38d040 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll @@ -12,7 +12,7 @@ ; CHECK-SPIRV64-DAG: %[[#IntTy:]] = OpTypeInt 64 0 ; CHECK-SPIRV32-DAG: %[[#IntTy:]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[#Const2:]] = OpConstant %[[#IntTy]] 2 -; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] 70 %[[#]] %[[#]] %[[#Const2]] +; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] InBoundsPtrAccessChain %[[#]] %[[#]] %[[#Const2]] ; CHECK-SPIRV: 
OpFunction @a_var = addrspace(1) global [2 x i8] [i8 1, i8 1] From 42c82fcc29c1c8e19b2265495a5d8f59fb5ea764 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 20:19:26 +0200 Subject: [PATCH 0026/1322] [libc++] Upgrade to GCC 15 (#138293) --- .github/workflows/libcxx-build-and-test.yaml | 8 ++++---- libcxx/docs/index.rst | 2 +- libcxx/src/experimental/time_zone.cpp | 9 +++++++++ .../alg.contains/ranges.contains.pass.cpp | 4 ++-- .../equality_comparable.compile.pass.cpp | 6 ++++++ .../equality_comparable_with.compile.pass.cpp | 15 +++++++++++++++ .../totally_ordered.compile.pass.cpp | 3 +++ .../totally_ordered_with.compile.pass.cpp | 10 ++++++++++ .../new.delete.array/new.size.except.pass.cpp | 3 +++ .../new.delete/new.delete.array/new.size.pass.cpp | 3 +++ .../new.size_align.except.pass.cpp | 3 +++ .../new.delete.array/new.size_align.pass.cpp | 3 +++ .../new.delete.single/new.size.except.pass.cpp | 3 +++ .../new.delete.single/new.size.pass.cpp | 3 +++ .../new.size_align.except.pass.cpp | 3 +++ .../new.delete.single/new.size_align.pass.cpp | 3 +++ .../rand.dist.samp.discrete/ctor_func.pass.cpp | 3 +++ .../param_ctor_func.pass.cpp | 3 +++ .../range.lazy.split/general.pass.cpp | 12 ++++++++++++ .../expected.expected/monadic/transform.pass.cpp | 4 ++-- .../monadic/transform_error.pass.cpp | 4 ++-- .../monadic/transform_error.pass.cpp | 4 ++-- .../formatter.char_array.pass.cpp | 2 +- .../meta/meta.rel/is_virtual_base_of.pass.cpp | 7 +++++++ ...le.pass.cpp => dependent_return_type.pass.cpp} | 4 ++++ .../meta.unary.prop/is_implicit_lifetime.pass.cpp | 2 +- .../make_optional_explicit.pass.cpp | 3 +++ ...ke_optional_explicit_initializer_list.pass.cpp | 3 +++ .../tuple.tuple/tuple.cnstr/PR31384.pass.cpp | 2 +- .../catch_member_function_pointer_02.pass.cpp | 2 +- 30 files changed, 119 insertions(+), 17 deletions(-) rename libcxx/test/std/utilities/meta/meta.unary/{dependent_return_type.compile.pass.cpp => dependent_return_type.pass.cpp} (94%) diff --git 
a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 80f2432b78de..f0bdf6c0b589 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -52,8 +52,8 @@ jobs: cxx: [ 'clang++-21' ] include: - config: 'generic-gcc' - cc: 'gcc-14' - cxx: 'g++-14' + cc: 'gcc-15' + cxx: 'g++-15' steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: ${{ matrix.config }}.${{ matrix.cxx }} @@ -92,8 +92,8 @@ jobs: cxx: [ 'clang++-21' ] include: - config: 'generic-gcc-cxx11' - cc: 'gcc-14' - cxx: 'g++-14' + cc: 'gcc-15' + cxx: 'g++-15' - config: 'generic-cxx26' cc: 'clang-20' cxx: 'clang++-20' diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 9c957e9d20cb..ae9cc87c797f 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -135,7 +135,7 @@ Compiler Versions Restrictions Support policy Clang 19, 20, 21-git latest two stable releases per `LLVM's release page `_ and the development version AppleClang 15 latest stable release per `Xcode's release page `_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page `_ -GCC 14 In C++11 or later only latest stable release per `GCC's release page `_ +GCC 15 In C++11 or later only latest stable release per `GCC's release page `_ ============ =================== ========================== ===================== Libc++ also supports common platforms and architectures: diff --git a/libcxx/src/experimental/time_zone.cpp b/libcxx/src/experimental/time_zone.cpp index 289164ab1203..a735800b6031 100644 --- a/libcxx/src/experimental/time_zone.cpp +++ b/libcxx/src/experimental/time_zone.cpp @@ -29,6 +29,15 @@ // These quirks often use a 12h interval; this is the scan interval of zdump, // which implies there are no sys_info objects with a duration of less than 12h. 
+// Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120502 + +#include <__config> + +// TODO(LLVM 23): When upgrading to GCC 16 this can be removed +#ifdef _LIBCPP_COMPILER_GCC +# pragma GCC optimize("-O0") +#endif + #include #include #include diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp index 08d8e119a4d2..1e89cd272e64 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp @@ -195,7 +195,7 @@ constexpr bool test() { std::string a[] = {str1, str1, str, str1, str1}; auto whole = std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5))); - bool ret = std::ranges::contains(whole.begin(), whole.end(), "hello world", [&](const std::string i) { + bool ret = std::ranges::contains(whole.begin(), whole.end(), +"hello world", [&](const std::string i) { ++projection_count; return i; }); @@ -207,7 +207,7 @@ constexpr bool test() { std::string a[] = {str1, str1, str, str1, str1}; auto whole = std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5))); - bool ret = std::ranges::contains(whole, "hello world", [&](const std::string i) { + bool ret = std::ranges::contains(whole, +"hello world", [&](const std::string i) { ++projection_count; return i; }); diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp index ca0f40eb77d4..0531c0e096a1 100644 --- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp +++ 
b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp @@ -26,6 +26,7 @@ #include #include "compare_types.h" +#include "test_macros.h" namespace fundamentals { static_assert(std::equality_comparable); @@ -43,7 +44,12 @@ static_assert(std::equality_comparable); static_assert(std::equality_comparable); static_assert(std::equality_comparable); static_assert(std::equality_comparable); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. +#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(std::equality_comparable); +#else +static_assert(!std::equality_comparable); +#endif static_assert(std::equality_comparable); static_assert(std::equality_comparable); static_assert(std::equality_comparable); diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp index 0afbe582ba89..2f8d7862c0f4 100644 --- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp @@ -107,7 +107,12 @@ static_assert(!check_equality_comparable_with < int, int (S::*)() const volatile&& noexcept > ()); static_assert(check_equality_comparable_with()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. 
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_equality_comparable_with()); +#else +static_assert(!check_equality_comparable_with()); +#endif static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); @@ -148,7 +153,12 @@ static_assert( static_assert(!check_equality_comparable_with < int*, int (S::*)() const volatile&& noexcept > ()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. +#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_equality_comparable_with()); +#else +static_assert(!check_equality_comparable_with()); +#endif static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); @@ -942,7 +952,12 @@ static_assert( static_assert(!check_equality_comparable_with()); static_assert(check_equality_comparable_with()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. 
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_equality_comparable_with()); +#else +static_assert(!check_equality_comparable_with()); +#endif static_assert(check_equality_comparable_with()); static_assert(check_equality_comparable_with()); static_assert(check_equality_comparable_with()); diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp index 6f8324eaf764..5959f70cf396 100644 --- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp @@ -55,7 +55,10 @@ static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); +// Array comparisons are ill-formed in C++26 +#if TEST_STD_VER <= 23 static_assert(models_totally_ordered()); +#endif static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp index dffc33265aeb..398ef445baf9 100644 --- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp @@ -89,7 +89,12 @@ static_assert(!check_totally_ordered_with()) static_assert(!check_totally_ordered_with < int, int (S::*)() const volatile&& noexcept > ()); static_assert(check_totally_ordered_with()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. 
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_totally_ordered_with()); +#else +static_assert(!check_totally_ordered_with()); +#endif static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); @@ -117,7 +122,12 @@ static_assert(!check_totally_ordered_with < int*, int (S::*)() volatile&& noexce static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with < int*, int (S::*)() const volatile&& noexcept > ()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. +#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_totally_ordered_with()); +#else +static_assert(!check_totally_ordered_with()); +#endif static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp index 6a2b098c1b57..9ee32b841783 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp index 437d06430773..4fdcc3b535a8 100644 --- 
a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp @@ -11,6 +11,9 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp index 4e34ebcb46c7..4dfaf7a30d7a 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. // XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp index c9b59ecaff39..a1b8466340a2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp @@ -13,6 +13,9 @@ // asan and msan will not call the new handler. 
// UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. // XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp index 6a515555e6db..346e881d016b 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp index 729ef3ec46b0..0013dd3d0cbc 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp @@ -11,6 +11,9 @@ // asan and msan will not call the new handler. 
// UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp index 7694314c87bf..fbeb880c83d8 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. // XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp index 5d321f08282b..59ecbe205513 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp @@ -13,6 +13,9 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
// XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp index c3a88af92d36..c05a9434175a 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp @@ -15,6 +15,9 @@ // discrete_distribution(size_t nw, double xmin, double xmax, // UnaryOperation fw); +// There is a bogus diagnostic about a too large allocation +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp index 7ef936b7fc35..206bf5a0eb8a 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp @@ -15,6 +15,9 @@ // param_type(size_t nw, double xmin, double xmax, // UnaryOperation fw); +// There is a bogus diagnostic about a too large allocation +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp index f4e87bb47399..521c0b1610bc 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp @@ -312,7 +312,10 @@ constexpr bool main_test() { // Leading separator. { std::array expected = {""sv, "abc"sv, "def"sv}; +// FIXME: Why does GCC complain here? 
+#ifndef TEST_COMPILER_GCC test_one(" abc def"sv, short_sep, expected); +#endif test_one("12abc12def"sv, long_sep, expected); } @@ -326,7 +329,10 @@ constexpr bool main_test() { // Input consisting of a single separator. { std::array expected = {""sv, ""sv}; +// FIXME: Why does GCC complain here? +#ifndef TEST_COMPILER_GCC test_one(" "sv, short_sep, expected); +#endif test_one("12"sv, long_sep, expected); } @@ -354,7 +360,10 @@ constexpr bool main_test() { // Separators after every character. { std::array expected = {""sv, "a"sv, "b"sv, "c"sv, ""sv}; +// FIXME: Why does GCC complain here? +#ifndef TEST_COMPILER_GCC test_one(" a b c "sv, short_sep, expected); +#endif test_one("12a12b12c12"sv, long_sep, expected); } @@ -383,7 +392,10 @@ constexpr bool main_test() { // Terminating null as a separator. { std::array expected = {"abc"sv, "def"sv}; +// FIXME: Why does GCC complain here? +#ifndef TEST_COMPILER_GCC test_one("abc\0def"sv, '\0', expected); +#endif test_one("abc\0\0def"sv, "\0\0"sv, expected); } diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp index cbd54d623c0f..97c1e4a40f35 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333 -// XFAIL: gcc-14 +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995 +// XFAIL: gcc-14, gcc-15 // diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp index a19e17b01f6a..9570b2faac69 100644 --- 
a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333. -// XFAIL: gcc-14 +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995. +// XFAIL: gcc-14, gcc-15 // diff --git a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp index f6d3011d1ea9..2ec15b51d11e 100644 --- a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333 -// XFAIL: gcc-14 +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995 +// XFAIL: gcc-14, gcc-15 // diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp index bc056db9e254..8c4f3000ec1e 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // TODO FMT __builtin_memcpy isn't constexpr in GCC -// UNSUPPORTED: gcc-14 +// UNSUPPORTED: gcc-14, gcc-15 // diff --git 
a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index f443d2030961..47c95c64a085 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -18,6 +18,8 @@ #include #include +#include "test_macros.h" + template void test() { // Test the type of the variables @@ -98,8 +100,13 @@ int main(int, char**) { // Test with virtual inheritance { +#ifdef TEST_COMPILER_GCC // FIXME: Is this a GCC or Clang bug? Or is the standards wording ambiguous? + test(); + test(); +#else test(); test(); +#endif test(); test(); test(); diff --git a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp similarity index 94% rename from libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp rename to libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp index 935a6e3db001..37d66831c7ce 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp @@ -168,3 +168,7 @@ void instantiate() { void_t(); #endif } + +// This is not a .compile.pass.cpp because we want to ensure that GCC doesn't complain about incorrect builtins usage, +// which only happens during CodeGen. 
+int main(int, char**) { return 0; } diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp index 681ad13a07df..afd76e65060e 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17 +// UNSUPPORTED: clang-18, clang-19, gcc-14, gcc-15, apple-clang-15, apple-clang-16, apple-clang-17 // diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp index e7931e07e31d..23f131d2fc49 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp @@ -12,6 +12,9 @@ // template // constexpr optional make_optional(Args&&... args); +// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 +// XFAIL: gcc-15 + #include #include #include diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp index 80371d633371..5ddb229ad926 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp @@ -12,6 +12,9 @@ // template // constexpr optional make_optional(initializer_list il, Args&&... 
args); +// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 +// XFAIL: gcc-15 + #include #include #include diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp index e6812e9a3a30..ae5984c15530 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03 // FIXME: Why does this start to fail with GCC 14? -// XFAIL: !(c++11 || c++14) && gcc-14 +// XFAIL: !(c++11 || c++14) && (gcc-14 || gcc-15) // See https://llvm.org/PR31384. diff --git a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp index 5d702031ce35..ec400713620c 100644 --- a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp +++ b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp @@ -12,7 +12,7 @@ // GCC supports noexcept function types but this test still fails. // This is likely a bug in their implementation. Investigation needed. -// XFAIL: gcc-14 +// XFAIL: gcc-14, gcc-15 #include From 806333063ff9a09ca001dcd77d4d5d6f0b9ecd74 Mon Sep 17 00:00:00 2001 From: Jesse Huang Date: Thu, 12 Jun 2025 02:24:10 +0800 Subject: [PATCH 0027/1322] [RISCV] Guard the alternative static chain register use on ILP32E/LP64E (#142715) Asserts the use of t3(x28) as the static chain register when branch control flow protection is enabled with ILP32E/LP64E, because such register is not present within the ABI. 
--- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 24 ++++++++++++++-------- llvm/test/CodeGen/RISCV/nest-register.ll | 3 +++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index e0d1fb2facc8..cb6117eb0917 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -333,15 +333,23 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, unsigned XLen = Subtarget.getXLen(); MVT XLenVT = Subtarget.getXLenVT(); - // Static chain parameter must not be passed in normal argument registers, - // so we assign t2/t3 for it as done in GCC's __builtin_call_with_static_chain - bool HasCFBranch = - Subtarget.hasStdExtZicfilp() && - MF.getFunction().getParent()->getModuleFlag("cf-protection-branch"); - // Normal: t2, Branch control flow protection: t3 - const auto StaticChainReg = HasCFBranch ? RISCV::X28 : RISCV::X7; - if (ArgFlags.isNest()) { + // Static chain parameter must not be passed in normal argument registers, + // so we assign t2/t3 for it as done in GCC's + // __builtin_call_with_static_chain + bool HasCFBranch = + Subtarget.hasStdExtZicfilp() && + MF.getFunction().getParent()->getModuleFlag("cf-protection-branch"); + + // Normal: t2, Branch control flow protection: t3 + const auto StaticChainReg = HasCFBranch ? 
RISCV::X28 : RISCV::X7; + + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (HasCFBranch && + (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E)) + reportFatalUsageError( + "Nested functions with control flow protection are not " + "usable with ILP32E or LP64E ABI."); if (MCRegister Reg = State.AllocateReg(StaticChainReg)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; diff --git a/llvm/test/CodeGen/RISCV/nest-register.ll b/llvm/test/CodeGen/RISCV/nest-register.ll index 9f8e4e1a2d8d..6e892e05c429 100644 --- a/llvm/test/CodeGen/RISCV/nest-register.ll +++ b/llvm/test/CodeGen/RISCV/nest-register.ll @@ -5,6 +5,8 @@ ; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-ZICFILP %s +; RUN: not llc -mtriple=riscv64 -target-abi=lp64e -mattr=+experimental-zicfilp \ +; RUN: -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=LP64E-ZICFILP %s ; Tests that the 'nest' parameter attribute causes the relevant parameter to be ; passed in the right register. @@ -63,6 +65,7 @@ define ptr @nest_caller(ptr %arg) nounwind { ret ptr %result } +; LP64E-ZICFILP: LLVM ERROR: Nested functions with control flow protection are not usable with ILP32E or LP64E ABI. !llvm.module.flags = !{!0} !0 = !{i32 8, !"cf-protection-branch", i32 1} From 7a0c9f607a26b77a7e584fd6734f03b7ee40ca95 Mon Sep 17 00:00:00 2001 From: Tony Varghese Date: Wed, 11 Jun 2025 23:56:15 +0530 Subject: [PATCH 0028/1322] [NFC][PowerPC] Pre-commit test case for exploitation of xxeval for the pattern ternary(A,X,or(B,C)) (#143693) Pre-commit test case for exploitation of `xxeval` for ternary operations of the pattern `ternary(A,X,or(B,C))`. Exploitation of `xxeval` to be added later. 
Co-authored-by: Tony Varghese --- .../CodeGen/PowerPC/xxeval-vselect-x-or.ll | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll new file mode 100644 index 000000000000..1ad7e95e3682 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll @@ -0,0 +1,268 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector selection instructions when ternary operators are used. + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, and(B, C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_and_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_and_BC_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, and(B, C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_and_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_and_BC_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; 
CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, B, or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_B_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, B, or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_B_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %or + ret <2 x i64> %res +} + + +; Function to test ternary(A, C, or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_C_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %C, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, C, or(B, C)) 
for <2 x i64> +define <2 x i64> @ternary_A_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_C_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %C, <2 x i64> %or + ret <2 x i64> %res +} + + +; Function to test ternary(A, eqv(B,C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_eqv_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector eqv operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %eqv, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, eqv(B,C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_eqv_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, <i64 -1, i64 -1> ; Vector eqv operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %eqv, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, not(C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_not_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) 
{ +; CHECK-LABEL: ternary_A_not_C_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, not(C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_not_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_C_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %C, <i64 -1, i64 -1> ; Vector not operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, not(B), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_not_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_not_B_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %B, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector not operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, not(B), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_not_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_B_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; 
CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %B, <i64 -1, i64 -1> ; Vector not operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, nand(B,C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_nand_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_nand_BC_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, <i32 -1, i32 -1, i32 -1, i32 -1> ; Vector nand operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, nand(B,C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_nand_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nand_BC_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, <i64 -1, i64 -1> ; Vector nand operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %or + ret <2 x i64> %res +} From 8d7da9a2a40302af25ee70841a4b549f4ed5ee8a Mon Sep 17 00:00:00 2001 From: Yifei Xu Date: Wed, 11 Jun 2025 13:33:23 -0500 Subject: [PATCH 0029/1322] Update BUILD.bazel Add missing dependency after https://github.com/llvm/llvm-project/pull/142916. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index c1d63de04b8f..f6a7cd7dea85 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6882,6 +6882,7 @@ cc_library( ":SPIRVDialect", ":Support", "//llvm:config", + "//llvm:Support", ], ) From 773d357b9882fe0e30ffddee5ac1fbe2254fac05 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 20:39:45 +0200 Subject: [PATCH 0030/1322] [libc++] Simplify the implementation of __next_prime a bit (#143512) --- libcxx/src/hash.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp index 41c4eb480a5f..50d8cf9f9f53 100644 --- a/libcxx/src/hash.cpp +++ b/libcxx/src/hash.cpp @@ -9,7 +9,6 @@ #include <__hash_table> #include #include -#include _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wtautological-constant-out-of-range-compare") @@ -52,16 +51,15 @@ const unsigned indices[] = { // are fewer potential primes to search, and fewer potential primes to divide // against. 
-template -inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 4, void>::type __check_for_overflow(size_t N) { - if (N > 0xFFFFFFFB) - std::__throw_overflow_error("__next_prime overflow"); -} - -template -inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 8, void>::type __check_for_overflow(size_t N) { - if (N > 0xFFFFFFFFFFFFFFC5ull) - std::__throw_overflow_error("__next_prime overflow"); +inline void __check_for_overflow(size_t N) { + if constexpr (sizeof(size_t) == 4) { + if (N > 0xFFFFFFFB) + std::__throw_overflow_error("__next_prime overflow"); + } else { + static_assert(sizeof(size_t) == 8); + if (N > 0xFFFFFFFFFFFFFFC5ull) + std::__throw_overflow_error("__next_prime overflow"); + } } size_t __next_prime(size_t n) { From 8dc63ca59003a4b72217221c1c801237614c9d7d Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 11:47:09 -0700 Subject: [PATCH 0031/1322] Make clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c write output file to temp dir --- clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c index 5d65fdafaa25..d761e12e8392 100644 --- a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c +++ b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c @@ -57,7 +57,7 @@ // RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_VIA_XCLANG // However, sve2 is actually enabled in clang but disabled for MC. 
-// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s \ +// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s -o %t \ // RUN: -Xclang -target-feature -Xclang -sve \ // RUN: -Xclang -verify -Xclang -verify-ignore-unexpected=note From 0c62571d9f02f7d5c1a649b5b20fdf5b0f6bb41c Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 20:57:07 +0200 Subject: [PATCH 0032/1322] [libc++] Remove static_assert from hash.cpp that fires unconditionally --- libcxx/src/hash.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp index 50d8cf9f9f53..e1e6d2b4c2bd 100644 --- a/libcxx/src/hash.cpp +++ b/libcxx/src/hash.cpp @@ -56,7 +56,6 @@ inline void __check_for_overflow(size_t N) { if (N > 0xFFFFFFFB) std::__throw_overflow_error("__next_prime overflow"); } else { - static_assert(sizeof(size_t) == 8); if (N > 0xFFFFFFFFFFFFFFC5ull) std::__throw_overflow_error("__next_prime overflow"); } From 02b6849cf1feb425885bf6f5ee505d5cd4a824d7 Mon Sep 17 00:00:00 2001 From: Abhinav Gaba Date: Wed, 11 Jun 2025 12:03:55 -0700 Subject: [PATCH 0033/1322] [Clang][OpenMP] Fix mapping of arrays of structs with members with mappers (#142511) This builds upon #101101 from @jyu2-git, which used compiler-generated mappers when mapping an array-section of structs with members that have user-defined default mappers. Now we do the same when mapping arrays of structs. 
--- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/SemaOpenMP.cpp | 38 ++- ...of_structs_with_nested_mapper_ast_dump.cpp | 34 ++ ..._of_structs_with_nested_mapper_codegen.cpp | 323 ++++++++++++++++++ ...f_structs_with_nested_mapper_ast_dump.cpp} | 0 ...of_structs_with_nested_mapper_codegen.cpp} | 0 ...re_mapper_nested_default_mappers_array.cpp | 6 +- 7 files changed, 388 insertions(+), 16 deletions(-) create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp rename clang/test/OpenMP/{target_map_nest_defalut_mapper_ast_dump.cpp => target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp} (100%) rename clang/test/OpenMP/{target_map_nest_defalut_mapper_codegen.cpp => target_map_array_section_of_structs_with_nested_mapper_codegen.cpp} (100%) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b5e6cf088a4b..8043ab48f0b4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1102,6 +1102,9 @@ OpenMP Support - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have an argument larger than what can fit within a 64-bit integer. - Added support for private variable reduction. +- Fixed mapping of arrays of structs containing nested structs with user defined + mappers, by using compiler-generated default mappers for the outer structs for + such maps. Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index a3395ac157d9..2cbe79c5c07c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -22057,20 +22057,34 @@ static void checkMappableExpressionList( Type.getCanonicalType(), UnresolvedMapper); if (ER.isInvalid()) continue; - if (!ER.get() && isa(VE)) { - // Create implicit mapper as needed. 
- QualType BaseType = VE->getType().getCanonicalType(); - if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) { - const auto *OASE = cast(VE->IgnoreParenImpCasts()); - QualType BType = ArraySectionExpr::getBaseOriginalType(OASE->getBase()); - QualType ElemType; - if (const auto *ATy = BType->getAsArrayTypeUnsafe()) - ElemType = ATy->getElementType(); - else - ElemType = BType->getPointeeType(); + + // If no user-defined mapper is found, we need to create an implicit one for + // arrays/array-sections on structs that have members that have + // user-defined mappers. This is needed to ensure that the mapper for the + // member is invoked when mapping each element of the array/array-section. + if (!ER.get()) { + QualType BaseType; + + if (isa(VE)) { + BaseType = VE->getType().getCanonicalType(); + if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) { + const auto *OASE = cast(VE->IgnoreParenImpCasts()); + QualType BType = + ArraySectionExpr::getBaseOriginalType(OASE->getBase()); + QualType ElemType; + if (const auto *ATy = BType->getAsArrayTypeUnsafe()) + ElemType = ATy->getElementType(); + else + ElemType = BType->getPointeeType(); + BaseType = ElemType.getCanonicalType(); + } + } else if (VE->getType()->isArrayType()) { + const ArrayType *AT = VE->getType()->getAsArrayTypeUnsafe(); + const QualType ElemType = AT->getElementType(); BaseType = ElemType.getCanonicalType(); } - if (BaseType->getAsRecordDecl() && + + if (!BaseType.isNull() && BaseType->getAsRecordDecl() && isImplicitMapperNeeded(SemaRef, DSAS, BaseType, VE)) { ER = buildImplicitMapper(SemaRef, BaseType, DSAS); } diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp new file mode 100644 index 000000000000..a5847709d3e7 --- /dev/null +++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp @@ -0,0 +1,34 @@ +//RUN: %clang_cc1 -triple 
x86_64-pc-linux-gnu -fopenmp -ast-dump %s | FileCheck %s --check-prefix=DUM + +typedef struct { + int a; +} C; +#pragma omp declare mapper(C s) map(to : s.a) + +typedef struct { + int e; + C f; + int h; +} D; + +void foo() { + D sa[10]; + sa[1].e = 111; + sa[1].f.a = 222; + +#pragma omp target map(tofrom : sa) + { + sa[0].e = 333; + sa[1].f.a = 444; + } +} + +// DUM: -OMPDeclareMapperDecl{{.*}}<> +// DUM-NEXT: |-OMPMapClause {{.*}}<> +// DUM-NEXT: | |-MemberExpr {{.*}} 'int' lvalue .e +// DUM-NEXT: | | `-DeclRefExpr {{.*}}<> 'D' lvalue Var {{.*}} '_s' 'D' +// DUM-NEXT: | |-MemberExpr {{.*}} 'C' lvalue .f {{.*}} +// DUM-NEXT: | | `-DeclRefExpr {{.*}}<> 'D' lvalue Var {{.*}} '_s' 'D' +// DUM-NEXT: | `-MemberExpr {{.*}} 'int' lvalue .h {{.*}} +// DUM-NEXT: | `-DeclRefExpr {{.*}}<> 'D' lvalue Var {{.*}} '_s' 'D' +// DUM-NEXT: `-VarDecl {{.*}} col:1 implicit used _s 'D' diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp new file mode 100644 index 000000000000..5df1e958ad55 --- /dev/null +++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp @@ -0,0 +1,323 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex "\.offload_.*" +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - | FileCheck %s + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +typedef struct { + int a; +} C; +#pragma omp declare mapper(C s) map(to : s.a) + +typedef struct { + int e; + C f; + int h; +} D; + +void foo() { + D sa[10]; + sa[1].e = 111; + sa[1].f.a = 222; + +#pragma omp target map(tofrom : sa) + { + sa[1].e = 333; + sa[1].f.a = 444; + } +} +#endif +//. +// CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 120] +// CHECK: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +//. +// CHECK-LABEL: define {{[^@]+}}@_Z3foov +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SA:%.*]] = alloca [10 x %struct.D], align 4 +// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1 +// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0 +// CHECK-NEXT: store i32 111, ptr [[E]], align 4 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 +// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 +// CHECK-NEXT: store i32 222, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[SA]], ptr [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[SA]], ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-NEXT: store ptr @.omp_mapper._ZTS1D.default, ptr [[TMP2]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-NEXT: store ptr [[DOTOFFLOAD_MAPPERS]], ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-NEXT: store 
i64 0, ptr [[TMP13]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26.region_id, ptr [[KERNEL_ARGS]]) +// CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CHECK-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK: omp_offload.failed: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26(ptr [[SA]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK: omp_offload.cont: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26 +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(120) [[SA:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SA_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[SA]], ptr [[SA_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SA_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1 +// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds 
nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0 +// CHECK-NEXT: store i32 333, ptr [[E]], align 4 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 +// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 +// CHECK-NEXT: store i32 444, ptr [[A]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1D.default +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 12 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr [[STRUCT_D:%.*]], ptr [[TMP2]], i64 [[TMP6]] +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP4]], 16 +// CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0 +// CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]] +// CHECK-NEXT: [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]] +// CHECK-NEXT: [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0 +// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]] +// CHECK-NEXT: br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]] +// CHECK: .omp.array..init: +// CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 12 +// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_ARRAYMAP_HEAD]] +// CHECK: 
omp.arraymap.head: +// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]] +// CHECK: omp.arraymap.body: +// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END20:%.*]] ] +// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 1 +// CHECK-NEXT: [[H:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 2 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[H]], i32 1 +// CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP18]] to i64 +// CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[E]] to i64 +// CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[TMP19]], [[TMP20]] +// CHECK-NEXT: [[TMP22:%.*]] = sdiv exact i64 [[TMP21]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CHECK-NEXT: [[TMP23:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]]) +// CHECK-NEXT: [[TMP24:%.*]] = shl i64 [[TMP23]], 48 +// CHECK-NEXT: [[TMP25:%.*]] = add nuw i64 0, [[TMP24]] +// CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +// CHECK-NEXT: br i1 [[TMP27]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]] +// CHECK: omp.type.alloc: +// CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP25]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END:%.*]] +// CHECK: omp.type.alloc.else: +// CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP26]], 1 +// CHECK-NEXT: br i1 [[TMP29]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]] +// CHECK: omp.type.to: +// CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP25]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.to.else: +// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 
[[TMP26]], 2 +// CHECK-NEXT: br i1 [[TMP31]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]] +// CHECK: omp.type.from: +// CHECK-NEXT: [[TMP32:%.*]] = and i64 [[TMP25]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.end: +// CHECK-NEXT: [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP28]], [[OMP_TYPE_ALLOC]] ], [ [[TMP30]], [[OMP_TYPE_TO]] ], [ [[TMP32]], [[OMP_TYPE_FROM]] ], [ [[TMP25]], [[OMP_TYPE_TO_ELSE]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 [[TMP22]], i64 [[OMP_MAPTYPE]], ptr null) +// CHECK-NEXT: [[TMP33:%.*]] = add nuw i64 281474976711171, [[TMP24]] +// CHECK-NEXT: [[TMP34:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[TMP34]], 0 +// CHECK-NEXT: br i1 [[TMP35]], label [[OMP_TYPE_ALLOC1:%.*]], label [[OMP_TYPE_ALLOC_ELSE2:%.*]] +// CHECK: omp.type.alloc1: +// CHECK-NEXT: [[TMP36:%.*]] = and i64 [[TMP33]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END6:%.*]] +// CHECK: omp.type.alloc.else2: +// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[TMP34]], 1 +// CHECK-NEXT: br i1 [[TMP37]], label [[OMP_TYPE_TO3:%.*]], label [[OMP_TYPE_TO_ELSE4:%.*]] +// CHECK: omp.type.to3: +// CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP33]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END6]] +// CHECK: omp.type.to.else4: +// CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[TMP34]], 2 +// CHECK-NEXT: br i1 [[TMP39]], label [[OMP_TYPE_FROM5:%.*]], label [[OMP_TYPE_END6]] +// CHECK: omp.type.from5: +// CHECK-NEXT: [[TMP40:%.*]] = and i64 [[TMP33]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END6]] +// CHECK: omp.type.end6: +// CHECK-NEXT: [[OMP_MAPTYPE7:%.*]] = phi i64 [ [[TMP36]], [[OMP_TYPE_ALLOC1]] ], [ [[TMP38]], [[OMP_TYPE_TO3]] ], [ [[TMP40]], [[OMP_TYPE_FROM5]] ], [ [[TMP33]], [[OMP_TYPE_TO_ELSE4]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 4, i64 [[OMP_MAPTYPE7]], ptr null) +// CHECK-NEXT: [[TMP41:%.*]] = add 
nuw i64 281474976711171, [[TMP24]] +// CHECK-NEXT: [[TMP42:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[TMP42]], 0 +// CHECK-NEXT: br i1 [[TMP43]], label [[OMP_TYPE_ALLOC8:%.*]], label [[OMP_TYPE_ALLOC_ELSE9:%.*]] +// CHECK: omp.type.alloc8: +// CHECK-NEXT: [[TMP44:%.*]] = and i64 [[TMP41]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END13:%.*]] +// CHECK: omp.type.alloc.else9: +// CHECK-NEXT: [[TMP45:%.*]] = icmp eq i64 [[TMP42]], 1 +// CHECK-NEXT: br i1 [[TMP45]], label [[OMP_TYPE_TO10:%.*]], label [[OMP_TYPE_TO_ELSE11:%.*]] +// CHECK: omp.type.to10: +// CHECK-NEXT: [[TMP46:%.*]] = and i64 [[TMP41]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END13]] +// CHECK: omp.type.to.else11: +// CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[TMP42]], 2 +// CHECK-NEXT: br i1 [[TMP47]], label [[OMP_TYPE_FROM12:%.*]], label [[OMP_TYPE_END13]] +// CHECK: omp.type.from12: +// CHECK-NEXT: [[TMP48:%.*]] = and i64 [[TMP41]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END13]] +// CHECK: omp.type.end13: +// CHECK-NEXT: [[OMP_MAPTYPE14:%.*]] = phi i64 [ [[TMP44]], [[OMP_TYPE_ALLOC8]] ], [ [[TMP46]], [[OMP_TYPE_TO10]] ], [ [[TMP48]], [[OMP_TYPE_FROM12]] ], [ [[TMP41]], [[OMP_TYPE_TO_ELSE11]] ] +// CHECK-NEXT: call void @.omp_mapper._ZTS1C.default(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[F]], i64 4, i64 [[OMP_MAPTYPE14]], ptr null) #[[ATTR3]] +// CHECK-NEXT: [[TMP49:%.*]] = add nuw i64 281474976711171, [[TMP24]] +// CHECK-NEXT: [[TMP50:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[TMP50]], 0 +// CHECK-NEXT: br i1 [[TMP51]], label [[OMP_TYPE_ALLOC15:%.*]], label [[OMP_TYPE_ALLOC_ELSE16:%.*]] +// CHECK: omp.type.alloc15: +// CHECK-NEXT: [[TMP52:%.*]] = and i64 [[TMP49]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END20]] +// CHECK: omp.type.alloc.else16: +// CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[TMP50]], 1 +// CHECK-NEXT: br i1 [[TMP53]], label [[OMP_TYPE_TO17:%.*]], label [[OMP_TYPE_TO_ELSE18:%.*]] +// CHECK: omp.type.to17: +// 
CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP49]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END20]] +// CHECK: omp.type.to.else18: +// CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[TMP50]], 2 +// CHECK-NEXT: br i1 [[TMP55]], label [[OMP_TYPE_FROM19:%.*]], label [[OMP_TYPE_END20]] +// CHECK: omp.type.from19: +// CHECK-NEXT: [[TMP56:%.*]] = and i64 [[TMP49]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END20]] +// CHECK: omp.type.end20: +// CHECK-NEXT: [[OMP_MAPTYPE21:%.*]] = phi i64 [ [[TMP52]], [[OMP_TYPE_ALLOC15]] ], [ [[TMP54]], [[OMP_TYPE_TO17]] ], [ [[TMP56]], [[OMP_TYPE_FROM19]] ], [ [[TMP49]], [[OMP_TYPE_TO_ELSE18]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[H]], i64 4, i64 [[OMP_MAPTYPE21]], ptr null) +// CHECK-NEXT: [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]] +// CHECK: omp.arraymap.exit: +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY22:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP57:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP57]], 0 +// CHECK-NEXT: [[TMP58:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY22]], [[DOTOMP_ARRAY__DEL__DELETE]] +// CHECK-NEXT: br i1 [[TMP58]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]] +// CHECK: .omp.array..del: +// CHECK-NEXT: [[TMP59:%.*]] = mul nuw i64 [[TMP6]], 12 +// CHECK-NEXT: [[TMP60:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP61:%.*]] = or i64 [[TMP60]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP59]], i64 [[TMP61]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_DONE]] +// CHECK: omp.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1C.default +// CHECK-SAME: (ptr 
noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 4 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[TMP2]], i64 [[TMP6]] +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP4]], 16 +// CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0 +// CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]] +// CHECK-NEXT: [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]] +// CHECK-NEXT: [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0 +// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]] +// CHECK-NEXT: br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]] +// CHECK: .omp.array..init: +// CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 4 +// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_ARRAYMAP_HEAD]] +// CHECK: omp.arraymap.head: +// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]] +// CHECK: omp.arraymap.body: +// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END:%.*]] ] +// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0 +// CHECK-NEXT: [[TMP18:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]]) +// 
CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 48 +// CHECK-NEXT: [[TMP20:%.*]] = add nuw i64 1, [[TMP19]] +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 0 +// CHECK-NEXT: br i1 [[TMP22]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]] +// CHECK: omp.type.alloc: +// CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP20]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.alloc.else: +// CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[TMP21]], 1 +// CHECK-NEXT: br i1 [[TMP24]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]] +// CHECK: omp.type.to: +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP20]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.to.else: +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP21]], 2 +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]] +// CHECK: omp.type.from: +// CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP20]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.end: +// CHECK-NEXT: [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP23]], [[OMP_TYPE_ALLOC]] ], [ [[TMP25]], [[OMP_TYPE_TO]] ], [ [[TMP27]], [[OMP_TYPE_FROM]] ], [ [[TMP20]], [[OMP_TYPE_TO_ELSE]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[A]], i64 4, i64 [[OMP_MAPTYPE]], ptr null) +// CHECK-NEXT: [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]] +// CHECK: omp.arraymap.exit: +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY1:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP28]], 0 +// CHECK-NEXT: [[TMP29:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY1]], 
[[DOTOMP_ARRAY__DEL__DELETE]] +// CHECK-NEXT: br i1 [[TMP29]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]] +// CHECK: .omp.array..del: +// CHECK-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP6]], 4 +// CHECK-NEXT: [[TMP31:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP31]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP30]], i64 [[TMP32]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_DONE]] +// CHECK: omp.done: +// CHECK-NEXT: ret void +// diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp similarity index 100% rename from clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp similarity index 100% rename from clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp index d545e98ef6c3..93695d1b388f 100644 --- a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp +++ b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp @@ -4,8 +4,6 @@ // RUN: %libomptarget-compilexx-run-and-check-x86_64-unknown-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda -// UNSUPPORTED: clang - #include #include @@ -50,7 +48,7 @@ int main() { sa[1].h = N; printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1], - sa[1].f.b == &x[0] ? 1 : 0); + sa[1].f.b == &y[0] ? 
1 : 0); // CHECK: 111 222 777 20.00000 1 __intptr_t p = reinterpret_cast<__intptr_t>(&y[0]); @@ -65,6 +63,6 @@ int main() { sa[1].f.b[1] = 40; } printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1], - sa[1].f.b == &x[0] ? 1 : 0); + sa[1].f.b == &y[0] ? 1 : 0); // CHECK: 333 222 777 40.00000 1 } From 574f77a1ee34461bc1f4a0823da6c960ff1c9655 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 11 Jun 2025 12:04:26 -0700 Subject: [PATCH 0034/1322] [OpenACC][CIR] Add parallelism determ. to all acc.loops (#143751) PR #143720 adds a requirement to the ACC dialect that every acc.loop must have a seq, independent, or auto attribute for the 'default' device_type. The standard has rules for how this can be intuited: orphan/parallel/parallel loop: independent kernels/kernels loop: auto serial/serial loop: seq, unless there is a gang/worker/vector, at which point it should be 'auto'. This patch implements all of this rule as a 'cleanup' step on the IR generation for combined/loop operations. Note that the test impact is much less since I inadvertently have my 'operation' terminating curley matching the end curley from 'attribute' instead of the front of the line, so I've added sufficient tests to ensure I captured the above. 
--- clang/lib/CIR/CodeGen/CIRGenFunction.h | 12 +++ clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 2 + .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp | 33 ++++++ clang/test/CIR/CodeGenOpenACC/combined.cpp | 69 ++++++++++-- clang/test/CIR/CodeGenOpenACC/loop.cpp | 101 ++++++++++++++++-- .../mlir/Dialect/OpenACC/OpenACCOps.td | 8 ++ mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 24 +++++ 7 files changed, 232 insertions(+), 17 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index b08dd540e628..682d59d63faa 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -34,6 +34,12 @@ namespace { class ScalarExprEmitter; } // namespace +namespace mlir { +namespace acc { +class LoopOp; +} // namespace acc +} // namespace mlir + namespace clang::CIRGen { class CIRGenFunction : public CIRGenTypeCache { @@ -1082,6 +1088,12 @@ private: OpenACCDirectiveKind dirKind, SourceLocation dirLoc, ArrayRef clauses); + // The OpenACC LoopOp requires that we have auto, seq, or independent on all + // LoopOp operations for the 'none' device type case. This function checks if + // the LoopOp has one, else it updates it to have one. 
+ void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan, + OpenACCDirectiveKind dk); + public: mlir::LogicalResult emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s); diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 2aab9cecf93d..1feefa55eb27 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses); + updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind); + builder.create(end); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index 24cd1d399de6..71f3ccb8e040 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -22,6 +22,36 @@ using namespace clang::CIRGen; using namespace cir; using namespace mlir::acc; +void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op, + bool isOrphan, + OpenACCDirectiveKind dk) { + // Check that at least one of auto, independent, or seq is present + // for the device-independent default clauses. 
+ if (op.hasParallelismFlag(mlir::acc::DeviceType::None)) + return; + + switch (dk) { + default: + llvm_unreachable("Invalid parent directive kind"); + case OpenACCDirectiveKind::Invalid: + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::ParallelLoop: + op.addIndependent(builder.getContext(), {}); + return; + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::KernelsLoop: + op.addAuto(builder.getContext(), {}); + return; + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::SerialLoop: + if (op.hasDefaultGangWorkerVector()) + op.addAuto(builder.getContext(), {}); + else + op.addSeq(builder.getContext(), {}); + return; + }; +} + mlir::LogicalResult CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); @@ -90,6 +120,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses()); + updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(), + s.getParentComputeConstructKind()); + mlir::LogicalResult stmtRes = mlir::success(); // Emit body. 
{ diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp index 1f3c9f1a8d3f..5b83a9cb9189 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp @@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {seq = [#acc.device_type, #acc.device_type, #acc.device_type]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop seq device_type(nvidia, radeon) @@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type], seq = [#acc.device_type]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop auto device_type(nvidia, radeon) @@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type], seq = [#acc.device_type]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop independent device_type(nvidia, radeon) @@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.parallel combined(loop) { // CHECK: acc.loop combined(parallel) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} // CHECK: acc.yield // 
CHECK-NEXT: } loc @@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type], seq = [#acc.device_type]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.kernels combined(loop) { // CHECK: acc.loop combined(kernels) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type]} // CHECK: acc.terminator // CHECK-NEXT: } loc #pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3) @@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.parallel combined(loop) { // CHECK: acc.loop combined(parallel) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type], independent = [#acc.device_type]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) { // CHECK-NEXT: } loc // CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr>) async([#acc.device_type]) {dataClause = #acc, name = "arg2"} // CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr>) async([#acc.device_type]) {dataClause = #acc, name = "arg1"} + + // Checking the 
automatic-addition of parallelism clauses. +#pragma acc parallel loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.parallel combined(loop) { + // CHECK-NEXT: acc.loop combined(parallel) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc serial loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {seq = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop worker + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) worker { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop vector + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) vector { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop gang + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) gang { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc } diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp index db94e2819b30..c0bf11e35395 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop.cpp 
+++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp @@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type], seq = [#acc.device_type, #acc.device_type]} loc #pragma acc loop device_type(radeon) seq for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type], seq = [#acc.device_type]} loc #pragma acc loop seq device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type, #acc.device_type]} loc #pragma acc loop device_type(radeon) independent for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type]} loc #pragma acc loop independent device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type], independent = [#acc.device_type]} loc #pragma acc loop device_type(radeon) auto for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = 
[#acc.device_type]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type], independent = [#acc.device_type]} loc #pragma acc loop auto device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -116,7 +116,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} #pragma acc loop collapse(1) device_type(radeon) collapse (2) for(unsigned I = 0; I < N; ++I) @@ -124,7 +124,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type], independent = [#acc.device_type]} #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2) for(unsigned I = 0; I < N; ++I) @@ -132,14 +132,14 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type], independent = [#acc.device_type]} #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3) for(unsigned I = 0; I < N; ++I) for(unsigned J = 0; J < N; ++J) for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, 
#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type], independent = [#acc.device_type]} #pragma acc loop tile(1, 2, 3) for(unsigned I = 0; I < N; ++I) @@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { // CHECK: acc.yield // CHECK-NEXT: } loc } + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + + // Checking the automatic-addition of parallelism clauses. +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + +#pragma acc parallel + { + // CHECK-NEXT: acc.parallel { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels + { + // CHECK-NEXT: acc.kernels { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {seq = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop worker + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop worker { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop vector { + // CHECK: 
acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop gang + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop gang { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc } diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 083a18d80704..34312655115a 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", // device_types. This is for the case where there is no expression specified // in a 'gang'. void addEmptyGang(MLIRContext *, llvm::ArrayRef); + + // Return whether this LoopOp has an auto, seq, or independent for the + // specified device-type. + bool hasParallelismFlag(DeviceType); + + // Return whether this LoopOp has a gang, worker, or vector applying to the + // 'default'/None device-type. 
+ bool hasDefaultGangWorkerVector(); }]; let hasCustomAssemblyFormat = 1; diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index c72ec47be9f0..21e6b9d85f1a 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -2839,6 +2839,30 @@ void acc::LoopOp::addEmptyGang( effectiveDeviceTypes)); } +bool acc::LoopOp::hasParallelismFlag(DeviceType dt) { + auto hasDevice = [=](DeviceTypeAttr attr) -> bool { + return attr.getValue() == dt; + }; + auto testFromArr = [=](ArrayAttr arr) -> bool { + return llvm::any_of(arr.getAsRange(), hasDevice); + }; + + if (ArrayAttr arr = getSeqAttr(); arr && testFromArr(arr)) + return true; + if (ArrayAttr arr = getIndependentAttr(); arr && testFromArr(arr)) + return true; + if (ArrayAttr arr = getAuto_Attr(); arr && testFromArr(arr)) + return true; + + return false; +} + +bool acc::LoopOp::hasDefaultGangWorkerVector() { + return hasVector() || getVectorValue() || hasWorker() || getWorkerValue() || + hasGang() || getGangValue(GangArgType::Num) || + getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static); +} + void acc::LoopOp::addGangOperands( MLIRContext *context, llvm::ArrayRef effectiveDeviceTypes, llvm::ArrayRef argTypes, mlir::ValueRange values) { From d5f68cb145059fc6d2944e1d17ef561e183ade83 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 12:09:44 -0700 Subject: [PATCH 0035/1322] [bazel] Port fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f6a7cd7dea85..7bcb1d4ca883 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6881,8 +6881,8 @@ cc_library( deps = [ ":SPIRVDialect", ":Support", - "//llvm:config", 
"//llvm:Support", + "//llvm:config", ], ) @@ -11249,7 +11249,7 @@ td_library( ) gentbl_cc_library( - name = "TransformDialectEnumsIncGen", + name = "TransformAttrsIncGen", tbl_outs = { "include/mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc": [ "-gen-enum-decls", @@ -11257,6 +11257,12 @@ gentbl_cc_library( "include/mlir/Dialect/Transform/IR/TransformDialectEnums.cpp.inc": [ "-gen-enum-defs", ], + "include/mlir/Dialect/Transform/IR/TransformAttrs.h.inc": [ + "-gen-attrdef-decls", + ], + "include/mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc": [ + "-gen-attrdef-defs", + ], }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IR/TransformAttrs.td", @@ -11382,7 +11388,7 @@ cc_library( ":Rewrite", ":SideEffectInterfaces", ":Support", - ":TransformDialectEnumsIncGen", + ":TransformAttrsIncGen", ":TransformDialectIncGen", ":TransformDialectInterfaces", ":TransformDialectUtils", From 5dafe9dca867b90f20dcd71c620ad823aee4262b Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 11 Jun 2025 12:23:17 -0700 Subject: [PATCH 0036/1322] [libc] Reduce direct use of errno in src/stdlib and src/__support tests. (#143767) * Get rid of libc_errno assignments in str_to_* __support tests, since those API have been migrated to return error in a struct instead. * Migrate tests for atof and to strto* functions from and for strdup from to use ErrnoCheckingTest harness. 
--- libc/test/src/__support/CMakeLists.txt | 2 - .../test/src/__support/str_to_double_test.cpp | 1 - libc/test/src/__support/str_to_float_test.cpp | 1 - libc/test/src/__support/str_to_fp_test.h | 2 - .../src/__support/str_to_integer_test.cpp | 1 - libc/test/src/stdlib/CMakeLists.txt | 5 ++ libc/test/src/stdlib/StrtolTest.h | 60 +------------------ libc/test/src/stdlib/atof_test.cpp | 9 ++- libc/test/src/stdlib/strtod_test.cpp | 5 +- libc/test/src/stdlib/strtof_test.cpp | 5 +- libc/test/src/stdlib/strtold_test.cpp | 5 +- libc/test/src/string/CMakeLists.txt | 1 + libc/test/src/string/strdup_test.cpp | 13 ++-- 13 files changed, 24 insertions(+), 86 deletions(-) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index c1736c8fe59e..4fb0dae86e5c 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -127,7 +127,6 @@ add_libc_test( libc.src.__support.integer_literals libc.src.__support.str_to_float libc.src.__support.uint128 - libc.src.errno.errno ) @@ -140,7 +139,6 @@ add_libc_test( DEPENDS libc.src.__support.integer_literals libc.src.__support.str_to_integer - libc.src.errno.errno ) add_libc_test( diff --git a/libc/test/src/__support/str_to_double_test.cpp b/libc/test/src/__support/str_to_double_test.cpp index ccfa44f12d8e..dc503aa16f08 100644 --- a/libc/test/src/__support/str_to_double_test.cpp +++ b/libc/test/src/__support/str_to_double_test.cpp @@ -99,7 +99,6 @@ TEST(LlvmLibcStrToDblTest, SimpleDecimalConversionExtraTypes) { uint64_t double_output_mantissa = 0; uint32_t output_exp2 = 0; - LIBC_NAMESPACE::libc_errno = 0; auto double_result = internal::simple_decimal_conversion("123456789012345678900"); diff --git a/libc/test/src/__support/str_to_float_test.cpp b/libc/test/src/__support/str_to_float_test.cpp index 66f7db742eb4..03ae80fc2ee3 100644 --- a/libc/test/src/__support/str_to_float_test.cpp +++ b/libc/test/src/__support/str_to_float_test.cpp @@ -55,7 +55,6 @@ 
TEST(LlvmLibcStrToFltTest, SimpleDecimalConversionExtraTypes) { uint32_t float_output_mantissa = 0; uint32_t output_exp2 = 0; - LIBC_NAMESPACE::libc_errno = 0; auto float_result = internal::simple_decimal_conversion("123456789012345678900"); float_output_mantissa = float_result.num.mantissa; diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h index c7bc57b845fe..d349192f107c 100644 --- a/libc/test/src/__support/str_to_fp_test.h +++ b/libc/test/src/__support/str_to_fp_test.h @@ -10,7 +10,6 @@ #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" #include "src/__support/uint128.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" @@ -67,7 +66,6 @@ template struct LlvmLibcStrToFloatTest : public testing::Test { const int expectedErrno = 0) { StorageType actual_output_mantissa = 0; uint32_t actual_output_exp2 = 0; - LIBC_NAMESPACE::libc_errno = 0; auto result = internal::simple_decimal_conversion(numStart); diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 34b645b4b38c..1ec882b212b8 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" #include #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 302971a078c1..45fd49b6d352 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -9,6 +9,7 @@ add_libc_test( DEPENDS libc.src.errno.errno libc.src.stdlib.atof + libc.test.UnitTest.ErrnoCheckingTest ) add_header_library( @@ -64,6 +65,7 @@ add_fp_unittest( libc.src.errno.errno libc.src.stdlib.strtod libc.src.__support.FPUtil.fenv_impl + libc.test.UnitTest.ErrnoCheckingTest ) 
add_fp_unittest( @@ -76,6 +78,7 @@ add_fp_unittest( libc.src.errno.errno libc.src.stdlib.strtof libc.src.__support.FPUtil.fenv_impl + libc.test.UnitTest.ErrnoCheckingTest ) add_header_library( @@ -86,6 +89,7 @@ add_header_library( libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits libc.src.errno.errno + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -133,6 +137,7 @@ add_libc_test( libc.src.errno.errno libc.src.__support.uint128 libc.src.stdlib.strtold + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index ed302f14d03e..03f0a6539c78 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -10,7 +10,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/properties/architectures.h" -#include "src/errno/libc_errno.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include @@ -18,7 +18,7 @@ using LIBC_NAMESPACE::cpp::is_signed_v; template -struct StrtoTest : public LIBC_NAMESPACE::testing::Test { +struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { using FunctionT = ReturnT (*)(const char *, char **, int); static constexpr ReturnT T_MAX = @@ -28,7 +28,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { void InvalidBase(FunctionT func) { const char *ten = "10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(ten, nullptr, -1), ReturnT(0)); ASSERT_ERRNO_EQ(EINVAL); } @@ -38,23 +37,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // TODO: Look into collapsing these repeated segments. 
const char *ten = "10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(ten, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - ten, ptrdiff_t(2)); - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(ten, nullptr, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); const char *hundred = "100"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(hundred, &str_end, 10), ReturnT(100)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - hundred, ptrdiff_t(3)); const char *big_number = "1234567890"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(big_number, &str_end, 10), ReturnT(1234567890)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - big_number, ptrdiff_t(10)); @@ -62,7 +57,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // This number is larger than 2^32, meaning that if long is only 32 bits // wide, strtol will return LONG_MAX. const char *bigger_number = "12345678900"; - LIBC_NAMESPACE::libc_errno = 0; if constexpr (sizeof(ReturnT) < 8) { ASSERT_EQ(func(bigger_number, &str_end, 10), T_MAX); ASSERT_ERRNO_EQ(ERANGE); @@ -73,14 +67,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { EXPECT_EQ(str_end - bigger_number, ptrdiff_t(11)); const char *too_big_number = "123456789012345678901"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(too_big_number, &str_end, 10), T_MAX); ASSERT_ERRNO_EQ(ERANGE); EXPECT_EQ(str_end - too_big_number, ptrdiff_t(21)); const char *long_number_range_test = "10000000000000000000000000000000000000000000000000"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(long_number_range_test, &str_end, 10), T_MAX); ASSERT_ERRNO_EQ(ERANGE); EXPECT_EQ(str_end - long_number_range_test, ptrdiff_t(50)); @@ -88,19 +80,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // For most negative numbers, the unsigned functions treat it the same as // casting a negative variable to an unsigned type. 
const char *negative = "-100"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative, &str_end, 10), ReturnT(-100)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - negative, ptrdiff_t(4)); const char *big_negative_number = "-1234567890"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(big_negative_number, &str_end, 10), ReturnT(-1234567890)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - big_negative_number, ptrdiff_t(11)); const char *too_big_negative_number = "-123456789012345678901"; - LIBC_NAMESPACE::libc_errno = 0; // If the number is signed, it should return the smallest negative number // for the current type, but if it's unsigned it should max out and return // the largest positive number for the current type. From the standard: @@ -118,73 +107,61 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *spaces_before = " 10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(spaces_before, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - spaces_before, ptrdiff_t(7)); const char *spaces_after = "10 "; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(spaces_after, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - spaces_after, ptrdiff_t(2)); const char *word_before = "word10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(word_before, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - word_before, ptrdiff_t(0)); const char *word_after = "10word"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(word_after, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - word_after, ptrdiff_t(2)); const char *two_numbers = "10 999"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(two_numbers, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - two_numbers, ptrdiff_t(2)); const char *two_signs = "--10 999"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(two_signs, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); 
EXPECT_EQ(str_end - two_signs, ptrdiff_t(0)); const char *sign_before = "+2=4"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(sign_before, &str_end, 10), ReturnT(2)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - sign_before, ptrdiff_t(2)); const char *sign_after = "2+2=4"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(sign_after, &str_end, 10), ReturnT(2)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - sign_after, ptrdiff_t(1)); const char *tab_before = "\t10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(tab_before, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - tab_before, ptrdiff_t(3)); const char *all_together = "\t -12345and+67890"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(all_together, &str_end, 10), ReturnT(-12345)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - all_together, ptrdiff_t(9)); const char *just_spaces = " "; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_spaces, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_spaces, ptrdiff_t(0)); const char *just_space_and_sign = " +"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_space_and_sign, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_space_and_sign, ptrdiff_t(0)); @@ -203,12 +180,10 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { small_string[0] = static_cast( LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); if (first_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(first_digit)); ASSERT_ERRNO_SUCCESS(); } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } @@ -223,18 +198,15 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { small_string[1] = static_cast( LIBC_NAMESPACE::internal::int_to_b36_char(second_digit)); if (first_digit < base && second_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( func(small_string, 
nullptr, base), static_cast(second_digit + (first_digit * base))); ASSERT_ERRNO_SUCCESS(); } else if (first_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(first_digit)); ASSERT_ERRNO_SUCCESS(); } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } @@ -255,14 +227,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { if (first_digit < base && second_digit < base && third_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(third_digit + (second_digit * base) + (first_digit * base * base))); ASSERT_ERRNO_SUCCESS(); } else if (first_digit < base && second_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( func(small_string, nullptr, base), static_cast(second_digit + (first_digit * base))); @@ -272,23 +242,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // The number is treated as a one digit hexadecimal. 
if (base == 16 && first_digit == 0 && second_digit == 33) { if (third_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(third_digit)); ASSERT_ERRNO_SUCCESS(); } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(first_digit)); ASSERT_ERRNO_SUCCESS(); } } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } @@ -302,19 +268,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *no_prefix = "123abc"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(no_prefix, &str_end, 16), ReturnT(0x123abc)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - no_prefix, ptrdiff_t(6)); const char *yes_prefix = "0x456def"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(yes_prefix, &str_end, 16), ReturnT(0x456def)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - yes_prefix, ptrdiff_t(8)); const char *letter_after_prefix = "0xabc123"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(letter_after_prefix, &str_end, 16), ReturnT(0xabc123)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - letter_after_prefix, ptrdiff_t(8)); @@ -325,7 +288,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for unsigned 32 bit numbers const char *max_32_bit_value = "0xFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_32_bit_value, &str_end, 0), ((is_signed_v && sizeof(ReturnT) == 4) ? T_MAX @@ -334,7 +296,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { EXPECT_EQ(str_end - max_32_bit_value, ptrdiff_t(10)); const char *negative_max_32_bit_value = "-0xFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative_max_32_bit_value, &str_end, 0), ((is_signed_v && sizeof(ReturnT) == 4) ? 
T_MIN @@ -345,13 +306,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for signed 32 bit numbers const char *max_31_bit_value = "0x7FFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_31_bit_value, &str_end, 0), ReturnT(0x7FFFFFFF)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - max_31_bit_value, ptrdiff_t(10)); const char *negative_max_31_bit_value = "-0x7FFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative_max_31_bit_value, &str_end, 0), -ReturnT(0x7FFFFFFF)); ASSERT_ERRNO_SUCCESS(); @@ -360,7 +319,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for unsigned 64 bit numbers const char *max_64_bit_value = "0xFFFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_64_bit_value, &str_end, 0), (is_signed_v || sizeof(ReturnT) < 8 ? T_MAX @@ -371,7 +329,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // See the end of CleanBase10Decode for an explanation of how this large // negative number can end up as T_MAX. const char *negative_max_64_bit_value = "-0xFFFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( func(negative_max_64_bit_value, &str_end, 0), (is_signed_v @@ -383,14 +340,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for signed 64 bit numbers const char *max_63_bit_value = "0x7FFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_63_bit_value, &str_end, 0), (sizeof(ReturnT) < 8 ? T_MAX : ReturnT(0x7FFFFFFFFFFFFFFF))); ASSERT_ERRNO_EQ(sizeof(ReturnT) < 8 ? ERANGE : 0); EXPECT_EQ(str_end - max_63_bit_value, ptrdiff_t(18)); const char *negative_max_63_bit_value = "-0x7FFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative_max_63_bit_value, &str_end, 0), (sizeof(ReturnT) >= 8 ? -ReturnT(0x7FFFFFFFFFFFFFFF) : (is_signed_v ? 
T_MIN : T_MAX))); @@ -402,23 +357,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *just_prefix = "0x"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_prefix, &str_end, 16), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1)); - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_prefix, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1)); const char *prefix_with_x_after = "0xx"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(prefix_with_x_after, &str_end, 16), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1)); - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(prefix_with_x_after, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1)); @@ -428,43 +379,36 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *base_ten = "12345"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_ten, &str_end, 0), ReturnT(12345)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_ten, ptrdiff_t(5)); const char *base_sixteen_no_prefix = "123abc"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_sixteen_no_prefix, &str_end, 0), ReturnT(123)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_sixteen_no_prefix, ptrdiff_t(3)); const char *base_sixteen_with_prefix = "0x456def"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_sixteen_with_prefix, &str_end, 0), ReturnT(0x456def)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_sixteen_with_prefix, ptrdiff_t(8)); const char *base_eight_with_prefix = "012345"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_eight_with_prefix, &str_end, 0), ReturnT(012345)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_eight_with_prefix, ptrdiff_t(6)); const char *just_zero = "0"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_zero, &str_end, 0), ReturnT(0)); 
ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_zero, ptrdiff_t(1)); const char *just_zero_x = "0x"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_zero_x, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_zero_x, ptrdiff_t(1)); const char *just_zero_eight = "08"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_zero_eight, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_zero_eight, ptrdiff_t(1)); diff --git a/libc/test/src/stdlib/atof_test.cpp b/libc/test/src/stdlib/atof_test.cpp index 1e4259b792d7..92b904ecad94 100644 --- a/libc/test/src/stdlib/atof_test.cpp +++ b/libc/test/src/stdlib/atof_test.cpp @@ -7,29 +7,28 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/atof.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include +using LlvmLibcAToFTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; // This is just a simple test to make sure that this function works at all. It's // functionally identical to strtod so the bulk of the testing is there. -TEST(LlvmLibcAToFTest, SimpleTest) { +TEST_F(LlvmLibcAToFTest, SimpleTest) { LIBC_NAMESPACE::fputil::FPBits expected_fp = LIBC_NAMESPACE::fputil::FPBits(uint64_t(0x405ec00000000000)); - LIBC_NAMESPACE::libc_errno = 0; EXPECT_THAT(LIBC_NAMESPACE::atof("123"), Succeeds(expected_fp.get_val())); } -TEST(LlvmLibcAToFTest, FailedParsingTest) { - LIBC_NAMESPACE::libc_errno = 0; +TEST_F(LlvmLibcAToFTest, FailedParsingTest) { // atof does not flag errors. 
EXPECT_THAT(LIBC_NAMESPACE::atof("???"), Succeeds(0.0)); } diff --git a/libc/test/src/stdlib/strtod_test.cpp b/libc/test/src/stdlib/strtod_test.cpp index 92d14640e653..db3c1d73bd22 100644 --- a/libc/test/src/stdlib/strtod_test.cpp +++ b/libc/test/src/stdlib/strtod_test.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/strtod.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/RoundingModeUtils.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LIBC_NAMESPACE::fputil::testing::RoundingMode; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; -class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test, +class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest, ForceRoundingModeTest { public: void run_test(const char *inputString, const ptrdiff_t expectedStrLen, @@ -46,7 +46,6 @@ public: LIBC_NAMESPACE::fputil::FPBits expected_fp = LIBC_NAMESPACE::fputil::FPBits(expectedRawData); - LIBC_NAMESPACE::libc_errno = 0; double result = LIBC_NAMESPACE::strtod(inputString, &str_end); if (expectedErrno == 0) EXPECT_THAT(result, Succeeds(expected_fp.get_val())); diff --git a/libc/test/src/stdlib/strtof_test.cpp b/libc/test/src/stdlib/strtof_test.cpp index 6a716c956291..6df1ddda93bf 100644 --- a/libc/test/src/stdlib/strtof_test.cpp +++ b/libc/test/src/stdlib/strtof_test.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/strtof.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/RoundingModeUtils.h" #include "test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using 
LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest; using LIBC_NAMESPACE::fputil::testing::RoundingMode; -class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test, +class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest, ForceRoundingModeTest { public: void run_test(const char *inputString, const ptrdiff_t expectedStrLen, @@ -43,7 +43,6 @@ public: LIBC_NAMESPACE::fputil::FPBits expected_fp = LIBC_NAMESPACE::fputil::FPBits(expectedRawData); - LIBC_NAMESPACE::libc_errno = 0; float result = LIBC_NAMESPACE::strtof(inputString, &str_end); EXPECT_EQ(str_end - inputString, expectedStrLen); diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index b209c85b88e3..eb4056dc7ba6 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -8,9 +8,9 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/uint128.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/strtold.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include @@ -25,7 +25,7 @@ #error "Unknown long double type" #endif -class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64) void run_test(const char *inputString, const ptrdiff_t expectedStrLen, @@ -80,7 +80,6 @@ public: FPBits(static_cast(expectedRawData)); const int expected_errno = expectedErrno; - LIBC_NAMESPACE::libc_errno = 0; long double result = LIBC_NAMESPACE::strtold(inputString, &str_end); LIBC_NAMESPACE::fputil::FPBits actual_fp = diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index a675373938e9..ced60750a45c 100644 --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -168,6 +168,7 @@ add_libc_test( DEPENDS libc.src.string.strdup libc.src.errno.errno + 
libc.test.UnitTest.ErrnoCheckingTest ) # FIXME: This is failing on the bot for some reason, disable for now. diff --git a/libc/test/src/string/strdup_test.cpp b/libc/test/src/string/strdup_test.cpp index 20b85c37637d..4b18fc7f1bde 100644 --- a/libc/test/src/string/strdup_test.cpp +++ b/libc/test/src/string/strdup_test.cpp @@ -6,14 +6,15 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" #include "src/string/strdup.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcStrDupTest, EmptyString) { +using LlvmLibcStrDupTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcStrDupTest, EmptyString) { const char *empty = ""; - LIBC_NAMESPACE::libc_errno = 0; char *result = LIBC_NAMESPACE::strdup(empty); ASSERT_ERRNO_SUCCESS(); @@ -23,10 +24,9 @@ TEST(LlvmLibcStrDupTest, EmptyString) { ::free(result); } -TEST(LlvmLibcStrDupTest, AnyString) { +TEST_F(LlvmLibcStrDupTest, AnyString) { const char *abc = "abc"; - LIBC_NAMESPACE::libc_errno = 0; char *result = LIBC_NAMESPACE::strdup(abc); ASSERT_ERRNO_SUCCESS(); @@ -36,8 +36,7 @@ TEST(LlvmLibcStrDupTest, AnyString) { ::free(result); } -TEST(LlvmLibcStrDupTest, NullPtr) { - LIBC_NAMESPACE::libc_errno = 0; +TEST_F(LlvmLibcStrDupTest, NullPtr) { char *result = LIBC_NAMESPACE::strdup(nullptr); ASSERT_ERRNO_SUCCESS(); From 22fd11fe66a0d64f5ef359e21ae67a7d40936eaf Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Wed, 11 Jun 2025 15:26:49 -0400 Subject: [PATCH 0037/1322] [SystemZ][z/OS] Refactor AutoConvert.h to remove large MVS guard (#143174) This AutoConvert.h header frequently gets mislabeled as an unused include because it is guarded by __MVS__ internally and every usage is also guarded. This change refactors the header to remove this guard and instead makes these functions a no-op on non-z/OS platforms.
--- llvm/include/llvm/Support/AutoConvert.h | 46 +++++++++++++++++++++++-- llvm/lib/Support/AutoConvert.cpp | 21 ----------- llvm/lib/Support/InitLLVM.cpp | 30 ++++++++++------ llvm/lib/Support/MemoryBuffer.cpp | 10 +++--- llvm/lib/Support/raw_ostream.cpp | 19 +++++----- 5 files changed, 78 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 352493e9be25..56ad91425bcc 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -16,6 +16,7 @@ #ifdef __MVS__ #include <_Ccsid.h> +#endif #ifdef __cplusplus #include "llvm/Support/ErrorOr.h" #include @@ -28,9 +29,11 @@ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ + int enablezOSAutoConversion(int FD); int disablezOSAutoConversion(int FD); int restorezOSStdHandleAutoConversion(int FD); + #ifdef __cplusplus } #endif /* __cplusplus */ @@ -38,6 +41,46 @@ int restorezOSStdHandleAutoConversion(int FD); #ifdef __cplusplus namespace llvm { +inline std::error_code disableAutoConversion(int FD) { +#ifdef __MVS__ + if (::disablezOSAutoConversion(FD) == -1) + return errnoAsErrorCode(); +#endif + return std::error_code(); +} + +inline std::error_code enableAutoConversion(int FD) { +#ifdef __MVS__ + if (::enablezOSAutoConversion(FD) == -1) + return errnoAsErrorCode(); +#endif + return std::error_code(); +} + +inline std::error_code restoreStdHandleAutoConversion(int FD) { +#ifdef __MVS__ + if (::restorezOSStdHandleAutoConversion(FD) == -1) + return errnoAsErrorCode(); +#endif + return std::error_code(); +} + +inline std::error_code setFileTag(int FD, int CCSID, bool Text) { +#ifdef __MVS__ + return setzOSFileTag(FD, CCSID, Text); +#endif + return std::error_code(); +} + +inline ErrorOr needConversion(const char *FileName, const int FD = -1) { +#ifdef __MVS__ + return needzOSConversion(FileName, FD); +#endif + return false; +} + +#ifdef __MVS__ + /** \brief Disable the z/OS enhanced ASCII auto-conversion 
for the file * descriptor. */ @@ -63,9 +106,8 @@ ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1); */ ErrorOr needzOSConversion(const char *FileName, const int FD = -1); +#endif /* __MVS__*/ } /* namespace llvm */ #endif /* __cplusplus */ -#endif /* __MVS__ */ - #endif /* LLVM_SUPPORT_AUTOCONVERT_H */ diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index f7918548df1d..c69e9a8f97c0 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -83,27 +83,6 @@ int enablezOSAutoConversion(int FD) { return fcntl(FD, F_CONTROL_CVT, &Query); } -std::error_code llvm::disablezOSAutoConversion(int FD) { - if (::disablezOSAutoConversion(FD) == -1) - return errnoAsErrorCode(); - - return std::error_code(); -} - -std::error_code llvm::enablezOSAutoConversion(int FD) { - if (::enablezOSAutoConversion(FD) == -1) - return errnoAsErrorCode(); - - return std::error_code(); -} - -std::error_code llvm::restorezOSStdHandleAutoConversion(int FD) { - if (::restorezOSStdHandleAutoConversion(FD) == -1) - return errnoAsErrorCode(); - - return std::error_code(); -} - std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { assert((!Text || (CCSID != FT_UNTAGGED && CCSID != FT_BINARY)) && "FT_UNTAGGED and FT_BINARY are not allowed for text files"); diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index 50f7a43cc34a..b8fbfd21c4f2 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -18,18 +18,28 @@ #include "llvm/Support/Windows/WindowsSupport.h" #endif -#ifdef __MVS__ +#if defined(HAVE_UNISTD_H) #include +#else +#ifndef STDIN_FILENO +#define STDIN_FILENO 0 +#endif +#ifndef STDOUT_FILENO +#define STDOUT_FILENO 1 +#endif +#ifndef STDERR_FILENO +#define STDERR_FILENO 2 +#endif +#endif void CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); Errs->flush(); - 
llvm::restorezOSStdHandleAutoConversion(STDIN_FILENO); - llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO); - llvm::restorezOSStdHandleAutoConversion(STDERR_FILENO); + llvm::restoreStdHandleAutoConversion(STDIN_FILENO); + llvm::restoreStdHandleAutoConversion(STDOUT_FILENO); + llvm::restoreStdHandleAutoConversion(STDERR_FILENO); } -#endif using namespace llvm; using namespace llvm::sys; @@ -41,10 +51,10 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, assert(!Initialized && "InitLLVM was already initialized!"); Initialized = true; #endif -#ifdef __MVS__ + // Bring stdin/stdout/stderr into a known state. sys::AddSignalHandler(CleanupStdHandles, nullptr); -#endif + if (InstallPipeSignalExitHandler) // The pipe signal handler must be installed before any other handlers are // registered. This is because the Unix \ref RegisterHandlers function does @@ -68,8 +78,8 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, // If turning on conversion for stderr fails then the error message // may be garbled. There is no solution to this problem. 
- ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDERR_FILENO))); - ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDOUT_FILENO))); + ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDERR_FILENO))); + ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDOUT_FILENO))); #endif #ifdef _WIN32 @@ -97,8 +107,6 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, } InitLLVM::~InitLLVM() { -#ifdef __MVS__ CleanupStdHandles(nullptr); -#endif llvm_shutdown(); } diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index e2044bcc4e4f..601f11f6d23c 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -34,9 +35,6 @@ #include #endif -#ifdef __MVS__ -#include "llvm/Support/AutoConvert.h" -#endif using namespace llvm; //===----------------------------------------------------------------------===// @@ -508,15 +506,15 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - ErrorOr NeedConversion = needzOSConversion(Filename.str().c_str(), FD); - if (std::error_code EC = NeedConversion.getError()) + ErrorOr NeedsConversion = needConversion(Filename.str().c_str(), FD); + if (std::error_code EC = NeedsConversion.getError()) return EC; // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we // cannot trust the file size and we create the memory buffer by copying // off the stream. // Note: This only works with the assumption of reading a full file (i.e, // Offset == 0 and MapSize == FileSize). Reading a file slice does not work. 
- if (Offset == 0 && MapSize == FileSize && *NeedConversion) + if (*NeedsConversion && Offset == 0 && MapSize == FileSize) return getMemoryBufferForStream(FD, Filename); #endif diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 16631a63d192..07b99896543b 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -894,21 +894,24 @@ void raw_fd_ostream::anchor() {} raw_fd_ostream &llvm::outs() { // Set buffer settings to model stdout behavior. std::error_code EC; -#ifdef __MVS__ - EC = enablezOSAutoConversion(STDOUT_FILENO); - assert(!EC); -#endif + + // On z/OS we need to enable auto conversion + static std::error_code EC1 = enableAutoConversion(STDOUT_FILENO); + assert(!EC1); + (void)EC1; + static raw_fd_ostream S("-", EC, sys::fs::OF_None); assert(!EC); return S; } raw_fd_ostream &llvm::errs() { - // Set standard error to be unbuffered. -#ifdef __MVS__ - std::error_code EC = enablezOSAutoConversion(STDERR_FILENO); + // On z/OS we need to enable auto conversion + static std::error_code EC = enableAutoConversion(STDERR_FILENO); assert(!EC); -#endif + (void)EC; + + // Set standard error to be unbuffered. static raw_fd_ostream S(STDERR_FILENO, false, true); return S; } From 34a1b8ce2518d7868c080519a05892cd3b197192 Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Wed, 11 Jun 2025 12:37:08 -0700 Subject: [PATCH 0038/1322] [acc] acc.loop verifier now requires parallelism determination flag (#143720) The OpenACC specification for `acc loop` describe that a loop's parallelism determination mode is either auto, independent, or seq. The rules are as follows. - As per OpenACC 3.3 standard section 2.9.6 independent clause: A loop construct with no auto or seq clause is treated as if it has the independent clause when it is an orphaned loop construct or its parent compute construct is a parallel construct. 
- As per OpenACC 3.3 standard section 2.9.7 auto clause: When the parent compute construct is a kernels construct, a loop construct with no independent or seq clause is treated as if it has the auto clause. - Additionally, loops marked with gang, worker, or vector are not guaranteed to be parallel. Specifically noted in 2.9.7 auto clause: If not, or if it is unable to make a determination, it must treat the auto clause as if it is a seq clause, and it must ignore any gang, worker, or vector clauses on the loop construct. The verifier for `acc.loop` was updated to enforce this marking because the context in which a loop appears is not trivially determined once IR transformations begin. For example, orphaned loops are implicitly `independent`, but after inlining into an `acc.kernels` region they would be implicitly considered `auto`. Thus now the verifier requires that a frontend specifically generates acc dialect with this marking since it knows the context. --- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 35 +++++++++-- mlir/test/Dialect/OpenACC/canonicalize.mlir | 4 +- mlir/test/Dialect/OpenACC/invalid.mlir | 28 ++++----- mlir/test/Dialect/OpenACC/legalize-data.mlir | 16 ++--- mlir/test/Dialect/OpenACC/ops.mlir | 66 ++++++++++---------- 5 files changed, 86 insertions(+), 63 deletions(-) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 21e6b9d85f1a..0dfead98b7e7 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -2461,10 +2461,34 @@ LogicalResult acc::LoopOp::verify() { if (hasDuplicateDeviceTypes(getAuto_(), deviceTypes) || hasDuplicateDeviceTypes(getIndependent(), deviceTypes) || hasDuplicateDeviceTypes(getSeq(), deviceTypes)) { - return emitError() << "only one of \"" << acc::LoopOp::getAutoAttrStrName() - << "\", " << getIndependentAttrName() << ", " - << getSeqAttrName() - << " can be present at the same time"; + return emitError() << "only one of auto, 
independent, seq can be present " + "at the same time"; + } + + // Check that at least one of auto, independent, or seq is present + // for the device-independent default clauses. + auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool { + return attr.getValue() == mlir::acc::DeviceType::None; + }; + bool hasDefaultSeq = + getSeqAttr() + ? llvm::any_of(getSeqAttr().getAsRange(), + hasDeviceNone) + : false; + bool hasDefaultIndependent = + getIndependentAttr() + ? llvm::any_of( + getIndependentAttr().getAsRange(), + hasDeviceNone) + : false; + bool hasDefaultAuto = + getAuto_Attr() + ? llvm::any_of(getAuto_Attr().getAsRange(), + hasDeviceNone) + : false; + if (!hasDefaultSeq && !hasDefaultIndependent && !hasDefaultAuto) { + return emitError() + << "at least one of auto, independent, seq must be present"; } // Gang, worker and vector are incompatible with seq. @@ -2482,8 +2506,7 @@ LogicalResult acc::LoopOp::verify() { deviceTypeAttr.getValue()) || getGangValue(mlir::acc::GangArgType::Static, deviceTypeAttr.getValue())) - return emitError() - << "gang, worker or vector cannot appear with the seq attr"; + return emitError() << "gang, worker or vector cannot appear with seq"; } } diff --git a/mlir/test/Dialect/OpenACC/canonicalize.mlir b/mlir/test/Dialect/OpenACC/canonicalize.mlir index e43a27f6b9e8..fdc8e6b5cae6 100644 --- a/mlir/test/Dialect/OpenACC/canonicalize.mlir +++ b/mlir/test/Dialect/OpenACC/canonicalize.mlir @@ -116,10 +116,10 @@ func.func @testhostdataop(%a: memref, %ifCond: i1) -> () { acc.host_data dataOperands(%0 : memref) if(%false) { acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield - } attributes { inclusiveUpperbound = array } + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield - } attributes { inclusiveUpperbound = array } + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} 
acc.terminator } return diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index aadf18927321..8f6e961a0616 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -2,7 +2,7 @@ %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -12,7 +12,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -22,7 +22,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -32,7 +32,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -42,7 +42,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : 
i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -52,7 +52,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -62,7 +62,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield @@ -72,7 +72,7 @@ acc.loop { // expected-error@+1 {{expected non-empty body.}} acc.loop { -} +} attributes {independent = [#acc.device_type]} // ----- @@ -99,7 +99,7 @@ acc.loop { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{only one of "auto", "independent", "seq" can be present at the same time}} +// expected-error@+1 {{only one of auto, independent, seq can be present at the same time}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield } attributes {auto_ = [#acc.device_type], seq = [#acc.device_type], inclusiveUpperbound = array} @@ -168,7 +168,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32){ // expected-error@+1 {{'acc.init' op cannot be nested in a compute operation}} acc.init acc.yield -} attributes {inclusiveUpperbound = array} +} attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // ----- @@ -186,7 
+186,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { // expected-error@+1 {{'acc.shutdown' op cannot be nested in a compute operation}} acc.shutdown acc.yield -} attributes {inclusiveUpperbound = array} +} attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // ----- @@ -198,7 +198,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.shutdown }) : () -> () acc.yield -} attributes {inclusiveUpperbound = array} +} attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // ----- @@ -797,7 +797,7 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } attributes { collapse = [2], collapseDeviceType = [#acc.device_type] } + } attributes { collapse = [2], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} return } @@ -816,6 +816,6 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } attributes { collapse = [3], collapseDeviceType = [#acc.device_type] } + } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} return } diff --git a/mlir/test/Dialect/OpenACC/legalize-data.mlir b/mlir/test/Dialect/OpenACC/legalize-data.mlir index 28ef6761a6ef..40604dcc736d 100644 --- a/mlir/test/Dialect/OpenACC/legalize-data.mlir +++ b/mlir/test/Dialect/OpenACC/legalize-data.mlir @@ -96,7 +96,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {independent = [#acc.device_type]} acc.yield } return @@ -109,7 +109,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[CREATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {independent = [#acc.device_type]} // CHECK: acc.yield // CHECK: } @@ 
-134,7 +134,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {independent = [#acc.device_type]} acc.yield } return @@ -147,7 +147,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {independent = [#acc.device_type]} // CHECK: acc.yield // CHECK: } @@ -172,7 +172,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop private(@privatization_memref_10_f32 -> %p1 : memref<10xf32>) control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {independent = [#acc.device_type]} acc.yield } return @@ -185,7 +185,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop private(@privatization_memref_10_f32 -> %[[PRIVATE]] : memref<10xf32>) control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {independent = [#acc.device_type]} // CHECK: acc.yield // CHECK: } @@ -210,7 +210,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {seq = [#acc.device_type]} acc.yield } return @@ -223,7 +223,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {seq = [#acc.device_type]} // CHECK: 
acc.yield // CHECK: } diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 550f295f074a..97278f869534 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -19,7 +19,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x %co = arith.addf %cij, %p : f32 memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32> acc.yield - } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} + } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } @@ -40,7 +40,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} +// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return %{{.*}} : memref<10x10xf32> @@ -129,7 +129,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x %tmp = arith.addf %axy, %bxy : f32 memref.store %tmp, %c[%y] : memref<10xf32> acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { // for i = 0 to 10 step 1 @@ -139,9 +139,9 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x %z = arith.addf %ci, %dx : f32 memref.store %z, %d[%x] : memref<10xf32> acc.yield - } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} + } attributes 
{inclusiveUpperbound = array, independent = [#acc.device_type], seq = [#acc.device_type]} acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } acc.terminator @@ -166,16 +166,16 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {inclusiveUpperbound = array} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK-NEXT: acc.loop control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type], seq = [#acc.device_type]} // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {inclusiveUpperbound = array} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.terminator @@ -196,72 +196,72 @@ func.func @testloopop(%a : memref<10xf32>) -> () { acc.loop gang vector worker control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () 
acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop worker(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop worker(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop worker(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop vector(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop vector(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes 
{inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i64Value: i64}) worker vector control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop tile({%i64Value : i64, %i64Value : i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop tile({%i32Value : i32, %i32Value : i32}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({dim=%i64Value : 
i64, static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} %b = acc.cache varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> acc.loop cache(%b : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} return } @@ -271,7 +271,7 @@ func.func @testloopop(%a : memref<10xf32>) -> () { // CHECK: acc.loop // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: attributes {inclusiveUpperbound = array} +// CHECK-NEXT: attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield @@ -343,7 +343,7 @@ func.func @acc_loop_multiple_block() { cf.br ^bb1(%22 : index) ^bb3: acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } return @@ -1477,7 +1477,7 @@ func.func @acc_reduc_test(%a : i64) -> () { acc.parallel reduction(@reduction_add_i64 -> %a : i64) { acc.loop reduction(@reduction_add_i64 -> %a : i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } attributes { inclusiveUpperbound = array } + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } return @@ -1869,21 +1869,21 @@ func.func @acc_combined() { acc.parallel combined(loop) { acc.loop combined(parallel) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes {independent = [#acc.device_type]} 
acc.terminator } acc.kernels combined(loop) { acc.loop combined(kernels) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes {auto_ = [#acc.device_type]} acc.terminator } acc.serial combined(loop) { acc.loop combined(serial) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes {seq = [#acc.device_type]} acc.terminator } @@ -1949,7 +1949,7 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } + } attributes {independent = [#acc.device_type]} return } @@ -1971,7 +1971,7 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } attributes { collapse = [2], collapseDeviceType = [#acc.device_type] } + } attributes { collapse = [2], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} return } From 02161c635fd70e0214bd8b8320a80992c50ec325 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Wed, 11 Jun 2025 12:44:51 -0700 Subject: [PATCH 0039/1322] [NVPTX] Misc table-gen cleanup (NFC) (#142877) --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 194 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2488 ++++------------- .../Target/NVPTX/NVPTXReplaceImageHandles.cpp | 840 +++--- 3 files changed, 1056 insertions(+), 2466 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b646d39194c7..9ca4e8d20650 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -160,7 +160,6 @@ def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def True : Predicate<"true">; -def False : Predicate<"false">; class hasPTX: Predicate<"Subtarget->getPTXVersion() >= " # version>; class hasSM: Predicate<"Subtarget->getSmVersion() >= " # version>; @@ -257,6 +256,11 @@ def BF16X2RT : RegTyInfo; // "prmt.b32${mode}">; // ---> "prmt.b32${mode} \t$d, $a, $b, $c;" // +// * 
BasicFlagsNVPTXInst<(outs Int64Regs:$state), +// (ins ADDR:$addr), +// "mbarrier.arrive.b64">; +// ---> "mbarrier.arrive.b64 \t$state, [$addr];" +// class BasicFlagsNVPTXInst pattern = []> : NVPTXInst< @@ -274,7 +278,11 @@ class BasicFlagsNVPTXInst(!getdagarg(ins_dag, i)), "ADDR"), + "[$" # !getdagname(ins_dag, i) # "]", + "$" # !getdagname(ins_dag, i) + ) + ), ", "))), ";"), pattern>; @@ -956,31 +964,17 @@ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; // Matchers for signed, unsigned mul.wide ISD nodes. -def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), - (MULWIDES32 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), - (MULWIDES32Imm $a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), - (MULWIDEU32 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), - (MULWIDEU32Imm $a, imm:$b)>, - Requires<[doMulWide]>; +let Predicates = [doMulWide] in { + def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>; + def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>; + def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>; + def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), (MULWIDEU32Imm $a, imm:$b)>; -def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), - (MULWIDES64 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), - (MULWIDES64Imm $a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), - (MULWIDEU64 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), - (MULWIDEU64Imm $a, imm:$b)>, - Requires<[doMulWide]>; + def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), (MULWIDES64 $a, $b)>; + def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), (MULWIDES64Imm $a, imm:$b)>; + def : Pat<(i64 (mul_wide_unsigned i32:$a, 
i32:$b)), (MULWIDEU64 $a, $b)>; + def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>; +} // Predicates used for converting some patterns to mul.wide. def SInt32Const : PatLeaf<(imm), [{ @@ -1106,18 +1100,12 @@ defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>; defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>; } -def INEG16 : - BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "neg.s16", - [(set i16:$dst, (ineg i16:$src))]>; -def INEG32 : - BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "neg.s32", - [(set i32:$dst, (ineg i32:$src))]>; -def INEG64 : - BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "neg.s64", - [(set i64:$dst, (ineg i64:$src))]>; +foreach t = [I16RT, I32RT, I64RT] in { + def NEG_S # t.Size : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), + "neg.s" # t.Size, + [(set t.Ty:$dst, (ineg t.Ty:$src))]>; +} //----------------------------------- // Floating Point Arithmetic @@ -1538,7 +1526,7 @@ def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>; def SDTPRMT : SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>; + SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>; multiclass BFE { @@ -1961,7 +1949,7 @@ multiclass FSET_FORMAT { // f16 -> pred def : Pat<(i1 (OpNode f16:$a, f16:$b)), (SETP_f16rr $a, $b, ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; + Requires<[useFP16Math, doF32FTZ]>; def : Pat<(i1 (OpNode f16:$a, f16:$b)), (SETP_f16rr $a, $b, Mode)>, Requires<[useFP16Math]>; @@ -1969,7 +1957,7 @@ multiclass FSET_FORMAT { // bf16 -> pred def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), (SETP_bf16rr $a, $b, ModeFTZ)>, - Requires<[hasBF16Math,doF32FTZ]>; + Requires<[hasBF16Math, doF32FTZ]>; def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), (SETP_bf16rr $a, $b, Mode)>, Requires<[hasBF16Math]>; @@ -2497,24 +2485,20 @@ def : Pat<(f16 (uint_to_fp i32:$a)), (CVT_f16_u32 $a, CvtRN)>; def : 
Pat<(f16 (uint_to_fp i64:$a)), (CVT_f16_u64 $a, CvtRN)>; // sint -> bf16 -def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>; + def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>; + def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>; + def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>; +} // uint -> bf16 -def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>; + def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>; + def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>; + def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>; +} // sint -> f32 def : Pat<(f32 (sint_to_fp i1:$a)), (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>; @@ -2565,27 +2549,25 @@ def : Pat<(i16 (fp_to_uint bf16:$a)), (CVT_u16_bf16 $a, CvtRZI)>; def : Pat<(i32 (fp_to_uint bf16:$a)), (CVT_u32_bf16 $a, CvtRZI)>; def : Pat<(i64 (fp_to_uint bf16:$a)), (CVT_u64_bf16 $a, CvtRZI)>; // f32 -> sint -def : Pat<(i1 (fp_to_sint f32:$a)), 
(SETP_b32ri $a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; +let Predicates = [doF32FTZ] in { + def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>; +} +def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>; def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI)>; // f32 -> uint +let Predicates = [doF32FTZ] in { + def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>; +} def : Pat<(i1 (fp_to_uint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI)>; // f64 -> sint @@ -2707,28 +2689,24 @@ let hasSideEffects = false in { // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the // unused high/low part. 
- def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high), - (ins Int32Regs:$s), - "mov.b32 \t{{_, $high}}, $s;", - []>, Requires<[hasPTX<71>]>; - def I32toI16L_Sink : NVPTXInst<(outs Int16Regs:$low), - (ins Int32Regs:$s), - "mov.b32 \t{{$low, _}}, $s;", - []>, Requires<[hasPTX<71>]>; - def I64toI32H_Sink : NVPTXInst<(outs Int32Regs:$high), - (ins Int64Regs:$s), - "mov.b64 \t{{_, $high}}, $s;", - []>, Requires<[hasPTX<71>]>; - def I64toI32L_Sink : NVPTXInst<(outs Int32Regs:$low), - (ins Int64Regs:$s), - "mov.b64 \t{{$low, _}}, $s;", - []>, Requires<[hasPTX<71>]>; + let Predicates = [hasPTX<71>] in { + def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s), + "mov.b32 \t{{_, $high}}, $s;", []>; + def I32toI16L_Sink : NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s), + "mov.b32 \t{{$low, _}}, $s;", []>; + def I64toI32H_Sink : NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s), + "mov.b64 \t{{_, $high}}, $s;", []>; + def I64toI32L_Sink : NVPTXInst<(outs Int32Regs:$low), (ins Int64Regs:$s), + "mov.b64 \t{{$low, _}}, $s;", []>; + } } -def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>; -def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>; -def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>; -def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>; +let Predicates = [hasPTX<71>] in { + def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>; + def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>; + def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>; + def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>; +} // Fall back to the old way if we don't have PTX 7.1. 
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>; @@ -3061,29 +3039,19 @@ def stacksave : SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf, [SDNPHasChain, SDNPSideEffect]>; -def STACKRESTORE_32 : - BasicNVPTXInst<(outs), (ins Int32Regs:$ptr), - "stackrestore.u32", - [(stackrestore i32:$ptr)]>, - Requires<[hasPTX<73>, hasSM<52>]>; +let Predicates = [hasPTX<73>, hasSM<52>] in { + foreach t = [I32RT, I64RT] in { + def STACKRESTORE_ # t.Size : + BasicNVPTXInst<(outs), (ins t.RC:$ptr), + "stackrestore.u" # t.Size, + [(stackrestore t.Ty:$ptr)]>; -def STACKSAVE_32 : - BasicNVPTXInst<(outs Int32Regs:$dst), (ins), - "stacksave.u32", - [(set i32:$dst, (i32 stacksave))]>, - Requires<[hasPTX<73>, hasSM<52>]>; - -def STACKRESTORE_64 : - BasicNVPTXInst<(outs), (ins Int64Regs:$ptr), - "stackrestore.u64", - [(stackrestore i64:$ptr)]>, - Requires<[hasPTX<73>, hasSM<52>]>; - -def STACKSAVE_64 : - BasicNVPTXInst<(outs Int64Regs:$dst), (ins), - "stacksave.u64", - [(set i64:$dst, (i64 stacksave))]>, - Requires<[hasPTX<73>, hasSM<52>]>; + def STACKSAVE_ # t.Size : + BasicNVPTXInst<(outs t.RC:$dst), (ins), + "stacksave.u" # t.Size, + [(set t.Ty:$dst, (t.Ty stacksave))]>; + } +} include "NVPTXIntrinsics.td" @@ -3124,7 +3092,7 @@ def : Pat < //////////////////////////////////////////////////////////////////////////////// class NVPTXFenceInst: - NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>, + BasicNVPTXInst<(outs), (ins), "fence."#sem#"."#scope>, Requires<[ptx, hasSM<70>]>; foreach scope = ["sys", "gpu", "cluster", "cta"] in { diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f918160001ba..83d7defe6d9a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -52,7 +52,7 @@ class PTX { def ptx : PTX; // Generates list of n sequential register names. -// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ] +// E.g. 
RegNames<3, "r">.ret -> ["r0", "r1", "r2" ] class RegSeq { list ret = !if(n, !listconcat(RegSeq.ret, [prefix # !sub(n, 1)]), @@ -137,7 +137,7 @@ defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive class INT_BARRIER_CLUSTER Preds = [hasPTX<78>, hasSM<90>]>: - NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>, + BasicNVPTXInst<(outs), (ins), "barrier.cluster."# variant, [(Intr)]>, Requires; def barrier_cluster_arrive: @@ -400,13 +400,9 @@ def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS : //----------------------------------- multiclass CP_ASYNC_MBARRIER_ARRIVE { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr), - !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"), - [(Intrin i32:$addr)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr), - !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"), - [(Intrin i64:$addr)]>, + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "cp.async.mbarrier.arrive" # NoInc # AddrSpace # ".b64", + [(Intrin addr:$addr)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -420,30 +416,19 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED : CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>; multiclass CP_ASYNC_SHARED_GLOBAL_I { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin i32:$dst, i32:$src)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin i64:$dst, i64:$src)]>, + def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src), + "cp.async." 
# cc # ".shared.global" # " [$dst], [$src], " # cpsize # ";", + [(Intrin addr:$dst, addr:$src)]>, Requires<[hasPTX<70>, hasSM<80>]>; + // Variant with src_size parameter - def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i32:$dst, i32:$src, i32:$src_size)]>, + def _s : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$src_size), + "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;", + [(IntrinS addr:$dst, addr:$src, i32:$src_size)]>, Requires<[hasPTX<70>, hasSM<80>]>; - def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i32:$dst, i32:$src, imm:$src_size)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i64:$dst, i64:$src, i32:$src_size)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i64:$dst, i64:$src, imm:$src_size)]>, + def _si: NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, i32imm:$src_size), + "cp.async." 
# cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;", + [(IntrinS addr:$dst, addr:$src, imm:$src_size)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -513,14 +498,14 @@ class CpAsyncBulkStr { } multiclass CP_ASYNC_BULK_S2G_INTR { - def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch), !if(has_ch, CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;", CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"), [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask), + def _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask), !if(has_ch, CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;", CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"), @@ -533,7 +518,7 @@ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR; multiclass CP_ASYNC_BULK_G2S_INTR { defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster; - def NAME : NVPTXInst<(outs), + def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch), !if(has_ch, @@ -542,7 +527,7 @@ multiclass CP_ASYNC_BULK_G2S_INTR { [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC : NVPTXInst<(outs), + def _MC : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch), !if(has_ch, @@ -561,7 +546,7 @@ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs), Requires<[hasPTX<80>, hasSM<90>]>; multiclass CP_ASYNC_BULK_PREFETCH_INTR { - def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + def "" : 
NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch), !if(has_ch, "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;", "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"), @@ -609,19 +594,19 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR defvar asm_str = !if(!eq(mode, "im2col"), !strconcat(asm_str_default, im2col_asm_str), asm_str_default); - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag), !strconcat(G2S_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC: NVPTXInst<(outs), + def _MC : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC_CH: NVPTXInst<(outs), + def _MC_CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc, $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -661,11 +646,11 @@ multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR { defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; defvar rc = !if(shared32, Int32Regs, Int64Regs); - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag), !strconcat(S2G_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch)), !strconcat(S2G_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -685,11 +670,11 @@ 
multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta"; defvar suffix = "." # mode # ".bulk_group"; - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)), !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch, TMAReductionFlags:$red_op)), !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -735,11 +720,11 @@ multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR { defvar asm_str = !if(!eq(mode, "im2col"), !strconcat(asm_str_default, im2col_asm_str), asm_str_default); - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins Int64Regs:$tmap), dims_dag, im2col_dag), !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -755,10 +740,10 @@ foreach dim = [1, 2, 3, 4, 5] in { //Prefetch and Prefetchu class PREFETCH_INTRS : - NVPTXInst<(outs), (ins Int64Regs:$addr), - InstName # " [$addr];", + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, [(!cast(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) i64:$addr)]>, + !subst(".", "_", InstName))) addr:$addr)]>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -769,36 +754,39 @@ def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; -def PREFETCH_GLOBAL_L2_EVICT_NORMAL : NVPTXInst<(outs), (ins Int64Regs:$addr), - 
"prefetch.global.L2::evict_normal" # " [$addr];", - [(!cast("int_nvvm_prefetch_global_L2_evict_normal") i64:$addr)]>, +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch.global.L2::evict_normal", + [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>, Requires<[hasPTX<80>, hasSM<90>]>; -def PREFETCH_GLOBAL_L2_EVICT_LAST : NVPTXInst<(outs), (ins Int64Regs:$addr), - "prefetch.global.L2::evict_last" # " [$addr];", - [(!cast("int_nvvm_prefetch_global_L2_evict_last") i64:$addr)]>, +def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch.global.L2::evict_last", + [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>, Requires<[hasPTX<80>, hasSM<90>]>; def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; //Applypriority intrinsics -class APPLYPRIORITY_L2_INTRS : - NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size), - StrJoin<".", ["applypriority", addr , "L2::evict_normal"]>.ret # " [$addr], $size;", - [(!cast(StrJoin<"_", ["int_nvvm_applypriority", addr , "L2_evict_normal"]>.ret) - i64:$addr, i64:$size)]>, +class APPLYPRIORITY_L2_INTRS : + BasicNVPTXInst<(outs), (ins ADDR:$addr, Int64Regs:$size), + StrJoin<".", ["applypriority", addrspace , "L2::evict_normal"]>.ret, + [(!cast(StrJoin<"_", ["int_nvvm_applypriority", addrspace , "L2_evict_normal"]>.ret) + addr:$addr, i64:$size)]>, Requires<[hasPTX<74>, hasSM<80>]>; def APPLYPRIORITY_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"">; def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">; //Discard Intrinsics -class DISCARD_L2_INTRS : - NVPTXInst<(outs), (ins Int64Regs:$addr), - StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;", - [(!cast(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret) - i64:$addr, (i64 128))]>, + +def discard_size_imm : TImmLeaf; + +class DISCARD_L2_INTRS : + BasicNVPTXInst<(outs), (ins ADDR:$addr, i64imm:$size), + StrJoin<".", ["discard", addrspace , "L2"]>.ret, + [(!cast(StrJoin<"_", 
["int_nvvm_discard", addrspace , "L2"]>.ret) + addr:$addr, discard_size_imm:$size)]>, Requires<[hasPTX<74>, hasSM<80>]>; def DISCARD_L2 : DISCARD_L2_INTRS<"">; @@ -809,8 +797,8 @@ def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">; //----------------------------------- multiclass MBARRIER_INIT { - def "" : NVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count), - "mbarrier.init" # AddrSpace # ".b64 [$addr], $count;", + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count), + "mbarrier.init" # AddrSpace # ".b64", [(Intrin addr:$addr, i32:$count)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -820,8 +808,8 @@ defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared", int_nvvm_mbarrier_init_shared>; multiclass MBARRIER_INVAL { - def "" : NVPTXInst<(outs), (ins ADDR:$addr), - "mbarrier.inval" # AddrSpace # ".b64 [$addr];", + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "mbarrier.inval" # AddrSpace # ".b64", [(Intrin addr:$addr)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -831,8 +819,8 @@ defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared", int_nvvm_mbarrier_inval_shared>; multiclass MBARRIER_ARRIVE { - def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), - "mbarrier.arrive" # AddrSpace # ".b64 $state, [$addr];", + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), + "mbarrier.arrive" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -842,9 +830,9 @@ defm MBARRIER_ARRIVE_SHARED : MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>; multiclass MBARRIER_ARRIVE_NOCOMPLETE { - def "" : NVPTXInst<(outs Int64Regs:$state), + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr, Int32Regs:$count), - "mbarrier.arrive.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;", + "mbarrier.arrive.noComplete" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr, i32:$count))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -855,8 +843,8 @@ defm 
MBARRIER_ARRIVE_NOCOMPLETE_SHARED : MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>; multiclass MBARRIER_ARRIVE_DROP { - def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), - "mbarrier.arrive_drop" # AddrSpace # ".b64 $state, [$addr];", + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), + "mbarrier.arrive_drop" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -867,9 +855,9 @@ defm MBARRIER_ARRIVE_DROP_SHARED : MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>; multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE { - def "" : NVPTXInst<(outs Int64Regs:$state), + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr, Int32Regs:$count), - "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;", + "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr, i32:$count))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -881,8 +869,8 @@ defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED : int_nvvm_mbarrier_arrive_drop_noComplete_shared>; multiclass MBARRIER_TEST_WAIT { - def "" : NVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state), - "mbarrier.test_wait" # AddrSpace # ".b64 $res, [$addr], $state;", + def "" : BasicNVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state), + "mbarrier.test_wait" # AddrSpace # ".b64", [(set i1:$res, (Intrin addr:$addr, i64:$state))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -1790,93 +1778,74 @@ def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b), def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b), (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>; -def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a), (CVT_e4m3x2_f16x2 $a, CvtRN)>; -def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a), (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>; -def : 
Pat<(int_nvvm_f16x2_to_e5m2x2_rn Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a), (CVT_e5m2x2_f16x2 $a, CvtRN)>; -def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a), (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn Int16Regs:$a), +def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a), (CVT_f16x2_e4m3x2 $a, CvtRN)>; -def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu Int16Regs:$a), +def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a), (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a), +def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a), (CVT_f16x2_e5m2x2 $a, CvtRN)>; -def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a), +def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a), (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), - (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), - (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b), - (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b), - (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in { + def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf 
$a, $b, CvtRN_RELU)>; -def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a), - (CVT_f16x2_e2m3x2 $a, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a), - (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), - (CVT_f16x2_e3m2x2 $a, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), - (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b), - (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b), - (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn Int16Regs:$a), - (CVT_f16x2_e2m1x2 $a, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu Int16Regs:$a), - (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>; + def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), - (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b), - (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b), - (CVT_ue8m0x2_f32 $a, 
$b, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b), - (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz Int32Regs:$a), - (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite Int32Regs:$a), - (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp Int32Regs:$a), - (CVT_ue8m0x2_bf16x2 $a, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite Int32Regs:$a), - (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a), - (CVT_bf16x2_ue8m0x2 $a)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>; + + def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e2m1x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>; + + def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRP)>; + def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>; + + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz v2bf16:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>; + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite v2bf16:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>; + 
def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp v2bf16:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRP)>; + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite v2bf16:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>; + + def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a), + (CVT_bf16x2_ue8m0x2 $a)>; +} // // FNS @@ -1920,14 +1889,14 @@ class ATOMIC_GENERIC_CHK multiclass F_ATOMIC_2 preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;"; + defvar asm_str = "atom" # sem_str # as_str # "." # op_str; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), + def r : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), asm_str, [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>, Requires; if t.SupportsImm then - def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), + def i : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), asm_str, [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>, Requires; @@ -1937,27 +1906,27 @@ multiclass F_ATOMIC_2 preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; + defvar asm_str = "atom" # sem_str # as_str # "." 
# op_str; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : NVPTXInst<(outs t.RC:$dst), + def rr : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.RC:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, Requires; - def ir : NVPTXInst<(outs t.RC:$dst), + def ir : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.RC:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, Requires; - def ri : NVPTXInst<(outs t.RC:$dst), + def ri : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.Imm:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, Requires; - def ii : NVPTXInst<(outs t.RC:$dst), + def ii : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, @@ -2100,7 +2069,7 @@ multiclass ATOM3S_impl; + t, !listconcat(Preds, [hasAtomScope])>; } } } @@ -4454,1956 +4423,616 @@ defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", Int32Regs>; //----------------------------------- let IsSurfTexQuery = true in { -def TXQ_CHANNEL_ORDER_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.channel_order.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_ORDER_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.channel_order.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_DATA_TYPE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.channel_data_type.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_DATA_TYPE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.channel_data_type.b32 \t$d, [$a];", - []>; -def TXQ_WIDTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.width.b32 \t$d, [$a];", - []>; -def TXQ_WIDTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.width.b32 \t$d, [$a];", - []>; -def TXQ_HEIGHT_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.height.b32 \t$d, [$a];", - []>; -def 
TXQ_HEIGHT_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.height.b32 \t$d, [$a];", - []>; -def TXQ_DEPTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.depth.b32 \t$d, [$a];", - []>; -def TXQ_DEPTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.depth.b32 \t$d, [$a];", - []>; -def TXQ_ARRAY_SIZE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.array_size.b32 \t$d, [$a];", - []>; -def TXQ_ARRAY_SIZE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.array_size.b32 \t$d, [$a];", - []>; -def TXQ_NUM_SAMPLES_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.num_samples.b32 \t$d, [$a];", - []>; -def TXQ_NUM_SAMPLES_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.num_samples.b32 \t$d, [$a];", - []>; -def TXQ_NUM_MIPMAP_LEVELS_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.num_mipmap_levels.b32 \t$d, [$a];", - []>; -def TXQ_NUM_MIPMAP_LEVELS_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.num_mipmap_levels.b32 \t$d, [$a];", - []>; + foreach query = ["channel_order", "channel_data_type", "width", "height", + "depth", "array_size", "num_samples", "num_mipmap_levels"] in { + def TXQ_ # !toupper(query) # _R + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq." # query # ".b32 \t$d, [$a];", + [(set i32:$d, (!cast("int_nvvm_txq_" # query) i64:$a))]>; + def TXQ_ # !toupper(query) # _I + : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), + "txq." 
# query # ".b32 \t$d, [$a];", + []>; + } } -def : Pat<(int_nvvm_txq_channel_order i64:$a), - (TXQ_CHANNEL_ORDER_R $a)>; -def : Pat<(int_nvvm_txq_channel_data_type i64:$a), - (TXQ_CHANNEL_DATA_TYPE_R $a)>; -def : Pat<(int_nvvm_txq_width i64:$a), - (TXQ_WIDTH_R $a)>; -def : Pat<(int_nvvm_txq_height i64:$a), - (TXQ_HEIGHT_R $a)>; -def : Pat<(int_nvvm_txq_depth i64:$a), - (TXQ_DEPTH_R $a)>; -def : Pat<(int_nvvm_txq_array_size i64:$a), - (TXQ_ARRAY_SIZE_R $a)>; -def : Pat<(int_nvvm_txq_num_samples i64:$a), - (TXQ_NUM_SAMPLES_R $a)>; -def : Pat<(int_nvvm_txq_num_mipmap_levels i64:$a), - (TXQ_NUM_MIPMAP_LEVELS_R $a)>; - - //----------------------------------- // Surface Query Intrinsics //----------------------------------- let IsSurfTexQuery = true in { -def SUQ_CHANNEL_ORDER_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.channel_order.b32 \t$d, [$a];", - []>; -def SUQ_CHANNEL_ORDER_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.channel_order.b32 \t$d, [$a];", - []>; -def SUQ_CHANNEL_DATA_TYPE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.channel_data_type.b32 \t$d, [$a];", - []>; -def SUQ_CHANNEL_DATA_TYPE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.channel_data_type.b32 \t$d, [$a];", - []>; -def SUQ_WIDTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.width.b32 \t$d, [$a];", - []>; -def SUQ_WIDTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.width.b32 \t$d, [$a];", - []>; -def SUQ_HEIGHT_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.height.b32 \t$d, [$a];", - []>; -def SUQ_HEIGHT_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.height.b32 \t$d, [$a];", - []>; -def SUQ_DEPTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.depth.b32 \t$d, [$a];", - []>; -def SUQ_DEPTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.depth.b32 \t$d, [$a];", - []>; -def SUQ_ARRAY_SIZE_R - : NVPTXInst<(outs Int32Regs:$d), (ins 
Int64Regs:$a), - "suq.array_size.b32 \t$d, [$a];", - []>; -def SUQ_ARRAY_SIZE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.array_size.b32 \t$d, [$a];", - []>; + foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size"] in { + def SUQ_ # !toupper(query) # _R + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq." # query # ".b32 \t$d, [$a];", + [(set i32:$d, (!cast("int_nvvm_suq_" # query) i64:$a))]>; + def SUQ_ # !toupper(query) # _I + : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), + "suq." # query # ".b32 \t$d, [$a];", + []>; + } } -def : Pat<(int_nvvm_suq_channel_order i64:$a), - (SUQ_CHANNEL_ORDER_R $a)>; -def : Pat<(int_nvvm_suq_channel_data_type i64:$a), - (SUQ_CHANNEL_DATA_TYPE_R $a)>; -def : Pat<(int_nvvm_suq_width i64:$a), - (SUQ_WIDTH_R $a)>; -def : Pat<(int_nvvm_suq_height i64:$a), - (SUQ_HEIGHT_R $a)>; -def : Pat<(int_nvvm_suq_depth i64:$a), - (SUQ_DEPTH_R $a)>; -def : Pat<(int_nvvm_suq_array_size i64:$a), - (SUQ_ARRAY_SIZE_R $a)>; - - //===- Handle Query -------------------------------------------------------===// // TODO: These intrinsics are not yet finalized, pending PTX ISA design work def ISTYPEP_SAMPLER - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.samplerref \t$d, $a;", + : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.samplerref", [(set i1:$d, (int_nvvm_istypep_sampler i64:$a))]>; def ISTYPEP_SURFACE - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.surfref \t$d, $a;", + : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.surfref", [(set i1:$d, (int_nvvm_istypep_surface i64:$a))]>; def ISTYPEP_TEXTURE - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.texref \t$d, $a;", + : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.texref", [(set i1:$d, (int_nvvm_istypep_texture i64:$a))]>; //===- Surface Stores -----------------------------------------------------===// let IsSust = true in 
{ -class SUST_1D_base +class SUST_1D_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, intype:$r)), - inst # " \t[$s, \\{$x\\}], \\{$r\\};", - []>; + inst # " \t[$s, \\{$x\\}], \\{$r\\};", pat>; multiclass SUST_1D { - def _R : SUST_1D_base; - def _I : SUST_1D_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + + def _R : SUST_1D_base; + def _I : SUST_1D_base; } -defm SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>; -defm SUST_B_1D_B16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>; -defm SUST_B_1D_B32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>; -defm SUST_B_1D_B64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>; +defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>; +defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>; +defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>; +defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>; -defm SUST_B_1D_B8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>; -defm SUST_B_1D_B16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>; -defm SUST_B_1D_B32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>; -defm SUST_B_1D_B64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>; +defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>; +defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>; +defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>; +defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>; -defm SUST_B_1D_B8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>; -defm SUST_B_1D_B16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>; -defm SUST_B_1D_B32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>; -defm SUST_B_1D_B64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>; +defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>; +defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>; +defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>; +defm SUST_B_1D_I64_ZERO 
: SUST_1D<"sust.b.1d.b64.zero", Int64Regs>; -defm SUST_P_1D_B8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>; -defm SUST_P_1D_B16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>; -defm SUST_P_1D_B32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>; +defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>; +defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>; +defm SUST_P_1D_I32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>; -class SUST_1D_V2_base +class SUST_1D_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g)), inst # " \t[$s, \\{$x\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_1D_V2 { - def _R : SUST_1D_V2_base; - def _I : SUST_1D_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_V2_base; + def _I : SUST_1D_V2_base; } -defm SUST_B_1D_V2B8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>; -defm SUST_B_1D_V2B16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>; -defm SUST_B_1D_V2B32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>; -defm SUST_B_1D_V2B64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>; +// int_nvvm_sust_b_1d_v2i8_clamp -defm SUST_B_1D_V2B8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>; -defm SUST_B_1D_V2B16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>; -defm SUST_B_1D_V2B32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>; -defm SUST_B_1D_V2B64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>; +defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>; +defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>; +defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>; +defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>; -defm SUST_B_1D_V2B8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>; -defm SUST_B_1D_V2B16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>; -defm SUST_B_1D_V2B32_ZERO : 
SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>; -defm SUST_B_1D_V2B64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>; +defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>; +defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>; +defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>; +defm SUST_B_1D_V2I64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>; -defm SUST_P_1D_V2B8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>; -defm SUST_P_1D_V2B16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>; -defm SUST_P_1D_V2B32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>; +defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>; +defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>; +defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>; +defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>; -class SUST_1D_V4_base +defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>; +defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>; +defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>; + +class SUST_1D_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_1D_V4 { - def _R : SUST_1D_V4_base; - def _I : SUST_1D_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_V4_base; + def _I : SUST_1D_V4_base; } -defm SUST_B_1D_V4B8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>; -defm SUST_B_1D_V4B16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>; -defm SUST_B_1D_V4B32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>; +defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>; +defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>; +defm 
SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>; -defm SUST_B_1D_V4B8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>; -defm SUST_B_1D_V4B16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>; -defm SUST_B_1D_V4B32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>; +defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>; +defm SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>; +defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>; -defm SUST_B_1D_V4B8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>; -defm SUST_B_1D_V4B16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>; -defm SUST_B_1D_V4B32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>; +defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>; +defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>; +defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>; -defm SUST_P_1D_V4B8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>; -defm SUST_P_1D_V4B16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>; -defm SUST_P_1D_V4B32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>; +defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>; +defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>; +defm SUST_P_1D_V4I32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>; -class SUST_1D_ARRAY_base +class SUST_1D_ARRAY_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r)), inst # " \t[$s, \\{$idx, $x\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_1D_ARRAY { - def _R : SUST_1D_ARRAY_base; - def _I : SUST_1D_ARRAY_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_ARRAY_base; + def _I : SUST_1D_ARRAY_base; } -defm SUST_B_1D_ARRAY_B8_CLAMP +defm SUST_B_1D_ARRAY_I8_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_B16_CLAMP +defm 
SUST_B_1D_ARRAY_I16_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_B32_CLAMP +defm SUST_B_1D_ARRAY_I32_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", Int32Regs>; -defm SUST_B_1D_ARRAY_B64_CLAMP +defm SUST_B_1D_ARRAY_I64_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", Int64Regs>; -defm SUST_B_1D_ARRAY_B8_TRAP +defm SUST_B_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_B16_TRAP +defm SUST_B_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_B32_TRAP +defm SUST_B_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", Int32Regs>; -defm SUST_B_1D_ARRAY_B64_TRAP +defm SUST_B_1D_ARRAY_I64_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", Int64Regs>; -defm SUST_B_1D_ARRAY_B8_ZERO +defm SUST_B_1D_ARRAY_I8_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_B16_ZERO +defm SUST_B_1D_ARRAY_I16_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_B32_ZERO +defm SUST_B_1D_ARRAY_I32_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", Int32Regs>; -defm SUST_B_1D_ARRAY_B64_ZERO +defm SUST_B_1D_ARRAY_I64_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", Int64Regs>; -defm SUST_P_1D_ARRAY_B8_TRAP +defm SUST_P_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_B16_TRAP +defm SUST_P_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_B32_TRAP +defm SUST_P_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b32.trap", Int32Regs>; -class SUST_1D_ARRAY_V2_base +class SUST_1D_ARRAY_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r, intype:$g)), inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_1D_ARRAY_V2 { - def _R : SUST_1D_ARRAY_V2_base; - def _I : SUST_1D_ARRAY_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_ARRAY_V2_base; + def _I : SUST_1D_ARRAY_V2_base; } -defm 
SUST_B_1D_ARRAY_V2B8_CLAMP +defm SUST_B_1D_ARRAY_V2I8_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B16_CLAMP +defm SUST_B_1D_ARRAY_V2I16_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B32_CLAMP +defm SUST_B_1D_ARRAY_V2I32_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", Int32Regs>; -defm SUST_B_1D_ARRAY_V2B64_CLAMP +defm SUST_B_1D_ARRAY_V2I64_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", Int64Regs>; -defm SUST_B_1D_ARRAY_V2B8_TRAP +defm SUST_B_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B16_TRAP +defm SUST_B_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B32_TRAP +defm SUST_B_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", Int32Regs>; -defm SUST_B_1D_ARRAY_V2B64_TRAP +defm SUST_B_1D_ARRAY_V2I64_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", Int64Regs>; -defm SUST_B_1D_ARRAY_V2B8_ZERO +defm SUST_B_1D_ARRAY_V2I8_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B16_ZERO +defm SUST_B_1D_ARRAY_V2I16_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B32_ZERO +defm SUST_B_1D_ARRAY_V2I32_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", Int32Regs>; -defm SUST_B_1D_ARRAY_V2B64_ZERO +defm SUST_B_1D_ARRAY_V2I64_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", Int64Regs>; -defm SUST_P_1D_ARRAY_V2B8_TRAP +defm SUST_P_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V2B16_TRAP +defm SUST_P_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V2B32_TRAP +defm SUST_P_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b32.trap", Int32Regs>; -class SUST_1D_ARRAY_V4_base +class SUST_1D_ARRAY_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r, intype:$g, 
intype:$b, intype:$a)), inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_1D_ARRAY_V4 { - def _R : SUST_1D_ARRAY_V4_base; - def _I : SUST_1D_ARRAY_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_ARRAY_V4_base; + def _I : SUST_1D_ARRAY_V4_base; } -defm SUST_B_1D_ARRAY_V4B8_CLAMP +defm SUST_B_1D_ARRAY_V4I8_CLAMP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B16_CLAMP +defm SUST_B_1D_ARRAY_V4I16_CLAMP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B32_CLAMP +defm SUST_B_1D_ARRAY_V4I32_CLAMP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", Int32Regs>; -defm SUST_B_1D_ARRAY_V4B8_TRAP +defm SUST_B_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B16_TRAP +defm SUST_B_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B32_TRAP +defm SUST_B_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", Int32Regs>; -defm SUST_B_1D_ARRAY_V4B8_ZERO +defm SUST_B_1D_ARRAY_V4I8_ZERO : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B16_ZERO +defm SUST_B_1D_ARRAY_V4I16_ZERO : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B32_ZERO +defm SUST_B_1D_ARRAY_V4I32_ZERO : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", Int32Regs>; -defm SUST_P_1D_ARRAY_V4B8_TRAP +defm SUST_P_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V4B16_TRAP +defm SUST_P_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V4B32_TRAP +defm SUST_P_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", Int32Regs>; -class SUST_2D_base +class SUST_2D_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r)), inst # " \t[$s, \\{$x, $y\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_2D { - def _R 
: SUST_2D_base; - def _I : SUST_2D_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_base; + def _I : SUST_2D_base; } -defm SUST_B_2D_B8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>; -defm SUST_B_2D_B16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>; -defm SUST_B_2D_B32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>; -defm SUST_B_2D_B64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>; +defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>; +defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>; +defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>; +defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>; -defm SUST_B_2D_B8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>; -defm SUST_B_2D_B16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>; -defm SUST_B_2D_B32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>; -defm SUST_B_2D_B64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>; +defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>; +defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>; +defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>; +defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>; -defm SUST_B_2D_B8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>; -defm SUST_B_2D_B16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>; -defm SUST_B_2D_B32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>; -defm SUST_B_2D_B64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>; +defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>; +defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>; +defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>; +defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>; -defm SUST_P_2D_B8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>; -defm SUST_P_2D_B16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>; -defm SUST_P_2D_B32_TRAP : SUST_2D<"sust.p.2d.b32.trap", 
Int32Regs>; +defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>; +defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>; +defm SUST_P_2D_I32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>; -class SUST_2D_V2_base +class SUST_2D_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g)), inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_2D_V2 { - def _R : SUST_2D_V2_base; - def _I : SUST_2D_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_V2_base; + def _I : SUST_2D_V2_base; } -defm SUST_B_2D_V2B8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>; -defm SUST_B_2D_V2B16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>; -defm SUST_B_2D_V2B32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>; -defm SUST_B_2D_V2B64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>; +defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>; +defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>; +defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>; +defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>; -defm SUST_B_2D_V2B8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>; -defm SUST_B_2D_V2B16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>; -defm SUST_B_2D_V2B32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>; -defm SUST_B_2D_V2B64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>; +defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>; +defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>; +defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>; +defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>; -defm SUST_B_2D_V2B8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>; -defm SUST_B_2D_V2B16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", 
Int16Regs>; -defm SUST_B_2D_V2B32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>; -defm SUST_B_2D_V2B64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>; +defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>; +defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>; +defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>; +defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>; -defm SUST_P_2D_V2B8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>; -defm SUST_P_2D_V2B16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>; -defm SUST_P_2D_V2B32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>; +defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>; +defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>; +defm SUST_P_2D_V2I32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>; -class SUST_2D_V4_base +class SUST_2D_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_2D_V4 { - def _R : SUST_2D_V4_base; - def _I : SUST_2D_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_V4_base; + def _I : SUST_2D_V4_base; } -defm SUST_B_2D_V4B8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>; -defm SUST_B_2D_V4B16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>; -defm SUST_B_2D_V4B32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>; +defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>; +defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>; +defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>; -defm SUST_B_2D_V4B8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>; -defm SUST_B_2D_V4B16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>; -defm 
SUST_B_2D_V4B32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>; +defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>; +defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>; +defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>; -defm SUST_B_2D_V4B8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>; -defm SUST_B_2D_V4B16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>; -defm SUST_B_2D_V4B32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>; +defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>; +defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>; +defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>; -defm SUST_P_2D_V4B8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>; -defm SUST_P_2D_V4B16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>; -defm SUST_P_2D_V4B32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>; +defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>; +defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>; +defm SUST_P_2D_V4I32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>; -class SUST_2D_ARRAY_base +class SUST_2D_ARRAY_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, intype:$r)), inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_2D_ARRAY { - def _R : SUST_2D_ARRAY_base; - def _I : SUST_2D_ARRAY_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_ARRAY_base; + def _I : SUST_2D_ARRAY_base; } -defm SUST_B_2D_ARRAY_B8_CLAMP +defm SUST_B_2D_ARRAY_I8_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_B16_CLAMP +defm SUST_B_2D_ARRAY_I16_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_B32_CLAMP +defm SUST_B_2D_ARRAY_I32_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", Int32Regs>; -defm 
SUST_B_2D_ARRAY_B64_CLAMP +defm SUST_B_2D_ARRAY_I64_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", Int64Regs>; -defm SUST_B_2D_ARRAY_B8_TRAP +defm SUST_B_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_B16_TRAP +defm SUST_B_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_B32_TRAP +defm SUST_B_2D_ARRAY_I32_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", Int32Regs>; -defm SUST_B_2D_ARRAY_B64_TRAP +defm SUST_B_2D_ARRAY_I64_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", Int64Regs>; -defm SUST_B_2D_ARRAY_B8_ZERO +defm SUST_B_2D_ARRAY_I8_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_B16_ZERO +defm SUST_B_2D_ARRAY_I16_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_B32_ZERO +defm SUST_B_2D_ARRAY_I32_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", Int32Regs>; -defm SUST_B_2D_ARRAY_B64_ZERO +defm SUST_B_2D_ARRAY_I64_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", Int64Regs>; -defm SUST_P_2D_ARRAY_B8_TRAP +defm SUST_P_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_B16_TRAP +defm SUST_P_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_B32_TRAP +defm SUST_P_2D_ARRAY_I32_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b32.trap", Int32Regs>; -class SUST_2D_ARRAY_V2_base +class SUST_2D_ARRAY_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g)), inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_2D_ARRAY_V2 { - def _R : SUST_2D_ARRAY_V2_base; - def _I : SUST_2D_ARRAY_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_ARRAY_V2_base; + def _I : SUST_2D_ARRAY_V2_base; } -defm SUST_B_2D_ARRAY_V2B8_CLAMP +defm SUST_B_2D_ARRAY_V2I8_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B16_CLAMP +defm SUST_B_2D_ARRAY_V2I16_CLAMP : 
SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B32_CLAMP +defm SUST_B_2D_ARRAY_V2I32_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", Int32Regs>; -defm SUST_B_2D_ARRAY_V2B64_CLAMP +defm SUST_B_2D_ARRAY_V2I64_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", Int64Regs>; -defm SUST_B_2D_ARRAY_V2B8_TRAP +defm SUST_B_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B16_TRAP +defm SUST_B_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B32_TRAP +defm SUST_B_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", Int32Regs>; -defm SUST_B_2D_ARRAY_V2B64_TRAP +defm SUST_B_2D_ARRAY_V2I64_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", Int64Regs>; -defm SUST_B_2D_ARRAY_V2B8_ZERO +defm SUST_B_2D_ARRAY_V2I8_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B16_ZERO +defm SUST_B_2D_ARRAY_V2I16_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B32_ZERO +defm SUST_B_2D_ARRAY_V2I32_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", Int32Regs>; -defm SUST_B_2D_ARRAY_V2B64_ZERO +defm SUST_B_2D_ARRAY_V2I64_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", Int64Regs>; -defm SUST_P_2D_ARRAY_V2B8_TRAP +defm SUST_P_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V2B16_TRAP +defm SUST_P_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V2B32_TRAP +defm SUST_P_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b32.trap", Int32Regs>; -class SUST_2D_ARRAY_V4_base +class SUST_2D_ARRAY_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_2D_ARRAY_V4 { - def _R : SUST_2D_ARRAY_V4_base; - def _I : 
SUST_2D_ARRAY_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_ARRAY_V4_base; + def _I : SUST_2D_ARRAY_V4_base; } -defm SUST_B_2D_ARRAY_V4B8_CLAMP +defm SUST_B_2D_ARRAY_V4I8_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B16_CLAMP +defm SUST_B_2D_ARRAY_V4I16_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B32_CLAMP +defm SUST_B_2D_ARRAY_V4I32_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", Int32Regs>; -defm SUST_B_2D_ARRAY_V4B8_TRAP +defm SUST_B_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B16_TRAP +defm SUST_B_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B32_TRAP +defm SUST_B_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", Int32Regs>; -defm SUST_B_2D_ARRAY_V4B8_ZERO +defm SUST_B_2D_ARRAY_V4I8_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B16_ZERO +defm SUST_B_2D_ARRAY_V4I16_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B32_ZERO +defm SUST_B_2D_ARRAY_V4I32_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", Int32Regs>; -defm SUST_P_2D_ARRAY_V4B8_TRAP +defm SUST_P_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V4B16_TRAP +defm SUST_P_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V4B32_TRAP +defm SUST_P_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b32.trap", Int32Regs>; -class SUST_3D_base +class SUST_3D_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, intype:$r)), inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_3D { - def _R : SUST_3D_base; - def _I : SUST_3D_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_3D_base; + def _I : SUST_3D_base; } 
-defm SUST_B_3D_B8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>; -defm SUST_B_3D_B16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>; -defm SUST_B_3D_B32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>; -defm SUST_B_3D_B64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>; +defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>; +defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>; +defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>; +defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>; -defm SUST_B_3D_B8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>; -defm SUST_B_3D_B16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>; -defm SUST_B_3D_B32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>; -defm SUST_B_3D_B64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>; +defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>; +defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>; +defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>; +defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>; -defm SUST_B_3D_B8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>; -defm SUST_B_3D_B16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>; -defm SUST_B_3D_B32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>; -defm SUST_B_3D_B64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>; +defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>; +defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>; +defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>; +defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>; -defm SUST_P_3D_B8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>; -defm SUST_P_3D_B16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>; -defm SUST_P_3D_B32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>; +defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>; +defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", 
Int16Regs>; +defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>; -class SUST_3D_V2_base +class SUST_3D_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, intype:$r, intype:$g)), inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_3D_V2 { - def _R : SUST_3D_V2_base; - def _I : SUST_3D_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_3D_V2_base; + def _I : SUST_3D_V2_base; } -defm SUST_B_3D_V2B8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>; -defm SUST_B_3D_V2B16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>; -defm SUST_B_3D_V2B32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>; -defm SUST_B_3D_V2B64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>; +defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>; +defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>; +defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>; +defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>; -defm SUST_B_3D_V2B8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>; -defm SUST_B_3D_V2B16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>; -defm SUST_B_3D_V2B32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>; -defm SUST_B_3D_V2B64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>; +defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>; +defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>; +defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>; +defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>; -defm SUST_B_3D_V2B8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>; -defm SUST_B_3D_V2B16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>; -defm SUST_B_3D_V2B32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>; -defm SUST_B_3D_V2B64_ZERO : 
SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>; +defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>; +defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>; +defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>; +defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>; -defm SUST_P_3D_V2B8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>; -defm SUST_P_3D_V2B16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>; -defm SUST_P_3D_V2B32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>; +defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>; +defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>; +defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>; -class SUST_3D_V4_base +class SUST_3D_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_3D_V4 { - def _R : SUST_3D_V4_base; - def _I : SUST_3D_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_3D_V4_base; + def _I : SUST_3D_V4_base; } -defm SUST_B_3D_V4B8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>; -defm SUST_B_3D_V4B16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>; -defm SUST_B_3D_V4B32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>; +defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>; +defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>; +defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>; -defm SUST_B_3D_V4B8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>; -defm SUST_B_3D_V4B16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>; -defm SUST_B_3D_V4B32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>; +defm SUST_B_3D_V4I8_TRAP : 
SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>; +defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>; +defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>; -defm SUST_B_3D_V4B8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>; -defm SUST_B_3D_V4B16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>; -defm SUST_B_3D_V4B32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>; +defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>; +defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>; +defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>; -defm SUST_P_3D_V4B8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>; -defm SUST_P_3D_V4B16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>; -defm SUST_P_3D_V4B32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>; +defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>; +defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>; +defm SUST_P_3D_V4I32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>; } -// Surface store instruction patterns -// I'm not sure why we can't just include these in the instruction definitions, -// but TableGen complains of type errors :( - -// .clamp variant -def : Pat<(int_nvvm_sust_b_1d_i8_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i16_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i64_clamp - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp - Int64Regs:$s, 
Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_ARRAY_B32_CLAMP_R Int64Regs:$s, 
Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_ARRAY_B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, 
Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_B_2D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), - (SUST_B_2D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp - 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B8_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B16_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_ARRAY_B32_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_ARRAY_B64_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp 
- Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_B_2D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, - Int64Regs:$g), - (SUST_B_2D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_3d_i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B8_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B16_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - 
(SUST_B_3D_B32_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r), - (SUST_B_3D_B64_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B8_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B16_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_B_3D_V2B32_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g), - (SUST_B_3D_V2B64_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B8_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B16_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp - Int64Regs:$s, 
Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_3D_V4B32_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - -// .trap variant -def : Pat<(int_nvvm_sust_b_1d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i64_trap - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i64_trap - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, 
Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_1d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_ARRAY_B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - 
(SUST_B_1D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, 
Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_B_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), - (SUST_B_2D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - 
(SUST_B_2D_ARRAY_B32_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_ARRAY_B64_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_B_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, - Int64Regs:$g), - (SUST_B_2D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, 
Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_3d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_B_3D_B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r), - (SUST_B_3D_B64_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_B_3D_V2B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i64_trap - 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g), - (SUST_B_3D_V2B64_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_3D_V4B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - -// .zero variant -def : Pat<(int_nvvm_sust_b_1d_i8_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i16_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i64_zero - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i8_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i16_zero - 
Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i64_zero - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i8_zero - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i16_zero - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i32_zero - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_1d_array_i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_ARRAY_B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_ARRAY_B64_ZERO_R 
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, 
Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_B_2D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), - (SUST_B_2D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, 
Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_array_i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B8_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B16_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_ARRAY_B32_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_ARRAY_B64_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_B_2D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : 
Pat<(int_nvvm_sust_b_2d_array_v2i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, - Int64Regs:$g), - (SUST_B_2D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B8_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B16_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_3d_i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B8_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B16_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_B_3D_B32_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r), - (SUST_B_3D_B64_ZERO_R 
Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B8_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B16_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_B_3D_V2B32_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g), - (SUST_B_3D_V2B64_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B8_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B16_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_3D_V4B32_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - - -def : 
Pat<(int_nvvm_sust_p_1d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_P_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_P_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_1d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : 
Pat<(int_nvvm_sust_p_1d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_P_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_P_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_2d_i8_trap - Int64Regs:$s, 
Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_P_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_P_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : 
Pat<(int_nvvm_sust_p_2d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_ARRAY_B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_ARRAY_B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_P_2D_ARRAY_B32_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_P_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, 
Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_3d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_P_3D_B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_3d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_P_3D_B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_3d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_P_3D_B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_3d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_3D_V2B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_3d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_3D_V2B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_3d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_P_3D_V2B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_3d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_3D_V4B8_TRAP_R 
Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_3d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_3D_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_3d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_3D_V4B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; //----------------------------------- // Read Special Registers @@ -6411,13 +5040,13 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap class PTX_READ_SREG_R64 Preds=[]> : NVPTXInst<(outs Int64Regs:$d), (ins), - !strconcat("mov.u64 \t$d, %", regname, ";"), + "mov.u64 \t$d, %" # regname # ";", [(set i64:$d, (intop))]>, Requires; class PTX_READ_SREG_R32 Preds=[]> : NVPTXInst<(outs Int32Regs:$d), (ins), - !strconcat("mov.u32 \t$d, %", regname, ";"), + "mov.u32 \t$d, %" # regname # ";", [(set i32:$d, (intop))]>, Requires; @@ -6547,7 +5176,7 @@ class WMMA_REGINFO !or(!eq(ptx_elt_type, "f16"), !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<60>], - !and(!eq(geom,"m8n8k4"), + !and(!eq(geom, "m8n8k4"), !eq(ptx_elt_type, "f64")) : [hasSM<80>, hasPTX<70>], // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16 @@ -6557,46 +5186,46 @@ class WMMA_REGINFO !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<61>], // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16 - !and(!or(!eq(geom,"m16n16k16"), - !eq(geom,"m8n32k16"), - !eq(geom,"m32n8k16")), + !and(!or(!eq(geom, "m16n16k16"), + !eq(geom, "m8n32k16"), + !eq(geom, "m32n8k16")), !or(!eq(ptx_elt_type, "u8"), !eq(ptx_elt_type, "s8"), !eq(ptx_elt_type, "s32"))) : [hasSM<72>, hasPTX<63>], - !and(!or(!eq(geom,"m16n16k16"), - !eq(geom,"m8n32k16"), - 
!eq(geom,"m32n8k16")), + !and(!or(!eq(geom, "m16n16k16"), + !eq(geom, "m8n32k16"), + !eq(geom, "m32n8k16")), !eq(ptx_elt_type, "bf16")) : [hasSM<80>, hasPTX<70>], - !and(!eq(geom,"m16n16k8"), + !and(!eq(geom, "m16n16k8"), !eq(ptx_elt_type, "tf32")) : [hasSM<80>, hasPTX<70>], - !and(!eq(geom,"m16n16k8"), + !and(!eq(geom, "m16n16k8"), !eq(ptx_elt_type, "f32")) : [hasSM<80>, hasPTX<70>], // b1 -> s32 @ m8n8k128(b1) - !and(!ne(op,"mma"), - !eq(geom,"m8n8k128")) : [hasSM<75>, hasPTX<63>], + !and(!ne(op, "mma"), + !eq(geom, "m8n8k128")) : [hasSM<75>, hasPTX<63>], // u4/s4 -> s32 @ m8n8k32 (u4/s4) - !and(!ne(op,"mma"), - !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<63>], + !and(!ne(op, "mma"), + !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<63>], - !or(!eq(geom,"m16n8k8"), - !eq(geom,"m8n8k16")) : [hasSM<75>, hasPTX<65>], + !or(!eq(geom, "m16n8k8"), + !eq(geom, "m8n8k16")) : [hasSM<75>, hasPTX<65>], - !and(!ne(ptx_elt_type,"f64"), + !and(!ne(ptx_elt_type, "f64"), !eq(geom, "m8n8k4")) : [hasSM<70>, hasPTX<64>], // mma m8n8k32 requires higher PTX version - !and(!eq(op,"mma"), - !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<65>], + !and(!eq(op, "mma"), + !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<65>], - !and(!eq(ptx_elt_type,"f64"), + !and(!eq(ptx_elt_type, "f64"), !eq(geom, "m8n8k4")) : [hasSM<80>, hasPTX<70>], - !and(!eq(op,"mma"), + !and(!eq(op, "mma"), !or(!eq(geom, "m16n8k16"), !eq(geom, "m16n8k4"), !eq(geom, "m16n8k32"), @@ -6605,28 +5234,28 @@ class WMMA_REGINFO !eq(geom, "m16n8k128"), !eq(geom, "m16n8k256"))) : [hasSM<80>, hasPTX<70>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b16"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b16"), !eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8"), !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b6x16_p32"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, 
"b8x16.b6x16_p32"), !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b4x16_p64"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b4x16_p64"), !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b6x16_p32"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b6x16_p32"), !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b4x16_p64"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b4x16_p64"), !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]); // template DAGs for instruction inputs/output. @@ -6655,7 +5284,7 @@ class WMMA_INSTR _Args> : NVPTXInst<(outs), (ins), "?", []> { Intrinsic Intr = !cast(_Intr); // Concatenate all arguments into a single dag. - dag Args = !foldl((ins), _Args, a, b, !con(a,b)); + dag Args = !foldl((ins), _Args, a, b, !con(a, b)); // Pre-build the pattern to match (intrinsic arg0, arg1, ...). 
dag IntrinsicPattern = BuildPatternI(Intr), Args>.ret; } @@ -6761,7 +5390,7 @@ class MMA_OP_PREDICATES { WMMA_REGINFO Frag = FragA; list ret = !listconcat( FragA.Predicates, - !if(!eq(b1op, ".and.popc"), [hasSM<80>,hasPTX<71>],[]) + !if(!eq(b1op, ".and.popc"), [hasSM<80>, hasPTX<71>], []) ); } // WMMA.MMA @@ -7008,25 +5637,22 @@ def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>; // Tcgen05 intrinsics let isConvergent = true in { -multiclass TCGEN05_ALLOC_INTR { - def NAME : NVPTXInst<(outs), - (ins rc:$dst, Int32Regs:$ncols), - !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"), - [(Intr rc:$dst, Int32Regs:$ncols)]>, +multiclass TCGEN05_ALLOC_INTR { + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$dst, Int32Regs:$ncols), + "tcgen05.alloc.cta_group::" # num # ".sync.aligned" # AS # ".b32", + [(Intr addr:$dst, Int32Regs:$ncols)]>, Requires<[hasTcgen05Instructions]>; } -defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR; -defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR; +defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<"", "1", int_nvvm_tcgen05_alloc_cg1>; +defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<"", "2", int_nvvm_tcgen05_alloc_cg2>; -defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR; -defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR; - -defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR; -defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR; +defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>; multiclass TCGEN05_DEALLOC_INTR { - def NAME : BasicNVPTXInst<(outs), + def "" : BasicNVPTXInst<(outs), (ins Int32Regs:$tmem_addr, Int32Regs:$ncols), "tcgen05.dealloc.cta_group::" # num # ".sync.aligned.b32", [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>, @@ -7036,7 +5662,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1 defm TCGEN05_DEALLOC_CG2: 
TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; multiclass TCGEN05_RELINQ_PERMIT_INTR { - def NAME : BasicNVPTXInst<(outs), (ins), + def "" : BasicNVPTXInst<(outs), (ins), "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", [(Intr)]>, Requires<[hasTcgen05Instructions]>; @@ -7052,36 +5678,33 @@ def tcgen05_wait_st: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligne [(int_nvvm_tcgen05_wait_st)]>, Requires<[hasTcgen05Instructions]>; -multiclass TCGEN05_COMMIT_INTR { - defvar prefix = "tcgen05.commit.cta_group::" # num; - defvar suffix = ".mbarrier::arrive::one.shared::cluster"; +multiclass TCGEN05_COMMIT_INTR { + defvar prefix = "tcgen05.commit.cta_group::" # num #".mbarrier::arrive::one.shared::cluster"; defvar intr_suffix = !if(!eq(AS, "shared"), "_shared", "") # "_cg" # num; defvar Intr = !cast("int_nvvm_tcgen05_commit" # intr_suffix); defvar IntrMC = !cast("int_nvvm_tcgen05_commit_mc" # intr_suffix); - def NAME : NVPTXInst<(outs), (ins rc:$mbar), - !strconcat(prefix, suffix, ".b64 [$mbar];"), - [(Intr rc:$mbar)]>, + def "" : BasicNVPTXInst<(outs), (ins ADDR:$mbar), + prefix # ".b64", + [(Intr addr:$mbar)]>, Requires<[hasTcgen05Instructions]>; - def NAME # _MC : NVPTXInst<(outs), (ins rc:$mbar, Int16Regs:$mc), - !strconcat(prefix, suffix, ".multicast::cluster.b64 [$mbar], $mc;"), - [(IntrMC rc:$mbar, Int16Regs:$mc)]>, + def _MC : BasicNVPTXInst<(outs), (ins ADDR:$mbar, Int16Regs:$mc), + prefix # ".multicast::cluster.b64", + [(IntrMC addr:$mbar, Int16Regs:$mc)]>, Requires<[hasTcgen05Instructions]>; } -defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<"", "1">; +defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">; +defm 
TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">; +defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">; multiclass TCGEN05_SHIFT_INTR { - def NAME : NVPTXInst<(outs), - (ins Int32Regs:$tmem_addr), - !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"), - [(Intr Int32Regs:$tmem_addr)]>, + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr), + "tcgen05.shift.cta_group::" # num # ".down", + [(Intr addr:$tmem_addr)]>, Requires<[hasTcgen05Instructions]>; } defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; @@ -7099,15 +5722,15 @@ multiclass TCGEN05_CP_INTR { defvar IntrCG1 = !cast(intr_prefix # "_cg1"); defvar IntrCG2 = !cast(intr_prefix # "_cg2"); - def NAME # _cg1 : NVPTXInst<(outs), - (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), - "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", - [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + def _cg1 : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm, + [(IntrCG1 addr:$tmem_addr, Int64Regs:$sdesc)]>, Requires<[hasTcgen05Instructions]>; - def NAME # _cg2 : NVPTXInst<(outs), - (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), - "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", - [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + def _cg2 : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::2." 
# shape_mc_asm # fmt_asm, + [(IntrCG2 addr:$tmem_addr, Int64Regs:$sdesc)]>, Requires<[hasTcgen05Instructions]>; } @@ -7222,17 +5845,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in { } // isConvergent // Bulk store instructions - +def st_bulk_imm : TImmLeaf; + def INT_NVVM_ST_BULK_GENERIC : - NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size), - "st.bulk [$dest_addr], $size, 0;", - [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, (i64 0))]>, + BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value), + "st.bulk", + [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>, Requires<[hasSM<100>, hasPTX<86>]>; def INT_NVVM_ST_BULK_SHARED_CTA: - NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size), - "st.bulk.shared::cta [$dest_addr], $size, 0;", - [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, (i64 0))]>, + BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value), + "st.bulk.shared::cta", + [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>, Requires<[hasSM<100>, hasPTX<86>]>; // @@ -7240,17 +5864,15 @@ def INT_NVVM_ST_BULK_SHARED_CTA: // def CLUSTERLAUNCHCONTRL_TRY_CANCEL: - NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), - "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 " # - "[$addr], [$mbar];", + BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), + "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128", [(int_nvvm_clusterlaunchcontrol_try_cancel_async_shared addr:$addr, addr:$mbar)]>, Requires<[hasSM<100>, hasPTX<86>]>; def CLUSTERLAUNCHCONTRL_TRY_CANCEL_MULTICAST: - NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), + BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes" # - ".multicast::cluster::all.b128 " # - "[$addr], [$mbar];", + ".multicast::cluster::all.b128", 
[(int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared addr:$addr, addr:$mbar)]>, Requires<[hasSM<100>, hasArchAccelFeatures, hasPTX<86>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index 9b5fe473521a..320c0fb6950a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -408,426 +408,426 @@ static unsigned suldRegisterToIndexOpcode(unsigned RegOC) { static unsigned sustRegisterToIndexOpcode(unsigned RegOC) { switch (RegOC) { - case NVPTX::SUST_B_1D_B8_CLAMP_R: - return NVPTX::SUST_B_1D_B8_CLAMP_I; - case NVPTX::SUST_B_1D_B16_CLAMP_R: - return NVPTX::SUST_B_1D_B16_CLAMP_I; - case NVPTX::SUST_B_1D_B32_CLAMP_R: - return NVPTX::SUST_B_1D_B32_CLAMP_I; - case NVPTX::SUST_B_1D_B64_CLAMP_R: - return NVPTX::SUST_B_1D_B64_CLAMP_I; - case NVPTX::SUST_B_1D_V2B8_CLAMP_R: - return NVPTX::SUST_B_1D_V2B8_CLAMP_I; - case NVPTX::SUST_B_1D_V2B16_CLAMP_R: - return NVPTX::SUST_B_1D_V2B16_CLAMP_I; - case NVPTX::SUST_B_1D_V2B32_CLAMP_R: - return NVPTX::SUST_B_1D_V2B32_CLAMP_I; - case NVPTX::SUST_B_1D_V2B64_CLAMP_R: - return NVPTX::SUST_B_1D_V2B64_CLAMP_I; - case NVPTX::SUST_B_1D_V4B8_CLAMP_R: - return NVPTX::SUST_B_1D_V4B8_CLAMP_I; - case NVPTX::SUST_B_1D_V4B16_CLAMP_R: - return NVPTX::SUST_B_1D_V4B16_CLAMP_I; - case NVPTX::SUST_B_1D_V4B32_CLAMP_R: - return NVPTX::SUST_B_1D_V4B32_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_I; - case 
NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_I; - case NVPTX::SUST_B_2D_B8_CLAMP_R: - return NVPTX::SUST_B_2D_B8_CLAMP_I; - case NVPTX::SUST_B_2D_B16_CLAMP_R: - return NVPTX::SUST_B_2D_B16_CLAMP_I; - case NVPTX::SUST_B_2D_B32_CLAMP_R: - return NVPTX::SUST_B_2D_B32_CLAMP_I; - case NVPTX::SUST_B_2D_B64_CLAMP_R: - return NVPTX::SUST_B_2D_B64_CLAMP_I; - case NVPTX::SUST_B_2D_V2B8_CLAMP_R: - return NVPTX::SUST_B_2D_V2B8_CLAMP_I; - case NVPTX::SUST_B_2D_V2B16_CLAMP_R: - return NVPTX::SUST_B_2D_V2B16_CLAMP_I; - case NVPTX::SUST_B_2D_V2B32_CLAMP_R: - return NVPTX::SUST_B_2D_V2B32_CLAMP_I; - case NVPTX::SUST_B_2D_V2B64_CLAMP_R: - return NVPTX::SUST_B_2D_V2B64_CLAMP_I; - case NVPTX::SUST_B_2D_V4B8_CLAMP_R: - return NVPTX::SUST_B_2D_V4B8_CLAMP_I; - case NVPTX::SUST_B_2D_V4B16_CLAMP_R: - return NVPTX::SUST_B_2D_V4B16_CLAMP_I; - case NVPTX::SUST_B_2D_V4B32_CLAMP_R: - return NVPTX::SUST_B_2D_V4B32_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_R: - return 
NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_I; - case NVPTX::SUST_B_3D_B8_CLAMP_R: - return NVPTX::SUST_B_3D_B8_CLAMP_I; - case NVPTX::SUST_B_3D_B16_CLAMP_R: - return NVPTX::SUST_B_3D_B16_CLAMP_I; - case NVPTX::SUST_B_3D_B32_CLAMP_R: - return NVPTX::SUST_B_3D_B32_CLAMP_I; - case NVPTX::SUST_B_3D_B64_CLAMP_R: - return NVPTX::SUST_B_3D_B64_CLAMP_I; - case NVPTX::SUST_B_3D_V2B8_CLAMP_R: - return NVPTX::SUST_B_3D_V2B8_CLAMP_I; - case NVPTX::SUST_B_3D_V2B16_CLAMP_R: - return NVPTX::SUST_B_3D_V2B16_CLAMP_I; - case NVPTX::SUST_B_3D_V2B32_CLAMP_R: - return NVPTX::SUST_B_3D_V2B32_CLAMP_I; - case NVPTX::SUST_B_3D_V2B64_CLAMP_R: - return NVPTX::SUST_B_3D_V2B64_CLAMP_I; - case NVPTX::SUST_B_3D_V4B8_CLAMP_R: - return NVPTX::SUST_B_3D_V4B8_CLAMP_I; - case NVPTX::SUST_B_3D_V4B16_CLAMP_R: - return NVPTX::SUST_B_3D_V4B16_CLAMP_I; - case NVPTX::SUST_B_3D_V4B32_CLAMP_R: - return NVPTX::SUST_B_3D_V4B32_CLAMP_I; - case NVPTX::SUST_B_1D_B8_TRAP_R: - return NVPTX::SUST_B_1D_B8_TRAP_I; - case NVPTX::SUST_B_1D_B16_TRAP_R: - return NVPTX::SUST_B_1D_B16_TRAP_I; - case NVPTX::SUST_B_1D_B32_TRAP_R: - return NVPTX::SUST_B_1D_B32_TRAP_I; - case NVPTX::SUST_B_1D_B64_TRAP_R: - return NVPTX::SUST_B_1D_B64_TRAP_I; - case NVPTX::SUST_B_1D_V2B8_TRAP_R: - return NVPTX::SUST_B_1D_V2B8_TRAP_I; - case NVPTX::SUST_B_1D_V2B16_TRAP_R: - return NVPTX::SUST_B_1D_V2B16_TRAP_I; - case NVPTX::SUST_B_1D_V2B32_TRAP_R: - return NVPTX::SUST_B_1D_V2B32_TRAP_I; - case NVPTX::SUST_B_1D_V2B64_TRAP_R: - return NVPTX::SUST_B_1D_V2B64_TRAP_I; - case NVPTX::SUST_B_1D_V4B8_TRAP_R: - return NVPTX::SUST_B_1D_V4B8_TRAP_I; - case NVPTX::SUST_B_1D_V4B16_TRAP_R: - return NVPTX::SUST_B_1D_V4B16_TRAP_I; - case NVPTX::SUST_B_1D_V4B32_TRAP_R: - return 
NVPTX::SUST_B_1D_V4B32_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B64_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B64_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_B_2D_B8_TRAP_R: - return NVPTX::SUST_B_2D_B8_TRAP_I; - case NVPTX::SUST_B_2D_B16_TRAP_R: - return NVPTX::SUST_B_2D_B16_TRAP_I; - case NVPTX::SUST_B_2D_B32_TRAP_R: - return NVPTX::SUST_B_2D_B32_TRAP_I; - case NVPTX::SUST_B_2D_B64_TRAP_R: - return NVPTX::SUST_B_2D_B64_TRAP_I; - case NVPTX::SUST_B_2D_V2B8_TRAP_R: - return NVPTX::SUST_B_2D_V2B8_TRAP_I; - case NVPTX::SUST_B_2D_V2B16_TRAP_R: - return NVPTX::SUST_B_2D_V2B16_TRAP_I; - case NVPTX::SUST_B_2D_V2B32_TRAP_R: - return NVPTX::SUST_B_2D_V2B32_TRAP_I; - case NVPTX::SUST_B_2D_V2B64_TRAP_R: - return NVPTX::SUST_B_2D_V2B64_TRAP_I; - case NVPTX::SUST_B_2D_V4B8_TRAP_R: - return NVPTX::SUST_B_2D_V4B8_TRAP_I; - case NVPTX::SUST_B_2D_V4B16_TRAP_R: - return NVPTX::SUST_B_2D_V4B16_TRAP_I; - case NVPTX::SUST_B_2D_V4B32_TRAP_R: - return NVPTX::SUST_B_2D_V4B32_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B16_TRAP_R: - return 
NVPTX::SUST_B_2D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B64_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B64_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_B_3D_B8_TRAP_R: - return NVPTX::SUST_B_3D_B8_TRAP_I; - case NVPTX::SUST_B_3D_B16_TRAP_R: - return NVPTX::SUST_B_3D_B16_TRAP_I; - case NVPTX::SUST_B_3D_B32_TRAP_R: - return NVPTX::SUST_B_3D_B32_TRAP_I; - case NVPTX::SUST_B_3D_B64_TRAP_R: - return NVPTX::SUST_B_3D_B64_TRAP_I; - case NVPTX::SUST_B_3D_V2B8_TRAP_R: - return NVPTX::SUST_B_3D_V2B8_TRAP_I; - case NVPTX::SUST_B_3D_V2B16_TRAP_R: - return NVPTX::SUST_B_3D_V2B16_TRAP_I; - case NVPTX::SUST_B_3D_V2B32_TRAP_R: - return NVPTX::SUST_B_3D_V2B32_TRAP_I; - case NVPTX::SUST_B_3D_V2B64_TRAP_R: - return NVPTX::SUST_B_3D_V2B64_TRAP_I; - case NVPTX::SUST_B_3D_V4B8_TRAP_R: - return NVPTX::SUST_B_3D_V4B8_TRAP_I; - case NVPTX::SUST_B_3D_V4B16_TRAP_R: - return NVPTX::SUST_B_3D_V4B16_TRAP_I; - case NVPTX::SUST_B_3D_V4B32_TRAP_R: - return NVPTX::SUST_B_3D_V4B32_TRAP_I; - case NVPTX::SUST_B_1D_B8_ZERO_R: - return NVPTX::SUST_B_1D_B8_ZERO_I; - case NVPTX::SUST_B_1D_B16_ZERO_R: - return NVPTX::SUST_B_1D_B16_ZERO_I; - case NVPTX::SUST_B_1D_B32_ZERO_R: - return NVPTX::SUST_B_1D_B32_ZERO_I; - case NVPTX::SUST_B_1D_B64_ZERO_R: - return NVPTX::SUST_B_1D_B64_ZERO_I; - case NVPTX::SUST_B_1D_V2B8_ZERO_R: 
- return NVPTX::SUST_B_1D_V2B8_ZERO_I; - case NVPTX::SUST_B_1D_V2B16_ZERO_R: - return NVPTX::SUST_B_1D_V2B16_ZERO_I; - case NVPTX::SUST_B_1D_V2B32_ZERO_R: - return NVPTX::SUST_B_1D_V2B32_ZERO_I; - case NVPTX::SUST_B_1D_V2B64_ZERO_R: - return NVPTX::SUST_B_1D_V2B64_ZERO_I; - case NVPTX::SUST_B_1D_V4B8_ZERO_R: - return NVPTX::SUST_B_1D_V4B8_ZERO_I; - case NVPTX::SUST_B_1D_V4B16_ZERO_R: - return NVPTX::SUST_B_1D_V4B16_ZERO_I; - case NVPTX::SUST_B_1D_V4B32_ZERO_R: - return NVPTX::SUST_B_1D_V4B32_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B8_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B8_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B16_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B16_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B32_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B32_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B64_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B64_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_I; - case NVPTX::SUST_B_2D_B8_ZERO_R: - return NVPTX::SUST_B_2D_B8_ZERO_I; - case NVPTX::SUST_B_2D_B16_ZERO_R: - return NVPTX::SUST_B_2D_B16_ZERO_I; - case NVPTX::SUST_B_2D_B32_ZERO_R: - return NVPTX::SUST_B_2D_B32_ZERO_I; - case NVPTX::SUST_B_2D_B64_ZERO_R: - return NVPTX::SUST_B_2D_B64_ZERO_I; - case NVPTX::SUST_B_2D_V2B8_ZERO_R: - return NVPTX::SUST_B_2D_V2B8_ZERO_I; - case NVPTX::SUST_B_2D_V2B16_ZERO_R: - return NVPTX::SUST_B_2D_V2B16_ZERO_I; - case NVPTX::SUST_B_2D_V2B32_ZERO_R: - return NVPTX::SUST_B_2D_V2B32_ZERO_I; 
- case NVPTX::SUST_B_2D_V2B64_ZERO_R: - return NVPTX::SUST_B_2D_V2B64_ZERO_I; - case NVPTX::SUST_B_2D_V4B8_ZERO_R: - return NVPTX::SUST_B_2D_V4B8_ZERO_I; - case NVPTX::SUST_B_2D_V4B16_ZERO_R: - return NVPTX::SUST_B_2D_V4B16_ZERO_I; - case NVPTX::SUST_B_2D_V4B32_ZERO_R: - return NVPTX::SUST_B_2D_V4B32_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B8_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B8_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B16_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B16_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B32_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B32_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B64_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B64_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_I; - case NVPTX::SUST_B_3D_B8_ZERO_R: - return NVPTX::SUST_B_3D_B8_ZERO_I; - case NVPTX::SUST_B_3D_B16_ZERO_R: - return NVPTX::SUST_B_3D_B16_ZERO_I; - case NVPTX::SUST_B_3D_B32_ZERO_R: - return NVPTX::SUST_B_3D_B32_ZERO_I; - case NVPTX::SUST_B_3D_B64_ZERO_R: - return NVPTX::SUST_B_3D_B64_ZERO_I; - case NVPTX::SUST_B_3D_V2B8_ZERO_R: - return NVPTX::SUST_B_3D_V2B8_ZERO_I; - case NVPTX::SUST_B_3D_V2B16_ZERO_R: - return NVPTX::SUST_B_3D_V2B16_ZERO_I; - case NVPTX::SUST_B_3D_V2B32_ZERO_R: - return NVPTX::SUST_B_3D_V2B32_ZERO_I; - case NVPTX::SUST_B_3D_V2B64_ZERO_R: - return NVPTX::SUST_B_3D_V2B64_ZERO_I; - case NVPTX::SUST_B_3D_V4B8_ZERO_R: - return NVPTX::SUST_B_3D_V4B8_ZERO_I; - case NVPTX::SUST_B_3D_V4B16_ZERO_R: - 
return NVPTX::SUST_B_3D_V4B16_ZERO_I; - case NVPTX::SUST_B_3D_V4B32_ZERO_R: - return NVPTX::SUST_B_3D_V4B32_ZERO_I; - case NVPTX::SUST_P_1D_B8_TRAP_R: - return NVPTX::SUST_P_1D_B8_TRAP_I; - case NVPTX::SUST_P_1D_B16_TRAP_R: - return NVPTX::SUST_P_1D_B16_TRAP_I; - case NVPTX::SUST_P_1D_B32_TRAP_R: - return NVPTX::SUST_P_1D_B32_TRAP_I; - case NVPTX::SUST_P_1D_V2B8_TRAP_R: - return NVPTX::SUST_P_1D_V2B8_TRAP_I; - case NVPTX::SUST_P_1D_V2B16_TRAP_R: - return NVPTX::SUST_P_1D_V2B16_TRAP_I; - case NVPTX::SUST_P_1D_V2B32_TRAP_R: - return NVPTX::SUST_P_1D_V2B32_TRAP_I; - case NVPTX::SUST_P_1D_V4B8_TRAP_R: - return NVPTX::SUST_P_1D_V4B8_TRAP_I; - case NVPTX::SUST_P_1D_V4B16_TRAP_R: - return NVPTX::SUST_P_1D_V4B16_TRAP_I; - case NVPTX::SUST_P_1D_V4B32_TRAP_R: - return NVPTX::SUST_P_1D_V4B32_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_P_2D_B8_TRAP_R: - return NVPTX::SUST_P_2D_B8_TRAP_I; - case NVPTX::SUST_P_2D_B16_TRAP_R: - return NVPTX::SUST_P_2D_B16_TRAP_I; - case NVPTX::SUST_P_2D_B32_TRAP_R: - return NVPTX::SUST_P_2D_B32_TRAP_I; - case NVPTX::SUST_P_2D_V2B8_TRAP_R: - return NVPTX::SUST_P_2D_V2B8_TRAP_I; - case NVPTX::SUST_P_2D_V2B16_TRAP_R: - return NVPTX::SUST_P_2D_V2B16_TRAP_I; - case 
NVPTX::SUST_P_2D_V2B32_TRAP_R: - return NVPTX::SUST_P_2D_V2B32_TRAP_I; - case NVPTX::SUST_P_2D_V4B8_TRAP_R: - return NVPTX::SUST_P_2D_V4B8_TRAP_I; - case NVPTX::SUST_P_2D_V4B16_TRAP_R: - return NVPTX::SUST_P_2D_V4B16_TRAP_I; - case NVPTX::SUST_P_2D_V4B32_TRAP_R: - return NVPTX::SUST_P_2D_V4B32_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_P_3D_B8_TRAP_R: - return NVPTX::SUST_P_3D_B8_TRAP_I; - case NVPTX::SUST_P_3D_B16_TRAP_R: - return NVPTX::SUST_P_3D_B16_TRAP_I; - case NVPTX::SUST_P_3D_B32_TRAP_R: - return NVPTX::SUST_P_3D_B32_TRAP_I; - case NVPTX::SUST_P_3D_V2B8_TRAP_R: - return NVPTX::SUST_P_3D_V2B8_TRAP_I; - case NVPTX::SUST_P_3D_V2B16_TRAP_R: - return NVPTX::SUST_P_3D_V2B16_TRAP_I; - case NVPTX::SUST_P_3D_V2B32_TRAP_R: - return NVPTX::SUST_P_3D_V2B32_TRAP_I; - case NVPTX::SUST_P_3D_V4B8_TRAP_R: - return NVPTX::SUST_P_3D_V4B8_TRAP_I; - case NVPTX::SUST_P_3D_V4B16_TRAP_R: - return NVPTX::SUST_P_3D_V4B16_TRAP_I; - case NVPTX::SUST_P_3D_V4B32_TRAP_R: - return NVPTX::SUST_P_3D_V4B32_TRAP_I; + case NVPTX::SUST_B_1D_I8_CLAMP_R: + return NVPTX::SUST_B_1D_I8_CLAMP_I; + case NVPTX::SUST_B_1D_I16_CLAMP_R: + return NVPTX::SUST_B_1D_I16_CLAMP_I; + case NVPTX::SUST_B_1D_I32_CLAMP_R: + return 
NVPTX::SUST_B_1D_I32_CLAMP_I; + case NVPTX::SUST_B_1D_I64_CLAMP_R: + return NVPTX::SUST_B_1D_I64_CLAMP_I; + case NVPTX::SUST_B_1D_V2I8_CLAMP_R: + return NVPTX::SUST_B_1D_V2I8_CLAMP_I; + case NVPTX::SUST_B_1D_V2I16_CLAMP_R: + return NVPTX::SUST_B_1D_V2I16_CLAMP_I; + case NVPTX::SUST_B_1D_V2I32_CLAMP_R: + return NVPTX::SUST_B_1D_V2I32_CLAMP_I; + case NVPTX::SUST_B_1D_V2I64_CLAMP_R: + return NVPTX::SUST_B_1D_V2I64_CLAMP_I; + case NVPTX::SUST_B_1D_V4I8_CLAMP_R: + return NVPTX::SUST_B_1D_V4I8_CLAMP_I; + case NVPTX::SUST_B_1D_V4I16_CLAMP_R: + return NVPTX::SUST_B_1D_V4I16_CLAMP_I; + case NVPTX::SUST_B_1D_V4I32_CLAMP_R: + return NVPTX::SUST_B_1D_V4I32_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_I; + case NVPTX::SUST_B_2D_I8_CLAMP_R: + return NVPTX::SUST_B_2D_I8_CLAMP_I; + case NVPTX::SUST_B_2D_I16_CLAMP_R: + return NVPTX::SUST_B_2D_I16_CLAMP_I; + case NVPTX::SUST_B_2D_I32_CLAMP_R: + return NVPTX::SUST_B_2D_I32_CLAMP_I; + case NVPTX::SUST_B_2D_I64_CLAMP_R: + return NVPTX::SUST_B_2D_I64_CLAMP_I; + case NVPTX::SUST_B_2D_V2I8_CLAMP_R: + 
return NVPTX::SUST_B_2D_V2I8_CLAMP_I; + case NVPTX::SUST_B_2D_V2I16_CLAMP_R: + return NVPTX::SUST_B_2D_V2I16_CLAMP_I; + case NVPTX::SUST_B_2D_V2I32_CLAMP_R: + return NVPTX::SUST_B_2D_V2I32_CLAMP_I; + case NVPTX::SUST_B_2D_V2I64_CLAMP_R: + return NVPTX::SUST_B_2D_V2I64_CLAMP_I; + case NVPTX::SUST_B_2D_V4I8_CLAMP_R: + return NVPTX::SUST_B_2D_V4I8_CLAMP_I; + case NVPTX::SUST_B_2D_V4I16_CLAMP_R: + return NVPTX::SUST_B_2D_V4I16_CLAMP_I; + case NVPTX::SUST_B_2D_V4I32_CLAMP_R: + return NVPTX::SUST_B_2D_V4I32_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_I; + case NVPTX::SUST_B_3D_I8_CLAMP_R: + return NVPTX::SUST_B_3D_I8_CLAMP_I; + case NVPTX::SUST_B_3D_I16_CLAMP_R: + return NVPTX::SUST_B_3D_I16_CLAMP_I; + case NVPTX::SUST_B_3D_I32_CLAMP_R: + return NVPTX::SUST_B_3D_I32_CLAMP_I; + case NVPTX::SUST_B_3D_I64_CLAMP_R: + return NVPTX::SUST_B_3D_I64_CLAMP_I; + case NVPTX::SUST_B_3D_V2I8_CLAMP_R: + return NVPTX::SUST_B_3D_V2I8_CLAMP_I; + case NVPTX::SUST_B_3D_V2I16_CLAMP_R: + return NVPTX::SUST_B_3D_V2I16_CLAMP_I; + case 
NVPTX::SUST_B_3D_V2I32_CLAMP_R: + return NVPTX::SUST_B_3D_V2I32_CLAMP_I; + case NVPTX::SUST_B_3D_V2I64_CLAMP_R: + return NVPTX::SUST_B_3D_V2I64_CLAMP_I; + case NVPTX::SUST_B_3D_V4I8_CLAMP_R: + return NVPTX::SUST_B_3D_V4I8_CLAMP_I; + case NVPTX::SUST_B_3D_V4I16_CLAMP_R: + return NVPTX::SUST_B_3D_V4I16_CLAMP_I; + case NVPTX::SUST_B_3D_V4I32_CLAMP_R: + return NVPTX::SUST_B_3D_V4I32_CLAMP_I; + case NVPTX::SUST_B_1D_I8_TRAP_R: + return NVPTX::SUST_B_1D_I8_TRAP_I; + case NVPTX::SUST_B_1D_I16_TRAP_R: + return NVPTX::SUST_B_1D_I16_TRAP_I; + case NVPTX::SUST_B_1D_I32_TRAP_R: + return NVPTX::SUST_B_1D_I32_TRAP_I; + case NVPTX::SUST_B_1D_I64_TRAP_R: + return NVPTX::SUST_B_1D_I64_TRAP_I; + case NVPTX::SUST_B_1D_V2I8_TRAP_R: + return NVPTX::SUST_B_1D_V2I8_TRAP_I; + case NVPTX::SUST_B_1D_V2I16_TRAP_R: + return NVPTX::SUST_B_1D_V2I16_TRAP_I; + case NVPTX::SUST_B_1D_V2I32_TRAP_R: + return NVPTX::SUST_B_1D_V2I32_TRAP_I; + case NVPTX::SUST_B_1D_V2I64_TRAP_R: + return NVPTX::SUST_B_1D_V2I64_TRAP_I; + case NVPTX::SUST_B_1D_V4I8_TRAP_R: + return NVPTX::SUST_B_1D_V4I8_TRAP_I; + case NVPTX::SUST_B_1D_V4I16_TRAP_R: + return NVPTX::SUST_B_1D_V4I16_TRAP_I; + case NVPTX::SUST_B_1D_V4I32_TRAP_R: + return NVPTX::SUST_B_1D_V4I32_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I64_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I64_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_R: + return 
NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_B_2D_I8_TRAP_R: + return NVPTX::SUST_B_2D_I8_TRAP_I; + case NVPTX::SUST_B_2D_I16_TRAP_R: + return NVPTX::SUST_B_2D_I16_TRAP_I; + case NVPTX::SUST_B_2D_I32_TRAP_R: + return NVPTX::SUST_B_2D_I32_TRAP_I; + case NVPTX::SUST_B_2D_I64_TRAP_R: + return NVPTX::SUST_B_2D_I64_TRAP_I; + case NVPTX::SUST_B_2D_V2I8_TRAP_R: + return NVPTX::SUST_B_2D_V2I8_TRAP_I; + case NVPTX::SUST_B_2D_V2I16_TRAP_R: + return NVPTX::SUST_B_2D_V2I16_TRAP_I; + case NVPTX::SUST_B_2D_V2I32_TRAP_R: + return NVPTX::SUST_B_2D_V2I32_TRAP_I; + case NVPTX::SUST_B_2D_V2I64_TRAP_R: + return NVPTX::SUST_B_2D_V2I64_TRAP_I; + case NVPTX::SUST_B_2D_V4I8_TRAP_R: + return NVPTX::SUST_B_2D_V4I8_TRAP_I; + case NVPTX::SUST_B_2D_V4I16_TRAP_R: + return NVPTX::SUST_B_2D_V4I16_TRAP_I; + case NVPTX::SUST_B_2D_V4I32_TRAP_R: + return NVPTX::SUST_B_2D_V4I32_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I64_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I64_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_R: + return 
NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_B_3D_I8_TRAP_R: + return NVPTX::SUST_B_3D_I8_TRAP_I; + case NVPTX::SUST_B_3D_I16_TRAP_R: + return NVPTX::SUST_B_3D_I16_TRAP_I; + case NVPTX::SUST_B_3D_I32_TRAP_R: + return NVPTX::SUST_B_3D_I32_TRAP_I; + case NVPTX::SUST_B_3D_I64_TRAP_R: + return NVPTX::SUST_B_3D_I64_TRAP_I; + case NVPTX::SUST_B_3D_V2I8_TRAP_R: + return NVPTX::SUST_B_3D_V2I8_TRAP_I; + case NVPTX::SUST_B_3D_V2I16_TRAP_R: + return NVPTX::SUST_B_3D_V2I16_TRAP_I; + case NVPTX::SUST_B_3D_V2I32_TRAP_R: + return NVPTX::SUST_B_3D_V2I32_TRAP_I; + case NVPTX::SUST_B_3D_V2I64_TRAP_R: + return NVPTX::SUST_B_3D_V2I64_TRAP_I; + case NVPTX::SUST_B_3D_V4I8_TRAP_R: + return NVPTX::SUST_B_3D_V4I8_TRAP_I; + case NVPTX::SUST_B_3D_V4I16_TRAP_R: + return NVPTX::SUST_B_3D_V4I16_TRAP_I; + case NVPTX::SUST_B_3D_V4I32_TRAP_R: + return NVPTX::SUST_B_3D_V4I32_TRAP_I; + case NVPTX::SUST_B_1D_I8_ZERO_R: + return NVPTX::SUST_B_1D_I8_ZERO_I; + case NVPTX::SUST_B_1D_I16_ZERO_R: + return NVPTX::SUST_B_1D_I16_ZERO_I; + case NVPTX::SUST_B_1D_I32_ZERO_R: + return NVPTX::SUST_B_1D_I32_ZERO_I; + case NVPTX::SUST_B_1D_I64_ZERO_R: + return NVPTX::SUST_B_1D_I64_ZERO_I; + case NVPTX::SUST_B_1D_V2I8_ZERO_R: + return NVPTX::SUST_B_1D_V2I8_ZERO_I; + case NVPTX::SUST_B_1D_V2I16_ZERO_R: + return NVPTX::SUST_B_1D_V2I16_ZERO_I; + case NVPTX::SUST_B_1D_V2I32_ZERO_R: + return NVPTX::SUST_B_1D_V2I32_ZERO_I; + case NVPTX::SUST_B_1D_V2I64_ZERO_R: + return NVPTX::SUST_B_1D_V2I64_ZERO_I; + case NVPTX::SUST_B_1D_V4I8_ZERO_R: + return NVPTX::SUST_B_1D_V4I8_ZERO_I; + case NVPTX::SUST_B_1D_V4I16_ZERO_R: + return NVPTX::SUST_B_1D_V4I16_ZERO_I; + case NVPTX::SUST_B_1D_V4I32_ZERO_R: + return NVPTX::SUST_B_1D_V4I32_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I8_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I8_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I16_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I16_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I32_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I32_ZERO_I; + case 
NVPTX::SUST_B_1D_ARRAY_I64_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I64_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_I; + case NVPTX::SUST_B_2D_I8_ZERO_R: + return NVPTX::SUST_B_2D_I8_ZERO_I; + case NVPTX::SUST_B_2D_I16_ZERO_R: + return NVPTX::SUST_B_2D_I16_ZERO_I; + case NVPTX::SUST_B_2D_I32_ZERO_R: + return NVPTX::SUST_B_2D_I32_ZERO_I; + case NVPTX::SUST_B_2D_I64_ZERO_R: + return NVPTX::SUST_B_2D_I64_ZERO_I; + case NVPTX::SUST_B_2D_V2I8_ZERO_R: + return NVPTX::SUST_B_2D_V2I8_ZERO_I; + case NVPTX::SUST_B_2D_V2I16_ZERO_R: + return NVPTX::SUST_B_2D_V2I16_ZERO_I; + case NVPTX::SUST_B_2D_V2I32_ZERO_R: + return NVPTX::SUST_B_2D_V2I32_ZERO_I; + case NVPTX::SUST_B_2D_V2I64_ZERO_R: + return NVPTX::SUST_B_2D_V2I64_ZERO_I; + case NVPTX::SUST_B_2D_V4I8_ZERO_R: + return NVPTX::SUST_B_2D_V4I8_ZERO_I; + case NVPTX::SUST_B_2D_V4I16_ZERO_R: + return NVPTX::SUST_B_2D_V4I16_ZERO_I; + case NVPTX::SUST_B_2D_V4I32_ZERO_R: + return NVPTX::SUST_B_2D_V4I32_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I8_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I8_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I16_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I16_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I32_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I32_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I64_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I64_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_I; + case 
NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_I; + case NVPTX::SUST_B_3D_I8_ZERO_R: + return NVPTX::SUST_B_3D_I8_ZERO_I; + case NVPTX::SUST_B_3D_I16_ZERO_R: + return NVPTX::SUST_B_3D_I16_ZERO_I; + case NVPTX::SUST_B_3D_I32_ZERO_R: + return NVPTX::SUST_B_3D_I32_ZERO_I; + case NVPTX::SUST_B_3D_I64_ZERO_R: + return NVPTX::SUST_B_3D_I64_ZERO_I; + case NVPTX::SUST_B_3D_V2I8_ZERO_R: + return NVPTX::SUST_B_3D_V2I8_ZERO_I; + case NVPTX::SUST_B_3D_V2I16_ZERO_R: + return NVPTX::SUST_B_3D_V2I16_ZERO_I; + case NVPTX::SUST_B_3D_V2I32_ZERO_R: + return NVPTX::SUST_B_3D_V2I32_ZERO_I; + case NVPTX::SUST_B_3D_V2I64_ZERO_R: + return NVPTX::SUST_B_3D_V2I64_ZERO_I; + case NVPTX::SUST_B_3D_V4I8_ZERO_R: + return NVPTX::SUST_B_3D_V4I8_ZERO_I; + case NVPTX::SUST_B_3D_V4I16_ZERO_R: + return NVPTX::SUST_B_3D_V4I16_ZERO_I; + case NVPTX::SUST_B_3D_V4I32_ZERO_R: + return NVPTX::SUST_B_3D_V4I32_ZERO_I; + case NVPTX::SUST_P_1D_I8_TRAP_R: + return NVPTX::SUST_P_1D_I8_TRAP_I; + case NVPTX::SUST_P_1D_I16_TRAP_R: + return NVPTX::SUST_P_1D_I16_TRAP_I; + case NVPTX::SUST_P_1D_I32_TRAP_R: + return NVPTX::SUST_P_1D_I32_TRAP_I; + case NVPTX::SUST_P_1D_V2I8_TRAP_R: + return NVPTX::SUST_P_1D_V2I8_TRAP_I; + case NVPTX::SUST_P_1D_V2I16_TRAP_R: + return NVPTX::SUST_P_1D_V2I16_TRAP_I; + case NVPTX::SUST_P_1D_V2I32_TRAP_R: + return NVPTX::SUST_P_1D_V2I32_TRAP_I; + case NVPTX::SUST_P_1D_V4I8_TRAP_R: + return NVPTX::SUST_P_1D_V4I8_TRAP_I; + case NVPTX::SUST_P_1D_V4I16_TRAP_R: + return NVPTX::SUST_P_1D_V4I16_TRAP_I; + case 
NVPTX::SUST_P_1D_V4I32_TRAP_R: + return NVPTX::SUST_P_1D_V4I32_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_P_2D_I8_TRAP_R: + return NVPTX::SUST_P_2D_I8_TRAP_I; + case NVPTX::SUST_P_2D_I16_TRAP_R: + return NVPTX::SUST_P_2D_I16_TRAP_I; + case NVPTX::SUST_P_2D_I32_TRAP_R: + return NVPTX::SUST_P_2D_I32_TRAP_I; + case NVPTX::SUST_P_2D_V2I8_TRAP_R: + return NVPTX::SUST_P_2D_V2I8_TRAP_I; + case NVPTX::SUST_P_2D_V2I16_TRAP_R: + return NVPTX::SUST_P_2D_V2I16_TRAP_I; + case NVPTX::SUST_P_2D_V2I32_TRAP_R: + return NVPTX::SUST_P_2D_V2I32_TRAP_I; + case NVPTX::SUST_P_2D_V4I8_TRAP_R: + return NVPTX::SUST_P_2D_V4I8_TRAP_I; + case NVPTX::SUST_P_2D_V4I16_TRAP_R: + return NVPTX::SUST_P_2D_V4I16_TRAP_I; + case NVPTX::SUST_P_2D_V4I32_TRAP_R: + return NVPTX::SUST_P_2D_V4I32_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_I; + case 
NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_P_3D_I8_TRAP_R: + return NVPTX::SUST_P_3D_I8_TRAP_I; + case NVPTX::SUST_P_3D_I16_TRAP_R: + return NVPTX::SUST_P_3D_I16_TRAP_I; + case NVPTX::SUST_P_3D_I32_TRAP_R: + return NVPTX::SUST_P_3D_I32_TRAP_I; + case NVPTX::SUST_P_3D_V2I8_TRAP_R: + return NVPTX::SUST_P_3D_V2I8_TRAP_I; + case NVPTX::SUST_P_3D_V2I16_TRAP_R: + return NVPTX::SUST_P_3D_V2I16_TRAP_I; + case NVPTX::SUST_P_3D_V2I32_TRAP_R: + return NVPTX::SUST_P_3D_V2I32_TRAP_I; + case NVPTX::SUST_P_3D_V4I8_TRAP_R: + return NVPTX::SUST_P_3D_V4I8_TRAP_I; + case NVPTX::SUST_P_3D_V4I16_TRAP_R: + return NVPTX::SUST_P_3D_V4I16_TRAP_I; + case NVPTX::SUST_P_3D_V4I32_TRAP_R: + return NVPTX::SUST_P_3D_V4I32_TRAP_I; default: llvm_unreachable("Unhandled SUST opcode"); } From ace356bc9777e6a5b5aa0ba2335d2546ac6f330e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Jun 2025 20:45:32 +0100 Subject: [PATCH 0040/1322] [VPlan] Always verify VPCanonicalIVPHIRecipe placement (NFC). Loop regions are dissolved since dcef154b5caf6556e69bb1, remove the check for VerifyLate and corresponding TODO. --- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 45010d002158..fba4a68f4a27 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -429,8 +429,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return false; } - // TODO: Remove once loop regions are dissolved before execution. 
- if (!VerifyLate && !isa(&*Entry->begin())) { + if (!isa(&*Entry->begin())) { errs() << "VPlan vector loop header does not start with a " "VPCanonicalIVPHIRecipe\n"; return false; From ebc90d50b88a7c46634ea21e40ddb25c679ac874 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:03 -0700 Subject: [PATCH 0041/1322] [SandboxVectorizer] Use llvm::find (NFC) (#143724) llvm::find allows us to pass a range. --- .../llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h index d4cb34647cf5..6d2144b14bb0 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h @@ -68,7 +68,7 @@ public: /// the seeds in a bundle. This allows constant time evaluation /// and "removal" from the list. void setUsed(Instruction *I) { - auto It = std::find(begin(), end(), I); + auto It = llvm::find(*this, I); assert(It != end() && "Instruction not in the bundle!"); auto Idx = It - begin(); setUsed(Idx, 1, /*VerifyUnused=*/false); From e266d6a5da6871c89747416c70a4a39181b594fb Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:11 -0700 Subject: [PATCH 0042/1322] [Format] Use llvm::min_element (NFC) (#143725) llvm::min_element allows us to pass a range. --- clang/lib/Format/MacroCallReconstructor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/MacroCallReconstructor.cpp b/clang/lib/Format/MacroCallReconstructor.cpp index 116bbad320e1..895d9f93dfce 100644 --- a/clang/lib/Format/MacroCallReconstructor.cpp +++ b/clang/lib/Format/MacroCallReconstructor.cpp @@ -528,10 +528,10 @@ MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line, // 1. 
One level below the current line's level. // 2. At the correct level relative to each other. unsigned MinChildLevel = - std::min_element(N->Children.begin(), N->Children.end(), - [](const auto &E1, const auto &E2) { - return E1->Level < E2->Level; - }) + llvm::min_element(N->Children, + [](const auto &E1, const auto &E2) { + return E1->Level < E2->Level; + }) ->get() ->Level; for (const auto &Child : N->Children) { From c1d21f44340901f6a23ae7eb7c5379f5ad197b27 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:19 -0700 Subject: [PATCH 0043/1322] [lld] Use std::tie to implement comparison operators (NFC) (#143726) std::tie facilitates lexicographical comparisons through std::tuple's built-in operator< and operator>. --- lld/ELF/SyntheticSections.cpp | 7 ++----- lld/MachO/UnwindInfoSection.cpp | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 785a56cdb349..0a9c7a081eb8 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1939,11 +1939,8 @@ bool AndroidPackedRelocationSection::updateAllocSize(Ctx &ctx) { // For Rela, we also want to sort by r_addend when r_info is the same. This // enables us to group by r_addend as well. llvm::sort(nonRelatives, [](const Elf_Rela &a, const Elf_Rela &b) { - if (a.r_info != b.r_info) - return a.r_info < b.r_info; - if (a.r_addend != b.r_addend) - return a.r_addend < b.r_addend; - return a.r_offset < b.r_offset; + return std::tie(a.r_info, a.r_addend, a.r_offset) < + std::tie(b.r_info, b.r_addend, b.r_offset); }); // Group relocations with the same r_info. 
Note that each group emits a group diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp index 624464e41d77..6e9f6c2aba74 100644 --- a/lld/MachO/UnwindInfoSection.cpp +++ b/lld/MachO/UnwindInfoSection.cpp @@ -535,11 +535,9 @@ void UnwindInfoSectionImpl::finalize() { llvm::sort(commonEncodings, [](const std::pair &a, const std::pair &b) { - if (a.second == b.second) - // When frequencies match, secondarily sort on encoding - // to maintain parity with validate-unwind-info.py - return a.first > b.first; - return a.second > b.second; + // When frequencies match, secondarily sort on encoding + // to maintain parity with validate-unwind-info.py + return std::tie(a.second, a.first) > std::tie(b.second, b.first); }); // Truncate the vector to 127 elements. From 8da1ac98efa0d315824a92d8b563299eccc3e0f1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:27 -0700 Subject: [PATCH 0044/1322] [llvm] Use std::tie to implement operator< (NFC) (#143728) std::tie facilitates lexicographical comparisons through std::tuple's built-in operator<. 
--- .../ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 12 +++--------- llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp | 8 ++------ 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 24b03a058981..89b20978c40e 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -202,15 +202,9 @@ public: IsStubThumb == Other.IsStubThumb; } inline bool operator<(const RelocationValueRef &Other) const { - if (SectionID != Other.SectionID) - return SectionID < Other.SectionID; - if (Offset != Other.Offset) - return Offset < Other.Offset; - if (Addend != Other.Addend) - return Addend < Other.Addend; - if (IsStubThumb != Other.IsStubThumb) - return IsStubThumb < Other.IsStubThumb; - return SymbolName < Other.SymbolName; + return std::tie(SectionID, Offset, Addend, IsStubThumb, SymbolName) < + std::tie(Other.SectionID, Other.Offset, Other.Addend, + Other.IsStubThumb, Other.SymbolName); } }; diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index f38e7b879e5f..5dde47ab3de5 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -253,7 +253,7 @@ namespace { bool operator!=(Register R) const { return !operator==(R); } bool operator<(Register R) const { // For std::map. 
- return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub); + return std::tie(Reg, Sub) < std::tie(R.Reg, R.Sub); } llvm::Register Reg; unsigned Sub = 0; @@ -298,11 +298,7 @@ namespace { return !operator==(Ex); } bool operator<(const ExtExpr &Ex) const { - if (Rs != Ex.Rs) - return Rs < Ex.Rs; - if (S != Ex.S) - return S < Ex.S; - return !Neg && Ex.Neg; + return std::tie(Rs, S, Neg) < std::tie(Ex.Rs, Ex.S, Ex.Neg); } }; From 43c35e858ccae05d69151ccf9712a725aae37b52 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:35 -0700 Subject: [PATCH 0045/1322] [mlir] Simplify calls to *Map::{insert,try_emplace} (NFC) (#143729) This patch simplifies code by removing the values from insert/try_emplace. Note that default values inserted by try_emplace are immediately overridden in all these cases. --- mlir/lib/IR/AsmPrinter.cpp | 3 +-- mlir/lib/IR/SymbolTable.cpp | 2 +- mlir/lib/Transforms/Utils/CFGToSCF.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index fc1806900c0a..c7cc6a02ad20 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -1146,8 +1146,7 @@ template std::pair AliasInitializer::visitImpl( T value, llvm::MapVector &aliases, bool canBeDeferred, PrintArgs &&...printArgs) { - auto [it, inserted] = - aliases.insert({value.getAsOpaquePointer(), InProgressAliasInfo()}); + auto [it, inserted] = aliases.try_emplace(value.getAsOpaquePointer()); size_t aliasIndex = std::distance(aliases.begin(), it); if (!inserted) { // Make sure that the alias isn't deferred if we don't permit it. diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp index 075a0ba15d7c..aaa4d5617eb4 100644 --- a/mlir/lib/IR/SymbolTable.cpp +++ b/mlir/lib/IR/SymbolTable.cpp @@ -1100,7 +1100,7 @@ void SymbolUserMap::replaceAllUsesWith(Operation *symbol, if (newSymbol != symbol) { // Transfer over the users to the new symbol.
The reference to the old one // is fetched again as the iterator is invalidated during the insertion. - auto newIt = symbolToUsers.try_emplace(newSymbol, SetVector{}); + auto newIt = symbolToUsers.try_emplace(newSymbol); auto oldIt = symbolToUsers.find(symbol); assert(oldIt != symbolToUsers.end() && "missing old users list"); if (newIt.second) diff --git a/mlir/lib/Transforms/Utils/CFGToSCF.cpp b/mlir/lib/Transforms/Utils/CFGToSCF.cpp index de380fc325f5..7c1781044d2a 100644 --- a/mlir/lib/Transforms/Utils/CFGToSCF.cpp +++ b/mlir/lib/Transforms/Utils/CFGToSCF.cpp @@ -709,7 +709,7 @@ transformToReduceLoop(Block *loopHeader, Block *exitBlock, llvm::SmallDenseMap dominanceCache; // Returns true if `loopBlock` dominates `block`. auto loopBlockDominates = [&](Block *block) { - auto [iter, inserted] = dominanceCache.insert({block, false}); + auto [iter, inserted] = dominanceCache.try_emplace(block); if (!inserted) return iter->second; iter->second = dominanceInfo.dominates(loopBlock, block); From ad2a2b8eed2f3ed1e050833ea8a8d88b0878c6a7 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Wed, 11 Jun 2025 13:05:21 -0700 Subject: [PATCH 0046/1322] [llvm] Add a tool to check mustache compliance against the public spec (#142813) This is a CLI tool that tests the conformance of LLVM's mustache implementation against the public Mustache spec, hosted at https://github.com/mustache/spec. This is a revised version of the patches in #111487.
Co-authored-by: Peter Chou --- llvm/CMakeLists.txt | 1 + llvm/docs/CommandGuide/index.rst | 1 + .../CommandGuide/llvm-test-mustache-spec.rst | 37 +++ .../llvm-test-mustache-spec/CMakeLists.txt | 5 + .../llvm-test-mustache-spec.cpp | 268 ++++++++++++++++++ 5 files changed, 312 insertions(+) create mode 100644 llvm/docs/CommandGuide/llvm-test-mustache-spec.rst create mode 100644 llvm/utils/llvm-test-mustache-spec/CMakeLists.txt create mode 100644 llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 206f009b45f5..cfb67472aa71 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1313,6 +1313,7 @@ if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/yaml-bench) add_subdirectory(utils/split-file) add_subdirectory(utils/mlgo-utils) + add_subdirectory(utils/llvm-test-mustache-spec) if( LLVM_INCLUDE_TESTS ) set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test") add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst index 643951eca2a2..88fc1fd326b7 100644 --- a/llvm/docs/CommandGuide/index.rst +++ b/llvm/docs/CommandGuide/index.rst @@ -87,6 +87,7 @@ Developer Tools llvm-exegesis llvm-ifs llvm-locstats + llvm-test-mustache-spec llvm-pdbutil llvm-profgen llvm-tli-checker diff --git a/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst new file mode 100644 index 000000000000..8cd5a349e7e4 --- /dev/null +++ b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst @@ -0,0 +1,37 @@ +llvm-test-mustache-spec - LLVM tool to test Mustache library compliance +======================================================================= + +.. 
program:: llvm-test-mustache-spec + +SYNOPSIS +-------- + +:program:`llvm-test-mustache-spec` [*inputs...*] + +Description +----------- + +``llvm-test-mustache-spec`` tests the mustache spec conformance of the LLVM +mustache library. The spec can be found here: https://github.com/mustache/spec + +To test against the spec, simply download the spec and pass the test JSON files +to the driver. Each spec file should have a list of tests for compliance with +the spec. These are loaded as test cases, and rendered with our Mustache +implementation, which is then compared against the expected output from the +spec. + +The current implementation only supports non-optional parts of the spec, so +we do not expect any of the dynamic-names, inheritance, or lambda tests to +pass. Additionally, Triple Mustache is not supported. Unsupported tests are +marked as XFail and are removed from the XFail list as they are fixed. + +The tool prints the number of test failures and successes in each of the test +files to standard output. + +EXAMPLE +------- + +.. code-block:: console + + $ llvm-test-mustache-spec path/to/specs/\*.json + diff --git a/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt new file mode 100644 index 000000000000..dc1aa73371ff --- /dev/null +++ b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt @@ -0,0 +1,5 @@ +add_llvm_utility(llvm-test-mustache-spec + llvm-test-mustache-spec.cpp +) + +target_link_libraries(llvm-test-mustache-spec PRIVATE LLVMSupport) diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp new file mode 100644 index 000000000000..28ed1b876672 --- /dev/null +++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp @@ -0,0 +1,268 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Simple drivers to test the mustache spec found at: +// https://github.com/mustache/spec +// +// It is used to verify that the current implementation conforms to the spec. +// Simply download the spec and pass the test JSON files to the driver. Each +// spec file should have a list of tests for compliance with the spec. These +// are loaded as test cases, and rendered with our Mustache implementation, +// which is then compared against the expected output from the spec. +// +// The current implementation only supports non-optional parts of the spec, so +// we do not expect any of the dynamic-names, inheritance, or lambda tests to +// pass. Additionally, Triple Mustache is not supported. Unsupported tests are +// marked as XFail and are removed from the XFail list as they are fixed. +// +// Usage: +// llvm-test-mustache-spec path/to/test/file.json path/to/test/file2.json ... 
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Mustache.h" +#include "llvm/Support/Path.h" +#include + +using namespace llvm; +using namespace llvm::json; +using namespace llvm::mustache; + +#define DEBUG_TYPE "llvm-test-mustache-spec" + +static cl::OptionCategory Cat("llvm-test-mustache-spec Options"); + +static cl::list + InputFiles(cl::Positional, cl::desc(""), cl::OneOrMore); + +static cl::opt ReportErrors("report-errors", + cl::desc("Report errors in spec tests"), + cl::cat(Cat)); + +static ExitOnError ExitOnErr; + +static int NumXFail = 0; +static int NumSuccess = 0; + +static const StringMap> XFailTestNames = {{ + {"delimiters.json", + { + "Pair Behavior", + "Special Characters", + "Sections", + "Inverted Sections", + "Partial Inheritence", + "Post-Partial Behavior", + "Standalone Tag", + "Indented Standalone Tag", + "Standalone Line Endings", + "Standalone Without Previous Line", + "Standalone Without Newline", + }}, + {"~dynamic-names.json", + { + "Basic Behavior - Partial", + "Basic Behavior - Name Resolution", + "Context", + "Dotted Names", + "Dotted Names - Failed Lookup", + "Dotted names - Context Stacking", + "Dotted names - Context Stacking Under Repetition", + "Dotted names - Context Stacking Failed Lookup", + "Recursion", + "Surrounding Whitespace", + "Inline Indentation", + "Standalone Line Endings", + "Standalone Without Previous Line", + "Standalone Without Newline", + "Standalone Indentation", + "Padding Whitespace", + }}, + {"~inheritance.json", + { + "Default", + "Variable", + "Triple Mustache", + "Sections", + "Negative Sections", + "Mustache Injection", + "Inherit", + "Overridden content", + "Data does not override block default", + "Two overridden parents", + "Override parent with newlines", + "Inherit 
indentation", + "Only one override", + "Parent template", + "Recursion", + "Multi-level inheritance, no sub child", + "Text inside parent", + "Text inside parent", + "Block scope", + "Standalone parent", + "Standalone block", + "Block reindentation", + "Intrinsic indentation", + "Nested block reindentation", + + }}, + {"~lambdas.json", + { + "Interpolation", + "Interpolation - Expansion", + "Interpolation - Alternate Delimiters", + "Interpolation - Multiple Calls", + "Escaping", + "Section", + "Section - Expansion", + "Section - Alternate Delimiters", + "Section - Multiple Calls", + + }}, + {"interpolation.json", + { + "Triple Mustache", + "Triple Mustache Integer Interpolation", + "Triple Mustache Decimal Interpolation", + "Triple Mustache Null Interpolation", + "Triple Mustache Context Miss Interpolation", + "Dotted Names - Triple Mustache Interpolation", + "Implicit Iterators - Triple Mustache", + "Triple Mustache - Surrounding Whitespace", + "Triple Mustache - Standalone", + "Triple Mustache With Padding", + }}, + {"partials.json", {"Standalone Indentation"}}, + {"sections.json", {"Implicit Iterator - Triple mustache"}}, +}}; + +struct TestData { + static Expected createTestData(json::Object *TestCase, + StringRef InputFile) { + // If any of the needed elements are missing, we cannot continue. + // NOTE: partials are optional in the test schema. 
+ if (!TestCase || !TestCase->getString("template") || + !TestCase->getString("expected") || !TestCase->getString("name") || + !TestCase->get("data")) + return createStringError( + llvm::inconvertibleErrorCode(), + "invalid JSON schema in test file: " + InputFile + "\n"); + + return TestData{TestCase->getString("template").value(), + TestCase->getString("expected").value(), + TestCase->getString("name").value(), TestCase->get("data"), + TestCase->get("partials")}; + } + + TestData() = default; + + StringRef TemplateStr; + StringRef ExpectedStr; + StringRef Name; + Value *Data; + Value *Partials; +}; + +static void reportTestFailure(const TestData &TD, StringRef ActualStr, + bool IsXFail) { + LLVM_DEBUG(dbgs() << "Template: " << TD.TemplateStr << "\n"); + if (TD.Partials) { + LLVM_DEBUG(dbgs() << "Partial: "); + LLVM_DEBUG(TD.Partials->print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + } + LLVM_DEBUG(dbgs() << "JSON Data: "); + LLVM_DEBUG(TD.Data->print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + outs() << formatv("Test {}: {}\n", (IsXFail ? 
"XFailed" : "Failed"), TD.Name); + if (ReportErrors) { + outs() << " Expected: \'" << TD.ExpectedStr << "\'\n" + << " Actual: \'" << ActualStr << "\'\n" + << " ====================\n"; + } +} + +static void registerPartials(Value *Partials, Template &T) { + if (!Partials) + return; + for (const auto &[Partial, Str] : *Partials->getAsObject()) + T.registerPartial(Partial.str(), Str.getAsString()->str()); +} + +static json::Value readJsonFromFile(StringRef &InputFile) { + std::unique_ptr Buffer = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(InputFile))); + return ExitOnErr(parse(Buffer->getBuffer())); +} + +static bool isTestXFail(StringRef FileName, StringRef TestName) { + auto P = llvm::sys::path::filename(FileName); + auto It = XFailTestNames.find(P); + return It != XFailTestNames.end() && It->second.contains(TestName); +} + +static bool evaluateTest(StringRef &InputFile, TestData &TestData, + std::string &ActualStr) { + bool IsXFail = isTestXFail(InputFile, TestData.Name); + bool Matches = TestData.ExpectedStr == ActualStr; + if ((Matches && IsXFail) || (!Matches && !IsXFail)) { + reportTestFailure(TestData, ActualStr, IsXFail); + return false; + } + IsXFail ? NumXFail++ : NumSuccess++; + return true; +} + +static void runTest(StringRef InputFile) { + NumXFail = 0; + NumSuccess = 0; + outs() << "Running Tests: " << InputFile << "\n"; + json::Value Json = readJsonFromFile(InputFile); + + json::Object *Obj = Json.getAsObject(); + Array *TestArray = Obj->getArray("tests"); + // Even though we parsed the JSON, it can have a bad format, so check it. 
+ if (!TestArray) + ExitOnErr(createStringError( + llvm::inconvertibleErrorCode(), + "invalid JSON schema in test file: " + InputFile + "\n")); + + const size_t Total = TestArray->size(); + + for (Value V : *TestArray) { + auto TestData = + ExitOnErr(TestData::createTestData(V.getAsObject(), InputFile)); + Template T(TestData.TemplateStr); + registerPartials(TestData.Partials, T); + + std::string ActualStr; + raw_string_ostream OS(ActualStr); + T.render(*TestData.Data, OS); + evaluateTest(InputFile, TestData, ActualStr); + } + + const int NumFailed = Total - NumSuccess - NumXFail; + outs() << formatv("===Results===\n" + " Suceeded: {}\n" + " Expectedly Failed: {}\n" + " Failed: {}\n" + " Total: {}\n", + NumSuccess, NumXFail, NumFailed, Total); +} + +int main(int argc, char **argv) { + ExitOnErr.setBanner(std::string(argv[0]) + " error: "); + cl::ParseCommandLineOptions(argc, argv); + for (const auto &FileName : InputFiles) + runTest(FileName); + return 0; +} From e7e491f6ee2baee4e2ab2947e1c64bc54e3ebbec Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 11 Jun 2025 13:06:22 -0700 Subject: [PATCH 0047/1322] [SelectionDAG] Add ISD::VSELECT to SelectionDAG::canCreateUndefOrPoison. 
(#143760) --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1 + .../RISCV/rvv/combine-reduce-add-to-vcpop.ll | 73 +++++++++---------- .../CodeGen/RISCV/rvv/vector-interleave.ll | 16 ++-- .../test/CodeGen/X86/avx10_2_512bf16-arith.ll | 2 +- llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 4 +- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4fc026ca562b..45a37622a531 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5553,6 +5553,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_VECTOR: case ISD::BUILD_PAIR: case ISD::SPLAT_VECTOR: + case ISD::VSELECT: return false; case ISD::SELECT_CC: diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index 88894f887cc2..5dc532273b77 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -313,12 +313,12 @@ define i32 @test_nxv128i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v6, a0 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v5, v6, a0 -; CHECK-NEXT: vslidedown.vx v4, v7, a0 -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a0 +; CHECK-NEXT: vslidedown.vx v5, v6, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t @@ -364,9 +364,9 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vmv1r.v v7, v9 ; CHECK-NEXT: vmv1r.v v5, 
v8 ; CHECK-NEXT: vmv1r.v v4, v0 -; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a2, a0 @@ -376,7 +376,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v5 -; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 @@ -388,9 +388,8 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vslidedown.vx v3, v4, a0 ; CHECK-NEXT: vslidedown.vx v2, v5, a0 ; CHECK-NEXT: vmv.v.v v0, v3 -; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: mv a3, a2 @@ -398,42 +397,43 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v3, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; 
CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v2, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v24, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v4, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v5, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v6, a1 ; CHECK-NEXT: vslidedown.vx v5, v7, a1 -; CHECK-NEXT: vslidedown.vx v4, v6, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 -; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 -; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t -; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v24 ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma @@ -443,7 +443,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v4, a1 ; CHECK-NEXT: vslidedown.vx v3, v5, a1 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t +; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v3 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -451,7 +451,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v24 +; 
CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -492,16 +492,16 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t -; CHECK-NEXT: vadd.vv v24, v24, v8 +; CHECK-NEXT: vadd.vv v0, v24, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v0 -; CHECK-NEXT: vadd.vv v16, v24, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vadd.vv v16, v0, v16 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vmv.s.x v16, zero ; CHECK-NEXT: vredsum.vs v8, v8, v16 @@ -537,18 +537,17 @@ entry: define i16 @test_narrow_nxv64i1( %x) { ; CHECK-LABEL: test_narrow_nxv64i1: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v16, v8 +; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 
77723609a60c..e297e88c71f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -260,18 +260,18 @@ define @vector_interleave_nxv128i1_nxv64i1( @llvm.vector.interleave2.nxv128i1( %a, %b) diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll index 1e2cf4956bd0..c22a394e6c4e 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] ; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll index 42831a453cb1..435f67a0f1e4 100644 --- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: 
[0x62,0xf5,0x75,0x28,0x5c,0x08] ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] @@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8 ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] From 5623b7f2d56ecba84de5d62444feed2dea2b7e25 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Jun 2025 21:08:35 +0100 Subject: [PATCH 0048/1322] [LV] Use GeneratedRTChecks to check if safety checks were added (NFC). Directly check via GeneratedRTChecks if any checks have been added, instead of needing to go through ILV. This simplifies the code and enables further refactoring in follow-up patches. --- .../Transforms/Vectorize/LoopVectorize.cpp | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2a237f42e404..d23611183639 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -505,9 +505,6 @@ public: /// Fix the vectorized code, taking care of header phi's, and more. void fixVectorizedLoop(VPTransformState &State); - // Return true if any runtime check is added. - bool areSafetyChecksAdded() { return AddedSafetyChecks; } - /// Fix the non-induction PHIs in \p Plan. 
void fixNonInductionPHIs(VPTransformState &State); @@ -620,9 +617,6 @@ protected: /// The profitablity analysis. LoopVectorizationCostModel *Cost; - // Record whether runtime checks are added. - bool AddedSafetyChecks = false; - /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; @@ -1777,6 +1771,9 @@ class GeneratedRTChecks { /// they have been used. Value *MemRuntimeCheckCond = nullptr; + /// True if any checks have been added. + bool AddedAnyChecks = false; + DominatorTree *DT; LoopInfo *LI; TargetTransformInfo *TTI; @@ -2038,9 +2035,9 @@ public: if (AddBranchWeights) setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); - // Mark the check as used, to prevent it from being removed during cleanup. SCEVCheckCond = nullptr; + AddedAnyChecks = true; return SCEVCheckBlock; } @@ -2070,8 +2067,12 @@ public: // Mark the check as used, to prevent it from being removed during cleanup. 
MemRuntimeCheckCond = nullptr; + AddedAnyChecks = true; return MemCheckBlock; } + + /// Return true if any runtime checks have been added + bool hasChecks() const { return AddedAnyChecks; } }; } // namespace @@ -2459,7 +2460,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { assert((!Cost->OptForSize || Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); - AddedSafetyChecks = true; introduceCheckBlockInVPlan(SCEVCheckBlock); return SCEVCheckBlock; @@ -2494,9 +2494,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { }); } - - AddedSafetyChecks = true; - introduceCheckBlockInVPlan(MemCheckBlock); return MemCheckBlock; } @@ -10287,7 +10284,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { } ++LoopsEpilogueVectorized; - if (!MainILV.areSafetyChecksAdded()) + if (!Checks.hasChecks()) DisableRuntimeUnroll = true; } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, @@ -10299,7 +10296,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Add metadata to disable runtime unrolling a scalar loop when there // are no runtime checks about strides and memory. A scalar loop that is // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) + if (!Checks.hasChecks()) DisableRuntimeUnroll = true; } // Report the vectorization decision. 
From c70658e32debfc3b2c0f6c2b2228ac48e976fd51 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 13:09:05 -0700 Subject: [PATCH 0049/1322] [bazel] port 5dafe9dca867b90f20dcd71c620ad823aee4262b --- .../llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index 40f672d8099f..610978059d7e 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -107,6 +107,7 @@ libc_test( deps = [ "//libc:__support_fputil_fp_bits", "//libc:atof", + "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -206,6 +207,7 @@ libc_test_library( "//libc:__support_macros_properties_architectures", "//libc:errno", "//libc/test/UnitTest:LibcUnitTest", + "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -251,6 +253,7 @@ libc_test( deps = [ "//libc:__support_fputil_fp_bits", "//libc:strtof", + "//libc/test/UnitTest:errno_test_helpers", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -261,6 +264,7 @@ libc_test( deps = [ "//libc:__support_fputil_fp_bits", "//libc:strtod", + "//libc/test/UnitTest:errno_test_helpers", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -272,5 +276,6 @@ libc_test( "//libc:__support_fputil_fp_bits", "//libc:__support_uint128", "//libc:strtold", + "//libc/test/UnitTest:errno_test_helpers", ], ) From 52583b3ed7dd39788360361fc1e21039c8eb5479 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 11 Jun 2025 20:11:31 +0000 Subject: [PATCH 0050/1322] [libc] Character converter skeleton class (#143619) Made CharacterConverter class skeleton --- libc/hdr/types/char32_t.h | 22 ++++++ libc/hdr/types/char8_t.h | 22 ++++++ libc/hdr/uchar_overlay.h | 69 +++++++++++++++++++ libc/src/__support/wchar/CMakeLists.txt | 26 +++++++ .../__support/wchar/character_converter.cpp | 32 +++++++++ 
.../src/__support/wchar/character_converter.h | 39 +++++++++++ libc/src/__support/wchar/mbstate.h | 27 ++++++++ libc/src/__support/wchar/utf_ret.h | 21 ++++++ 8 files changed, 258 insertions(+) create mode 100644 libc/hdr/types/char32_t.h create mode 100644 libc/hdr/types/char8_t.h create mode 100644 libc/hdr/uchar_overlay.h create mode 100644 libc/src/__support/wchar/CMakeLists.txt create mode 100644 libc/src/__support/wchar/character_converter.cpp create mode 100644 libc/src/__support/wchar/character_converter.h create mode 100644 libc/src/__support/wchar/mbstate.h create mode 100644 libc/src/__support/wchar/utf_ret.h diff --git a/libc/hdr/types/char32_t.h b/libc/hdr/types/char32_t.h new file mode 100644 index 000000000000..94fe5747d341 --- /dev/null +++ b/libc/hdr/types/char32_t.h @@ -0,0 +1,22 @@ +//===-- Definition of char32_t.h ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_CHAR32_T_H +#define LLVM_LIBC_HDR_TYPES_CHAR32_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/char32_t.h" + +#else // overlay mode + +#include "hdr/uchar_overlay.h" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_CHAR32_T_H diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h new file mode 100644 index 000000000000..31de764658f9 --- /dev/null +++ b/libc/hdr/types/char8_t.h @@ -0,0 +1,22 @@ +//===-- Definition of char8_t.h -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H +#define LLVM_LIBC_HDR_TYPES_CHAR8_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/char8_t.h" + +#else // overlay mode + +#include "hdr/uchar_overlay.h" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H diff --git a/libc/hdr/uchar_overlay.h b/libc/hdr/uchar_overlay.h new file mode 100644 index 000000000000..44ed3d48c6c1 --- /dev/null +++ b/libc/hdr/uchar_overlay.h @@ -0,0 +1,69 @@ +//===-- Including uchar.h in overlay mode ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_UCHAR_OVERLAY_H +#define LLVM_LIBC_HDR_UCHAR_OVERLAY_H + +#ifdef LIBC_FULL_BUILD +#error "This header should only be included in overlay mode" +#endif + +// Overlay mode + +// glibc <uchar.h> header might provide extern inline definitions for few +// functions, causing external alias errors. They are guarded by +// `__USE_EXTERN_INLINES` macro. We temporarily disable `__USE_EXTERN_INLINES` +// macro by defining `__NO_INLINE__` before including <uchar.h>. +// And the same with `__USE_FORTIFY_LEVEL`, which will be temporarily disabled +// with `_FORTIFY_SOURCE`.
+ +#ifdef _FORTIFY_SOURCE +#define LIBC_OLD_FORTIFY_SOURCE _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif + +#ifndef __NO_INLINE__ +#define __NO_INLINE__ 1 +#define LIBC_SET_NO_INLINE +#endif + +#ifdef __USE_EXTERN_INLINES +#define LIBC_OLD_USE_EXTERN_INLINES +#undef __USE_EXTERN_INLINES +#endif + +#ifdef __USE_FORTIFY_LEVEL +#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL 0 +#endif + +#include <uchar.h> + +#ifdef LIBC_OLD_FORTIFY_SOURCE +#define _FORTIFY_SOURCE LIBC_OLD_FORTIFY_SOURCE +#undef LIBC_OLD_FORTIFY_SOURCE +#endif + +#ifdef LIBC_SET_NO_INLINE +#undef __NO_INLINE__ +#undef LIBC_SET_NO_INLINE +#endif + +#ifdef LIBC_OLD_USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL +#undef LIBC_OLD_USE_FORTIFY_LEVEL +#endif + +#ifdef LIBC_OLD_USE_EXTERN_INLINES +#define __USE_EXTERN_INLINES +#undef LIBC_OLD_USE_EXTERN_INLINES +#endif + +#endif // LLVM_LIBC_HDR_UCHAR_OVERLAY_H diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt new file mode 100644 index 000000000000..5cca58400ff4 --- /dev/null +++ b/libc/src/__support/wchar/CMakeLists.txt @@ -0,0 +1,26 @@ +add_header_library( + mbstate + HDRS + mbstate.h + DEPENDS + libc.hdr.types.char32_t +) + +add_object_library( + character_converter + HDRS + character_converter.h + SRCS + character_converter.cpp + DEPENDS + libc.hdr.types.char8_t + libc.hdr.types.char32_t + .mbstate + .utf_ret +) + +add_header_library( + utf_ret + HDRS + utf_ret.h +) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp new file mode 100644 index 000000000000..0afc2a6f59e6 --- /dev/null +++ b/libc/src/__support/wchar/character_converter.cpp @@ -0,0 +1,32 @@ +//===-- Implementation of a class for conversion --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/utf_ret.h" + +#include "character_converter.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; } + +bool CharacterConverter::isComplete() {} + +int CharacterConverter::push(char8_t utf8_byte) {} + +int CharacterConverter::push(char32_t utf32) {} + +utf_ret CharacterConverter::pop_utf8() {} + +utf_ret CharacterConverter::pop_utf32() {} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h new file mode 100644 index 000000000000..a6bac4380537 --- /dev/null +++ b/libc/src/__support/wchar/character_converter.h @@ -0,0 +1,39 @@ +//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H +#define LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H + +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/utf_ret.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +class CharacterConverter { +private: + mbstate_t *state; + +public: + CharacterConverter(mbstate_t *mbstate); + + bool isComplete(); + + int push(char8_t utf8_byte); + int push(char32_t utf32); + + utf_ret pop_utf8(); + utf_ret pop_utf32(); +}; + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h new file mode 100644 index 000000000000..72ec72756000 --- /dev/null +++ b/libc/src/__support/wchar/mbstate.h @@ -0,0 +1,27 @@ +//===-- Definition of mbstate-----------------------------------*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H +#define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H + +#include "hdr/types/char32_t.h" +#include <stdint.h> + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +struct mbstate { + char32_t partial; + uint8_t bits_processed; + uint8_t total_bytes; +}; + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MBSTATE_H diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h new file mode 100644 index 000000000000..b8a8f6f09414 --- /dev/null +++ b/libc/src/__support/wchar/utf_ret.h @@ -0,0 +1,21 @@ +//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H +#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H + +namespace LIBC_NAMESPACE_DECL { + +template <typename T> struct utf_ret { + T out; + int error; +}; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H From a2d2941830d9c141d7f43da1ff58e7b7235a9f7d Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Wed, 11 Jun 2025 13:12:37 -0700 Subject: [PATCH 0051/1322] [lldb][RPC] Upstream LLDB to RPC conversion Python script (#138028) As part of upstreaming LLDB RPC, this commit adds a python script that is used by LLDB RPC to modify the public lldb header files for use with RPC.
https://discourse.llvm.org/t/rfc-upstreaming-lldb-rpc/85804 --- .../convert-lldb-header-to-rpc-header.py | 108 ++++++++++++++++++ .../TestConvertScript/CheckLLDBDefines.test | 22 ++++ .../CheckLLDBEnumerations.test | 17 +++ .../TestConvertScript/CheckLLDBTypes.test | 24 ++++ .../TestConvertScript/CheckSBDefines.test | 22 ++++ .../TestConvertScript/Inputs/SBDefines.h | 22 ++++ .../TestConvertScript/Inputs/lldb-defines.h | 23 ++++ .../Inputs/lldb-enumerations.h | 17 +++ .../TestConvertScript/Inputs/lldb-types.h | 23 ++++ 9 files changed, 278 insertions(+) create mode 100755 lldb/scripts/convert-lldb-header-to-rpc-header.py create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h diff --git a/lldb/scripts/convert-lldb-header-to-rpc-header.py b/lldb/scripts/convert-lldb-header-to-rpc-header.py new file mode 100755 index 000000000000..d7734280076f --- /dev/null +++ b/lldb/scripts/convert-lldb-header-to-rpc-header.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Usage: convert-lldb-header-to-rpc-header.py <input> <output> + +This script takes common LLDB headers (such as lldb-defines.h) and replaces references to LLDB +with those for RPC.
This happens for: +- namespace definitions +- namespace usage +- version string macros +- ifdef/ifndef lines +""" + +import argparse +import os +import re + + +INCLUDES_TO_REMOVE_REGEX = re.compile( + r'#include "lldb/lldb-forward.h"|#include "lldb/lldb-versioning.h"' +) +LLDB_GUARD_REGEX = re.compile(r"(?P<guard_type>#.+)LLDB_LLDB_\s*", re.M) +LLDB_API_GUARD_REGEX = re.compile(r"(?P<guard_type>#.+)LLDB_API_\s*", re.M) +LLDB_VERSION_REGEX = re.compile(r"#define LLDB_VERSION", re.M) +LLDB_REVISION_REGEX = re.compile(r"#define LLDB_REVISION", re.M) +LLDB_VERSION_STRING_REGEX = re.compile(r"#define LLDB_VERSION_STRING", re.M) +LLDB_LOCAL_INCLUDE_REGEX = re.compile(r'#include "lldb/lldb-\s*', re.M) +LLDB_NAMESPACE_DEFINITION_REGEX = re.compile( + r"(?P<comment_marker>//\s*){,1}namespace lldb\s{1}", re.M +) +LLDB_NAMESPACE_REGEX = re.compile(r"\s*.+lldb::\s*", re.M) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("input") + parser.add_argument("output") + args = parser.parse_args() + input_path = str(args.input) + output_path = str(args.output) + with open(input_path, "r") as input_file: + lines = input_file.readlines() + file_buffer = "".join(lines) + + with open(output_path, "w") as output_file: + # NOTE: We do not use lldb-forward.h or lldb-versioning.h in RPC, so remove + # all includes that are found for these files. + file_buffer = re.sub(INCLUDES_TO_REMOVE_REGEX, r"", file_buffer) + + # For lldb-rpc-defines.h, replace the ifndef LLDB_LLDB_ portion with LLDB_RPC_ as we're not + # using LLDB private definitions in RPC. + lldb_guard_matches = LLDB_GUARD_REGEX.finditer(file_buffer) + for match in lldb_guard_matches: + file_buffer = re.sub( + match.group(), + r"{0}LLDB_RPC_".format(match.group("guard_type")), + file_buffer, + ) + + # Similarly to lldb-rpc-defines.h, replace the ifndef for LLDB_API in SBDefines.h to LLDB_RPC_API_ for the same reason.
+ lldb_api_guard_matches = LLDB_API_GUARD_REGEX.finditer(file_buffer) + for match in lldb_api_guard_matches: + file_buffer = re.sub( + match.group(), + r"{0}LLDB_RPC_API_".format(match.group("guard_type")), + file_buffer, + ) + + # Replace the references for the macros that define the versioning strings in + # lldb-rpc-defines.h. + # NOTE: Here we assume that the versioning info has already been uncommented and + # populated from the original lldb-defines.h. + file_buffer = re.sub( + LLDB_VERSION_REGEX, r"#define LLDB_RPC_VERSION", file_buffer + ) + file_buffer = re.sub( + LLDB_REVISION_REGEX, r"#define LLDB_RPC_REVISION", file_buffer + ) + file_buffer = re.sub( + LLDB_VERSION_STRING_REGEX, r"#define LLDB_RPC_VERSION_STRING", file_buffer + ) + + # For local #includes + file_buffer = re.sub( + LLDB_LOCAL_INCLUDE_REGEX, r'#include "lldb-rpc-', file_buffer + ) + + # Rename the lldb namespace definition to lldb-rpc. + lldb_rpc_namespace_definition_matches = ( + LLDB_NAMESPACE_DEFINITION_REGEX.finditer(file_buffer) + ) + for match in lldb_rpc_namespace_definition_matches: + comment_marker = ( + match.group("comment_marker") if match.group("comment_marker") else "" + ) + file_buffer = re.sub( + match.group(), + r"{0}namespace lldb_rpc ".format(comment_marker), + file_buffer, + ) + + # Rename the lldb namespace definition to lldb-rpc. + file_buffer = re.sub(LLDB_NAMESPACE_REGEX, r"lldb_rpc::", file_buffer) + + output_file.write(file_buffer) + + +if __name__ == "__main__": + main() diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test new file mode 100644 index 000000000000..0d89d627cfed --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test @@ -0,0 +1,22 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on lldb-defines.h. 
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-defines.h %t/Outputs/lldb-rpc-defines.h + +# Check the output +RUN: cat %t/Outputs/lldb-rpc-defines.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_DEFINES_H to LLDB_RPC_DEFINES_H. +CHECK: #ifndef LLDB_RPC_DEFINES_H +CHECK: #define LLDB_RPC_DEFINES_H + +# Includes of other lldb headers must begin with "lldb-rpc-". +CHECK: #include "lldb-rpc-types.h" + +# The version info must be changed from LLDB_VERSION to LLDB_RPC_VERSION +CHECK: #define LLDB_RPC_VERSION 21 +CHECK: #define LLDB_RPC_REVISION 12 +CHECK: #define LLDB_RPC_VERSION_STRING "21.0.12" + +# The comment that closes the include guard should match the guard. +CHECK: #endif // LLDB_RPC_DEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test new file mode 100644 index 000000000000..0fb3c6f73dd0 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test @@ -0,0 +1,17 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on lldb-enumerations.h. +RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-enumerations.h %t/Outputs/lldb-rpc-enumerations.h + +# Check the output +RUN: cat %t/Outputs/lldb-rpc-enumerations.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_ENUMERATIONS_H to LLDB_RPC_ENUMERATIONS_H. +CHECK: #ifndef LLDB_RPC_ENUMERATIONS_H +CHECK: #define LLDB_RPC_ENUMERATIONS_H + +# Change the namespace to lldb_rpc. Also, the comment that closes the namespace should match the namespace. +CHECK: namespace lldb_rpc {} // namespace lldb_rpc + +# The comment that closes the include guard should match the guard. 
+CHECK: #endif // LLDB_RPC_ENUMERATIONS_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test new file mode 100644 index 000000000000..86f2d290209e --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test @@ -0,0 +1,24 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on lldb-types.h. +RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-types.h %t/Outputs/lldb-rpc-types.h + +# Check the output +RUN: cat %t/Outputs/lldb-rpc-types.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_TYPES_H to LLDB_RPC_TYPES_H. +CHECK: #ifndef LLDB_RPC_TYPES_H +CHECK: #define LLDB_RPC_TYPES_H + +# Includes of other lldb headers must begin with "lldb-rpc-". +# Also, the includes for lldb-forward.h should be removed. +CHECK: #include "lldb-rpc-enumerations.h" + +# Change the namespace to lldb_rpc. +CHECK: namespace lldb_rpc + +# The comment that closes the namespace should match the namespace. +CHECK: // namespace lldb_rpc + +# The comment that closes the include guard should match the guard. +CHECK: #endif // LLDB_RPC_TYPES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test new file mode 100644 index 000000000000..72444aaf069a --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test @@ -0,0 +1,22 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on SBDefines.h. +RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/SBDefines.h %t/Outputs/SBDefines.h + +# Check the output +RUN: cat %t/Outputs/SBDefines.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_API_SBDEFINES_H to LLDB_RPC_API_SBDEFINES_H. 
+CHECK: #ifndef LLDB_RPC_API_SBDEFINES_H +CHECK: #define LLDB_RPC_API_SBDEFINES_H + +# Includes of other lldb headers must begin with "lldb-rpc-". +# Also, the includes for lldb-forward.h and lldb-versioning.h should be removed. +CHECK: #include "lldb-rpc-defines.h" +CHECK-NOT: #include "lldb-rpc-forward.h" +CHECK: #include "lldb-rpc-enumerations.h" +CHECK: #include "lldb-rpc-types.h" +CHECK-NOT: #include "lldb-rpc-versioning.h" + +# The comment that closes the include guard should match the guard. +CHECK: #endif // LLDB_RPC_API_SBDEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h new file mode 100644 index 000000000000..50476c402ba7 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h @@ -0,0 +1,22 @@ +// This is a truncated version of SBDefines.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. +// LLDB_API_SBDEFINES_H -> LLDB_RPC_SBDEFINES_H +#ifndef LLDB_API_SBDEFINES_H +#define LLDB_API_SBDEFINES_H + +// Includes of public main LLDB headers should change to their RPC equivalents: +// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h" +// Also, the includes for lldb-forward.h and lldb-versioning.h should be removed. +#include "lldb/lldb-defines.h" +#include "lldb/lldb-enumerations.h" +#include "lldb/lldb-forward.h" +#include "lldb/lldb-types.h" +#include "lldb/lldb-versioning.h" + +// The comment that closes the include guard must change in the same way +// the original guard did. 
+// #endif // LLDB_API_SBDEFINES_H -> #endif // LLDB_RPC_API_SBDEFINES_H +#endif // LLDB_API_SBDEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h new file mode 100644 index 000000000000..32064430b3d0 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h @@ -0,0 +1,23 @@ +// This is a truncated version of lldb-defines.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. +// LLDB_LLDB_DEFINES_H -> LLDB_RPC_DEFINES_H +#ifndef LLDB_LLDB_DEFINES_H +#define LLDB_LLDB_DEFINES_H + +// Includes of public main LLDB headers should change to their RPC equivalents: +// "lldb/lldb-types.h" -> "lldb-rpc-types.h" +#include "lldb/lldb-types.h" + +// The LLDB version must change from LLDB to LLDB_RPC +// LLDB_VERSION -> LLDB_RPC_VERSION +#define LLDB_VERSION 21 +#define LLDB_REVISION 12 +#define LLDB_VERSION_STRING "21.0.12" + +// The comment that closes the include guard must change in the same way +// the original guard did. +// #endif // LLDB_LLDB_DEFINES_H -> #endif // LLDB_RPC_DEFINES_H +#endif // LLDB_LLDB_DEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h new file mode 100644 index 000000000000..42c4bb277fc4 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h @@ -0,0 +1,17 @@ +// This is a truncated version of lldb-enumerations.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. 
+// LLDB_LLDB_ENUMERATIONS_H -> LLDB_RPC_ENUMERATIONS_H +#ifndef LLDB_LLDB_ENUMERATIONS_H +#define LLDB_LLDB_ENUMERATIONS_H + +// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it: +// namespace lldb -> namespace lldb_rpc +namespace lldb {} // namespace lldb + +// The comment that closes the include guard must change in the same way +// the original guard did: +// #endif // LLDB_LLDB_ENUMERATIONS_H -> #endif // LLDB_RPC_ENUMERATIONS_H +#endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h new file mode 100644 index 000000000000..5a49920405ec --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h @@ -0,0 +1,23 @@ +// This is a truncated version of lldb-types.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. +// LLDB_LLDB_TYPES_H -> LLDB_RPC_TYPES_H +#ifndef LLDB_LLDB_TYPES_H +#define LLDB_LLDB_TYPES_H + +// Includes of public main LLDB headers should change to their RPC equivalents: +// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h": +// Also, the includes for lldb-forward.h should be removed. 
+#include "lldb/lldb-enumerations.h" +#include "lldb/lldb-forward.h" + +// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it: +// namespace lldb -> namespace lldb_rpc +namespace lldb {} // namespace lldb + +// The comment that closes the include guard must change in the same way +// the original guard did: +// #endif // LLDB_LLDB_TYPES_H -> #endif // LLDB_RPC_TYPES_H +#endif // LLDB_LLDB_TYPES_H From b42aef5e6f32a3ac6c259cb4cacf58239400b5aa Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 11 Jun 2025 13:12:59 -0700 Subject: [PATCH 0052/1322] [flang] Don't duplicate hermetic module file dependencies (#143605) When emitting the modules on which a module depends under the -fhermetic-module-files options, eliminate duplicates by name rather than by symbol addresses. This way, when a dependent module is in the symbol table more than once due to the use of a nested hermetic module, it doesn't get emitted multiple times to the new module file. 
--- flang/lib/Semantics/mod-file.cpp | 18 +++++++++------ flang/test/Semantics/modfile77.F90 | 37 ++++++++++++++++++++++++++++++ flang/test/Semantics/modfile78.F90 | 33 ++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 flang/test/Semantics/modfile77.F90 create mode 100644 flang/test/Semantics/modfile78.F90 diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp index a72641866aa1..9f9e9f584045 100644 --- a/flang/lib/Semantics/mod-file.cpp +++ b/flang/lib/Semantics/mod-file.cpp @@ -143,18 +143,22 @@ void ModFileWriter::Write(const Symbol &symbol) { std::string path{context_.moduleDirectory() + '/' + ModFileName(symbol.name(), ancestorName, context_.moduleFileSuffix())}; - UnorderedSymbolSet hermeticModules; - hermeticModules.insert(symbol); + std::set hermeticModuleNames; + hermeticModuleNames.insert(symbol.name().ToString()); UnorderedSymbolSet additionalModules; PutSymbols(DEREF(symbol.scope()), hermeticModuleFileOutput_ ? 
&additionalModules : nullptr); auto asStr{GetAsString(symbol)}; while (!additionalModules.empty()) { - for (auto ref : UnorderedSymbolSet{std::move(additionalModules)}) { - if (hermeticModules.insert(*ref).second && - !ref->owner().IsIntrinsicModules()) { - PutSymbols(DEREF(ref->scope()), &additionalModules); - asStr += GetAsString(*ref); + UnorderedSymbolSet nextPass{std::move(additionalModules)}; + additionalModules.clear(); + for (const Symbol &modSym : nextPass) { + if (!modSym.owner().IsIntrinsicModules() && + hermeticModuleNames.find(modSym.name().ToString()) == + hermeticModuleNames.end()) { + hermeticModuleNames.insert(modSym.name().ToString()); + PutSymbols(DEREF(modSym.scope()), &additionalModules); + asStr += GetAsString(modSym); } } } diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90 new file mode 100644 index 000000000000..a82904ebbcc2 --- /dev/null +++ b/flang/test/Semantics/modfile77.F90 @@ -0,0 +1,37 @@ +!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s + +#if WHICH == 1 +module modfile77a + interface gen + procedure proc + end interface + contains + subroutine proc + print *, 'ok' + end +end +#elif WHICH == 2 +module modfile77b + use modfile77a +end +#else +module modfile77c + use modfile77a + use modfile77b +end +#endif + +!CHECK: module modfile77c +!CHECK: use modfile77a,only:proc +!CHECK: use modfile77a,only:gen +!CHECK: interface gen +!CHECK: end interface +!CHECK: end +!CHECK: module modfile77a +!CHECK: interface gen +!CHECK: procedure::proc +!CHECK: end interface +!CHECK: contains +!CHECK: subroutine proc() +!CHECK: end +!CHECK: end diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90 new file mode 100644 index 000000000000..cb3eccd9a410 --- /dev/null +++ b/flang/test/Semantics/modfile78.F90 @@ -0,0 +1,33 @@ +!RUN: %flang -c -fhermetic-module-files 
-DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s + +#if WHICH == 1 +module modfile78a + integer :: global_variable = 0 +end +#elif WHICH == 2 +module modfile78b + use modfile78a + contains + subroutine test + end +end +#else +module modfile78c + use modfile78a + use modfile78b +end +#endif + +!CHECK: module modfile78c +!CHECK: use modfile78a,only:global_variable +!CHECK: use modfile78b,only:test +!CHECK: end +!CHECK: module modfile78a +!CHECK: integer(4)::global_variable +!CHECK: end +!CHECK: module modfile78b +!CHECK: use modfile78a,only:global_variable +!CHECK: contains +!CHECK: subroutine test() +!CHECK: end +!CHECK: end From e389a0e7bb3d7aabbd10b9ba8f432f292de65649 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 11 Jun 2025 20:17:35 +0000 Subject: [PATCH 0053/1322] [libc] Switched calls to inline_memcpy to __builtin_memcpy for wide char utilities (#143011) Switched calls to inline_memcpy to __builtin_memcpy for wide char utilities Removed unnecessary wctype_utils dependencies from the cmake file --- libc/src/wchar/CMakeLists.txt | 9 --------- libc/src/wchar/wcscpy.cpp | 3 +-- libc/src/wchar/wcsncpy.cpp | 2 -- libc/src/wchar/wmemcpy.cpp | 3 +-- libc/src/wchar/wmempcpy.cpp | 3 +-- 5 files changed, 3 insertions(+), 17 deletions(-) diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 759f708c2247..4b8802ede5f5 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -43,7 +43,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.types.wchar_t - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -54,7 +53,6 @@ add_entrypoint_object( wcschr.h DEPENDS libc.hdr.wchar_macros - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -75,7 +73,6 @@ add_entrypoint_object( wcspbrk.h DEPENDS libc.hdr.wchar_macros - libc.src.__support.wctype_utils libc.src.__support.macros.null_check ) @@ -109,7 +106,6 
@@ add_entrypoint_object( DEPENDS libc.hdr.wchar_macros libc.hdr.types.size_t - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -121,7 +117,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.__support.wctype_utils libc.src.__support.macros.null_check ) @@ -134,7 +129,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -205,8 +199,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.__support.wctype_utils - libc.src.string.memory_utils.inline_memcpy ) add_entrypoint_object( @@ -218,6 +210,5 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.string.memory_utils.inline_memcpy libc.src.string.string_utils ) diff --git a/libc/src/wchar/wcscpy.cpp b/libc/src/wchar/wcscpy.cpp index dc46b972c59f..01ba994cecbb 100644 --- a/libc/src/wchar/wcscpy.cpp +++ b/libc/src/wchar/wcscpy.cpp @@ -12,7 +12,6 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" #include "src/string/string_utils.h" namespace LIBC_NAMESPACE_DECL { @@ -20,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wcscpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2)) { size_t size = internal::string_length(s2) + 1; - inline_memcpy(s1, s2, size * sizeof(wchar_t)); + __builtin_memcpy(s1, s2, size * sizeof(wchar_t)); return s1; } diff --git a/libc/src/wchar/wcsncpy.cpp b/libc/src/wchar/wcsncpy.cpp index e7ae9a4a0da7..7ad6730cd776 100644 --- a/libc/src/wchar/wcsncpy.cpp +++ b/libc/src/wchar/wcsncpy.cpp @@ -12,8 +12,6 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" -#include "src/string/string_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git 
a/libc/src/wchar/wmemcpy.cpp b/libc/src/wchar/wmemcpy.cpp index 56708d6cee49..bf92309b2094 100644 --- a/libc/src/wchar/wmemcpy.cpp +++ b/libc/src/wchar/wmemcpy.cpp @@ -12,14 +12,13 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wmemcpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2, size_t n)) { - inline_memcpy(s1, s2, n * sizeof(wchar_t)); + __builtin_memcpy(s1, s2, n * sizeof(wchar_t)); return s1; } diff --git a/libc/src/wchar/wmempcpy.cpp b/libc/src/wchar/wmempcpy.cpp index d8b89c0a88d0..21e16210a757 100644 --- a/libc/src/wchar/wmempcpy.cpp +++ b/libc/src/wchar/wmempcpy.cpp @@ -11,14 +11,13 @@ #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" -#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wmempcpy, (wchar_t *__restrict to, const wchar_t *__restrict from, size_t size)) { - inline_memcpy(to, from, size * sizeof(wchar_t)); + __builtin_memcpy(to, from, size * sizeof(wchar_t)); return reinterpret_cast<wchar_t *>(to) + size; } From fb761aa38b0bc01ab911f5dbbfb474b70aaafbb4 Mon Sep 17 00:00:00 2001 From: Rolf Morel Date: Wed, 11 Jun 2025 21:19:52 +0100 Subject: [PATCH 0054/1322] [MLIR][Transform] apply_registered_op fixes: arg order & python options auto-conversion (#143779) --- .../mlir/Dialect/Transform/IR/TransformOps.td | 6 +++--- .../mlir/dialects/transform/__init__.py | 18 +++++++++++------- .../Transform/test-pass-application.mlir | 19 +++++++++---------- mlir/test/python/dialects/transform.py | 10 +++++----- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index f75ba27e58e7..0aa750e62543 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td 
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -434,10 +434,10 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", of targeted ops. }]; - let arguments = (ins StrAttr:$pass_name, + let arguments = (ins TransformHandleTypeInterface:$target, + StrAttr:$pass_name, DefaultValuedAttr<DictionaryAttr, "{}">:$options, - Variadic<TransformParamTypeInterface>:$dynamic_options, - TransformHandleTypeInterface:$target); + Variadic<TransformParamTypeInterface>:$dynamic_options); let results = (outs TransformHandleTypeInterface:$result); let assemblyFormat = [{ $pass_name (`with` `options` `=` diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index 10a04b0cc14e..bfe96b1b3e5d 100644 --- a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -224,13 +224,13 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp): def __init__( self, result: Type, - pass_name: Union[str, StringAttr], target: Union[Operation, Value, OpView], + pass_name: Union[str, StringAttr], *, options: Optional[ Dict[ Union[str, StringAttr], - Union[Attribute, Value, Operation, OpView], + Union[Attribute, Value, Operation, OpView, str, int, bool], ] ] = None, loc=None, @@ -253,17 +253,21 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp): cur_param_operand_idx += 1 elif isinstance(value, Attribute): options_dict[key] = value + # The following cases auto-convert Python values to attributes. 
+ elif isinstance(value, bool): + options_dict[key] = BoolAttr.get(value) + elif isinstance(value, int): + default_int_type = IntegerType.get_signless(64, context) + options_dict[key] = IntegerAttr.get(default_int_type, value) elif isinstance(value, str): options_dict[key] = StringAttr.get(value) else: raise TypeError(f"Unsupported option type: {type(value)}") - if len(options_dict) > 0: - print(options_dict, cur_param_operand_idx) super().__init__( result, + _get_op_result_or_value(target), pass_name, dynamic_options, - target=_get_op_result_or_value(target), options=DictAttr.get(options_dict), loc=loc, ip=ip, @@ -272,13 +276,13 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp): def apply_registered_pass( result: Type, - pass_name: Union[str, StringAttr], target: Union[Operation, Value, OpView], + pass_name: Union[str, StringAttr], *, options: Optional[ Dict[ Union[str, StringAttr], - Union[Attribute, Value, Operation, OpView], + Union[Attribute, Value, Operation, OpView, str, int, bool], ] ] = None, loc=None, diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir index 6e6d4eb7e249..1d1be9eda349 100644 --- a/mlir/test/Dialect/Transform/test-pass-application.mlir +++ b/mlir/test/Dialect/Transform/test-pass-application.mlir @@ -157,7 +157,7 @@ module attributes {transform.with_named_sequence} { "test-convergence" = true, "max-num-rewrites" = %max_rewrites } to %1 - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op transform.yield } } @@ -171,7 +171,6 @@ func.func @invalid_options_as_str() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %max_iter = transform.param.constant 
"max-iterations=10" -> !transform.any_param // expected-error @+2 {{expected '{' in options dictionary}} %2 = transform.apply_registered_pass "canonicalize" with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op @@ -256,7 +255,7 @@ module attributes {transform.with_named_sequence} { // expected-error @+2 {{expected '{' in options dictionary}} transform.apply_registered_pass "canonicalize" with options = %pass_options to %1 - : (!transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param) -> !transform.any_op transform.yield } } @@ -276,7 +275,7 @@ module attributes {transform.with_named_sequence} { // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}} transform.apply_registered_pass "canonicalize" with options = { "top-down" = %topdown_options } to %1 - : (!transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param) -> !transform.any_op transform.yield } } @@ -316,12 +315,12 @@ module attributes {transform.with_named_sequence} { %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}} - %2 = "transform.apply_registered_pass"(%1, %0) <{ + %2 = "transform.apply_registered_pass"(%0, %1) <{ options = {"max-iterations" = #transform.param_operand, "test-convergence" = true, "top-down" = false}, pass_name = "canonicalize"}> - : (!transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param) -> !transform.any_op "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () @@ -340,13 +339,13 @@ module attributes {transform.with_named_sequence} { %1 = "transform.param.constant"() <{value = 
10 : i64}> : () -> !transform.any_param %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param // expected-error @below {{dynamic option index 0 is already used in options}} - %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + %3 = "transform.apply_registered_pass"(%0, %1, %2) <{ options = {"max-iterations" = #transform.param_operand, "max-num-rewrites" = #transform.param_operand, "test-convergence" = true, "top-down" = false}, pass_name = "canonicalize"}> - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () @@ -364,12 +363,12 @@ module attributes {transform.with_named_sequence} { %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}} - %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + %3 = "transform.apply_registered_pass"(%0, %1, %2) <{ options = {"max-iterations" = #transform.param_operand, "test-convergence" = true, "top-down" = false}, pass_name = "canonicalize"}> - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py index 48bc9bad37a1..eeb95605d7a9 100644 --- a/mlir/test/python/dialects/transform.py +++ b/mlir/test/python/dialects/transform.py @@ -263,12 +263,12 @@ def testApplyRegisteredPassOp(module: Module): ) with InsertionPoint(sequence.body): mod = 
transform.ApplyRegisteredPassOp( - transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget + transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize" ) mod = transform.ApplyRegisteredPassOp( transform.AnyOpType.get(), - "canonicalize", mod.result, + "canonicalize", options={"top-down": BoolAttr.get(False)}, ) max_iter = transform.param_constant( @@ -281,12 +281,12 @@ def testApplyRegisteredPassOp(module: Module): ) transform.apply_registered_pass( transform.AnyOpType.get(), - "canonicalize", mod, + "canonicalize", options={ "top-down": BoolAttr.get(False), "max-iterations": max_iter, - "test-convergence": BoolAttr.get(True), + "test-convergence": True, "max-rewrites": max_rewrites, }, ) @@ -305,4 +305,4 @@ def testApplyRegisteredPassOp(module: Module): # CHECK-SAME: "max-rewrites" = %[[MAX_REWRITE]], # CHECK-SAME: "test-convergence" = true, # CHECK-SAME: "top-down" = false} - # CHECK-SAME: to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + # CHECK-SAME: to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op From d87eea35fac5a34a841c637db8908128409a184e Mon Sep 17 00:00:00 2001 From: lntue Date: Wed, 11 Jun 2025 16:25:27 -0400 Subject: [PATCH 0055/1322] [libc] Move libc_errno.h to libc/src/__support and make LIBC_ERRNO_MODE_SYSTEM to be header-only. 
(#143187) This is the first step in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- .../modules/LLVMLibCCompileOptionRules.cmake | 4 + libc/config/config.json | 2 +- libc/docs/dev/code_style.rst | 4 +- libc/shared/fp_bits.h | 1 + libc/shared/libc_common.h | 26 +++++ libc/shared/rpc_server.h | 1 + libc/shared/str_to_float.h | 1 + libc/shared/str_to_integer.h | 1 + libc/src/__support/CMakeLists.txt | 9 ++ libc/src/__support/FPUtil/FEnvImpl.h | 2 +- libc/src/__support/File/dir.cpp | 2 +- libc/src/__support/File/file.cpp | 2 +- libc/src/__support/File/linux/file.cpp | 2 +- libc/src/__support/File/linux/lseekImpl.h | 2 +- libc/src/__support/HashTable/randomness.h | 2 +- libc/src/__support/OSUtil/linux/fcntl.cpp | 2 +- libc/src/__support/OSUtil/linux/vdso.cpp | 2 +- .../tables/linux_extension_errors.h | 2 +- libc/src/__support/libc_errno.h | 108 ++++++++++++++++++ libc/src/__support/threads/linux/thread.cpp | 2 +- libc/src/dirent/closedir.cpp | 2 +- libc/src/dirent/opendir.cpp | 2 +- libc/src/dirent/readdir.cpp | 2 +- libc/src/errno/CMakeLists.txt | 20 +--- libc/src/errno/libc_errno.cpp | 47 +------- libc/src/errno/libc_errno.h | 47 -------- libc/src/fcntl/linux/creat.cpp | 2 +- libc/src/fcntl/linux/open.cpp | 2 +- libc/src/fcntl/linux/openat.cpp | 2 +- libc/src/inttypes/strtoimax.cpp | 2 +- libc/src/inttypes/strtoumax.cpp | 2 +- libc/src/math/generic/exp10m1f.cpp | 2 +- libc/src/math/generic/exp2m1f.cpp | 2 +- libc/src/math/generic/nan.cpp | 2 +- libc/src/math/generic/nanf.cpp | 2 +- libc/src/math/generic/nanf128.cpp | 2 +- libc/src/math/generic/nanf16.cpp | 2 +- libc/src/math/generic/nanl.cpp | 2 +- libc/src/poll/linux/poll.cpp | 2 +- libc/src/pthread/pthread_atfork.cpp | 2 +- .../pthread/pthread_attr_setdetachstate.cpp | 2 +- .../src/pthread/pthread_attr_setguardsize.cpp | 2 +- libc/src/pthread/pthread_attr_setstack.cpp | 2 +- 
.../src/pthread/pthread_attr_setstacksize.cpp | 2 +- .../src/pthread/pthread_condattr_setclock.cpp | 2 +- .../pthread/pthread_condattr_setpshared.cpp | 2 +- libc/src/pthread/pthread_create.cpp | 2 +- libc/src/pthread/pthread_key_create.cpp | 2 +- libc/src/pthread/pthread_key_delete.cpp | 2 +- .../pthread/pthread_mutexattr_setpshared.cpp | 2 +- .../pthread/pthread_mutexattr_setrobust.cpp | 2 +- .../src/pthread/pthread_mutexattr_settype.cpp | 2 +- .../pthread/pthread_rwlock_timedrdlock.cpp | 2 +- libc/src/pthread/pthread_rwlock_trywrlock.cpp | 2 +- libc/src/pthread/pthread_rwlock_unlock.cpp | 2 +- .../pthread/pthread_rwlockattr_setkind_np.cpp | 2 +- .../pthread/pthread_rwlockattr_setpshared.cpp | 2 +- libc/src/pthread/pthread_setspecific.cpp | 2 +- .../sched/linux/sched_get_priority_max.cpp | 2 +- .../sched/linux/sched_get_priority_min.cpp | 2 +- libc/src/sched/linux/sched_getaffinity.cpp | 2 +- libc/src/sched/linux/sched_getparam.cpp | 2 +- libc/src/sched/linux/sched_getscheduler.cpp | 2 +- .../src/sched/linux/sched_rr_get_interval.cpp | 2 +- libc/src/sched/linux/sched_setaffinity.cpp | 2 +- libc/src/sched/linux/sched_setparam.cpp | 2 +- libc/src/sched/linux/sched_setscheduler.cpp | 2 +- libc/src/sched/linux/sched_yield.cpp | 2 +- libc/src/search/hcreate.cpp | 2 +- libc/src/search/hcreate_r.cpp | 2 +- libc/src/search/hdestroy_r.cpp | 2 +- libc/src/search/hsearch.cpp | 2 +- libc/src/search/hsearch_r.cpp | 2 +- libc/src/signal/linux/kill.cpp | 2 +- libc/src/signal/linux/sigaction.cpp | 2 +- libc/src/signal/linux/sigaddset.cpp | 2 +- libc/src/signal/linux/sigaltstack.cpp | 2 +- libc/src/signal/linux/sigdelset.cpp | 2 +- libc/src/signal/linux/sigemptyset.cpp | 2 +- libc/src/signal/linux/sigfillset.cpp | 2 +- libc/src/signal/linux/sigprocmask.cpp | 2 +- .../posix_spawn_file_actions_addclose.cpp | 2 +- .../posix_spawn_file_actions_adddup2.cpp | 2 +- .../posix_spawn_file_actions_addopen.cpp | 2 +- .../posix_spawn_file_actions_destroy.cpp | 2 +- 
libc/src/stdio/fopencookie.cpp | 2 +- libc/src/stdio/generic/fclose.cpp | 2 +- libc/src/stdio/generic/fflush.cpp | 2 +- libc/src/stdio/generic/fgetc.cpp | 2 +- libc/src/stdio/generic/fgetc_unlocked.cpp | 2 +- libc/src/stdio/generic/fgets.cpp | 2 +- libc/src/stdio/generic/fopen.cpp | 2 +- libc/src/stdio/generic/fputc.cpp | 2 +- libc/src/stdio/generic/fputs.cpp | 2 +- libc/src/stdio/generic/fread.cpp | 2 +- libc/src/stdio/generic/fread_unlocked.cpp | 2 +- libc/src/stdio/generic/fseek.cpp | 2 +- libc/src/stdio/generic/fseeko.cpp | 2 +- libc/src/stdio/generic/ftell.cpp | 2 +- libc/src/stdio/generic/ftello.cpp | 2 +- libc/src/stdio/generic/fwrite.cpp | 2 +- libc/src/stdio/generic/fwrite_unlocked.cpp | 2 +- libc/src/stdio/generic/getc.cpp | 2 +- libc/src/stdio/generic/getc_unlocked.cpp | 2 +- libc/src/stdio/generic/getchar.cpp | 2 +- libc/src/stdio/generic/getchar_unlocked.cpp | 2 +- libc/src/stdio/generic/putc.cpp | 2 +- libc/src/stdio/generic/putchar.cpp | 2 +- libc/src/stdio/generic/puts.cpp | 2 +- libc/src/stdio/gpu/fprintf.cpp | 2 +- libc/src/stdio/gpu/printf.cpp | 2 +- libc/src/stdio/linux/fdopen.cpp | 2 +- libc/src/stdio/linux/remove.cpp | 2 +- libc/src/stdio/linux/rename.cpp | 2 +- libc/src/stdio/printf_core/parser.h | 2 +- libc/src/stdio/setbuf.cpp | 2 +- libc/src/stdio/setvbuf.cpp | 2 +- libc/src/stdlib/atof.cpp | 2 +- libc/src/stdlib/atoi.cpp | 2 +- libc/src/stdlib/atol.cpp | 2 +- libc/src/stdlib/atoll.cpp | 2 +- libc/src/stdlib/strtod.cpp | 2 +- libc/src/stdlib/strtod_l.cpp | 2 +- libc/src/stdlib/strtof.cpp | 2 +- libc/src/stdlib/strtof_l.cpp | 2 +- libc/src/stdlib/strtol.cpp | 2 +- libc/src/stdlib/strtol_l.cpp | 2 +- libc/src/stdlib/strtold.cpp | 2 +- libc/src/stdlib/strtold_l.cpp | 2 +- libc/src/stdlib/strtoll.cpp | 2 +- libc/src/stdlib/strtoll_l.cpp | 2 +- libc/src/stdlib/strtoul.cpp | 2 +- libc/src/stdlib/strtoul_l.cpp | 2 +- libc/src/stdlib/strtoull.cpp | 2 +- libc/src/stdlib/strtoull_l.cpp | 2 +- libc/src/string/strdup.cpp | 2 +- 
libc/src/sys/auxv/linux/getauxval.cpp | 2 +- libc/src/sys/epoll/linux/epoll_create.cpp | 2 +- libc/src/sys/epoll/linux/epoll_create1.cpp | 2 +- libc/src/sys/epoll/linux/epoll_ctl.cpp | 2 +- libc/src/sys/epoll/linux/epoll_pwait.cpp | 2 +- libc/src/sys/epoll/linux/epoll_pwait2.cpp | 2 +- libc/src/sys/epoll/linux/epoll_wait.cpp | 2 +- libc/src/sys/mman/linux/madvise.cpp | 2 +- libc/src/sys/mman/linux/mincore.cpp | 2 +- libc/src/sys/mman/linux/mlock.cpp | 2 +- libc/src/sys/mman/linux/mlock2.cpp | 2 +- libc/src/sys/mman/linux/mlockall.cpp | 2 +- libc/src/sys/mman/linux/mmap.cpp | 2 +- libc/src/sys/mman/linux/mprotect.cpp | 2 +- libc/src/sys/mman/linux/mremap.cpp | 2 +- libc/src/sys/mman/linux/msync.cpp | 2 +- libc/src/sys/mman/linux/munlock.cpp | 2 +- libc/src/sys/mman/linux/munlockall.cpp | 2 +- libc/src/sys/mman/linux/munmap.cpp | 4 +- libc/src/sys/mman/linux/remap_file_pages.cpp | 2 +- libc/src/sys/mman/linux/shm_common.h | 2 +- libc/src/sys/prctl/linux/prctl.cpp | 2 +- libc/src/sys/random/linux/getrandom.cpp | 2 +- libc/src/sys/resource/linux/getrlimit.cpp | 2 +- libc/src/sys/resource/linux/setrlimit.cpp | 2 +- libc/src/sys/select/linux/select.cpp | 2 +- libc/src/sys/sendfile/linux/sendfile.cpp | 2 +- libc/src/sys/socket/linux/bind.cpp | 2 +- libc/src/sys/socket/linux/recv.cpp | 2 +- libc/src/sys/socket/linux/recvfrom.cpp | 2 +- libc/src/sys/socket/linux/recvmsg.cpp | 2 +- libc/src/sys/socket/linux/send.cpp | 2 +- libc/src/sys/socket/linux/sendmsg.cpp | 2 +- libc/src/sys/socket/linux/sendto.cpp | 2 +- libc/src/sys/socket/linux/socket.cpp | 2 +- libc/src/sys/socket/linux/socketpair.cpp | 2 +- libc/src/sys/stat/linux/chmod.cpp | 2 +- libc/src/sys/stat/linux/fchmod.cpp | 2 +- libc/src/sys/stat/linux/fchmodat.cpp | 2 +- libc/src/sys/stat/linux/fstat.cpp | 2 +- libc/src/sys/stat/linux/lstat.cpp | 2 +- libc/src/sys/stat/linux/mkdir.cpp | 2 +- libc/src/sys/stat/linux/mkdirat.cpp | 2 +- libc/src/sys/stat/linux/stat.cpp | 2 +- libc/src/sys/statvfs/linux/statfs_utils.h | 2 +- 
libc/src/sys/time/linux/getitimer.cpp | 2 +- libc/src/sys/time/linux/setitimer.cpp | 2 +- libc/src/sys/time/linux/utimes.cpp | 2 +- libc/src/sys/uio/linux/readv.cpp | 2 +- libc/src/sys/uio/linux/writev.cpp | 2 +- libc/src/sys/utsname/linux/uname.cpp | 2 +- libc/src/sys/wait/wait4Impl.h | 2 +- libc/src/termios/linux/cfsetispeed.cpp | 2 +- libc/src/termios/linux/cfsetospeed.cpp | 2 +- libc/src/termios/linux/tcdrain.cpp | 2 +- libc/src/termios/linux/tcflow.cpp | 2 +- libc/src/termios/linux/tcflush.cpp | 2 +- libc/src/termios/linux/tcgetattr.cpp | 2 +- libc/src/termios/linux/tcgetsid.cpp | 2 +- libc/src/termios/linux/tcsendbreak.cpp | 2 +- libc/src/termios/linux/tcsetattr.cpp | 2 +- libc/src/threads/thrd_create.cpp | 2 +- libc/src/time/linux/clock.cpp | 2 +- libc/src/time/linux/clock_gettime.cpp | 2 +- libc/src/time/linux/gettimeofday.cpp | 2 +- libc/src/time/linux/nanosleep.cpp | 2 +- libc/src/time/linux/timespec_get.cpp | 2 +- libc/src/time/time.cpp | 2 +- libc/src/time/time_utils.h | 2 +- libc/src/time/windows/clock_getres.cpp | 2 +- libc/src/unistd/linux/access.cpp | 2 +- libc/src/unistd/linux/chdir.cpp | 2 +- libc/src/unistd/linux/close.cpp | 2 +- libc/src/unistd/linux/dup.cpp | 2 +- libc/src/unistd/linux/dup2.cpp | 2 +- libc/src/unistd/linux/dup3.cpp | 2 +- libc/src/unistd/linux/execv.cpp | 2 +- libc/src/unistd/linux/execve.cpp | 2 +- libc/src/unistd/linux/fchdir.cpp | 2 +- libc/src/unistd/linux/fork.cpp | 2 +- libc/src/unistd/linux/fsync.cpp | 2 +- libc/src/unistd/linux/ftruncate.cpp | 2 +- libc/src/unistd/linux/getcwd.cpp | 2 +- libc/src/unistd/linux/getentropy.cpp | 2 +- libc/src/unistd/linux/getsid.cpp | 2 +- libc/src/unistd/linux/isatty.cpp | 2 +- libc/src/unistd/linux/link.cpp | 2 +- libc/src/unistd/linux/linkat.cpp | 2 +- libc/src/unistd/linux/lseek.cpp | 2 +- libc/src/unistd/linux/pathconf.cpp | 2 +- libc/src/unistd/linux/pathconf_utils.cpp | 2 +- libc/src/unistd/linux/pipe.cpp | 4 +- libc/src/unistd/linux/pipe2.cpp | 2 +- libc/src/unistd/linux/pread.cpp 
| 6 +- libc/src/unistd/linux/pwrite.cpp | 2 +- libc/src/unistd/linux/read.cpp | 4 +- libc/src/unistd/linux/readlink.cpp | 2 +- libc/src/unistd/linux/readlinkat.cpp | 2 +- libc/src/unistd/linux/rmdir.cpp | 2 +- libc/src/unistd/linux/symlink.cpp | 2 +- libc/src/unistd/linux/symlinkat.cpp | 2 +- libc/src/unistd/linux/syscall.cpp | 2 +- libc/src/unistd/linux/sysconf.cpp | 2 +- libc/src/unistd/linux/truncate.cpp | 2 +- libc/src/unistd/linux/unlink.cpp | 2 +- libc/src/unistd/linux/unlinkat.cpp | 2 +- libc/src/unistd/linux/write.cpp | 2 +- libc/src/unistd/windows/getentropy.cpp | 2 +- libc/test/IntegrationTest/test.h | 9 +- libc/test/UnitTest/ErrnoCheckingTest.h | 4 +- libc/test/UnitTest/ErrnoSetterMatcher.h | 6 +- libc/test/UnitTest/FPMatcher.h | 8 +- libc/test/UnitTest/Test.h | 11 +- .../src/pthread/pthread_create_test.cpp | 4 +- .../src/pthread/pthread_join_test.cpp | 4 +- .../src/pthread/pthread_name_test.cpp | 2 +- .../integration/src/unistd/getcwd_test.cpp | 6 +- .../integration/startup/linux/tls_test.cpp | 2 +- libc/test/src/__support/str_to_fp_test.h | 1 + .../src/__support/str_to_integer_test.cpp | 1 + libc/test/src/dirent/dirent_test.cpp | 10 +- libc/test/src/errno/errno_test.cpp | 4 +- libc/test/src/fcntl/creat_test.cpp | 2 +- libc/test/src/fcntl/fcntl_test.cpp | 4 +- libc/test/src/fcntl/openat_test.cpp | 2 +- libc/test/src/math/RoundToIntegerTest.h | 2 +- libc/test/src/math/acosf_test.cpp | 4 +- libc/test/src/math/acoshf16_test.cpp | 2 +- libc/test/src/math/acoshf_test.cpp | 4 +- libc/test/src/math/asin_test.cpp | 2 +- libc/test/src/math/asinf_test.cpp | 4 +- libc/test/src/math/asinhf_test.cpp | 4 +- libc/test/src/math/atan2f_test.cpp | 2 +- libc/test/src/math/atan_test.cpp | 2 +- libc/test/src/math/atanf_test.cpp | 4 +- libc/test/src/math/atanhf_test.cpp | 4 +- libc/test/src/math/cosf_test.cpp | 4 +- libc/test/src/math/coshf_test.cpp | 6 +- libc/test/src/math/cospif_test.cpp | 4 +- libc/test/src/math/exp10_test.cpp | 4 +- libc/test/src/math/exp10f_test.cpp | 
15 ++- libc/test/src/math/exp10m1f_test.cpp | 8 +- libc/test/src/math/exp2_test.cpp | 4 +- libc/test/src/math/exp2f_test.cpp | 15 ++- libc/test/src/math/exp2m1f_test.cpp | 9 +- libc/test/src/math/exp_test.cpp | 4 +- libc/test/src/math/expf_test.cpp | 15 ++- libc/test/src/math/expm1_test.cpp | 4 +- libc/test/src/math/expm1f_test.cpp | 15 ++- libc/test/src/math/log10_test.cpp | 4 +- libc/test/src/math/log1p_test.cpp | 4 +- libc/test/src/math/log1pf_test.cpp | 4 +- libc/test/src/math/log2_test.cpp | 4 +- libc/test/src/math/log2f_test.cpp | 7 +- libc/test/src/math/log_test.cpp | 4 +- libc/test/src/math/powf_test.cpp | 2 +- libc/test/src/math/sin_test.cpp | 2 +- libc/test/src/math/sincosf_test.cpp | 4 +- libc/test/src/math/sinf_test.cpp | 4 +- libc/test/src/math/sinhf_test.cpp | 6 +- libc/test/src/math/sinpif_test.cpp | 4 +- libc/test/src/math/smoke/FModTest.h | 2 +- libc/test/src/math/smoke/RoundToIntegerTest.h | 2 +- libc/test/src/math/smoke/acos_test.cpp | 4 +- libc/test/src/math/smoke/acosf16_test.cpp | 4 +- libc/test/src/math/smoke/acosf_test.cpp | 4 +- libc/test/src/math/smoke/acoshf16_test.cpp | 4 +- libc/test/src/math/smoke/acoshf_test.cpp | 4 +- libc/test/src/math/smoke/acospif16_test.cpp | 4 +- libc/test/src/math/smoke/asinf16_test.cpp | 4 +- libc/test/src/math/smoke/asinf_test.cpp | 4 +- libc/test/src/math/smoke/asinhf16_test.cpp | 4 +- libc/test/src/math/smoke/asinhf_test.cpp | 4 +- libc/test/src/math/smoke/atan2f_test.cpp | 4 +- libc/test/src/math/smoke/atanf16_test.cpp | 4 +- libc/test/src/math/smoke/atanf_test.cpp | 4 +- libc/test/src/math/smoke/atanhf16_test.cpp | 4 +- libc/test/src/math/smoke/atanhf_test.cpp | 4 +- libc/test/src/math/smoke/cosf16_test.cpp | 4 +- libc/test/src/math/smoke/cosf_test.cpp | 4 +- libc/test/src/math/smoke/coshf16_test.cpp | 6 +- libc/test/src/math/smoke/coshf_test.cpp | 6 +- libc/test/src/math/smoke/cospif16_test.cpp | 4 +- libc/test/src/math/smoke/cospif_test.cpp | 4 +- libc/test/src/math/smoke/exp10_test.cpp | 2 +- 
libc/test/src/math/smoke/exp10f16_test.cpp | 8 +- libc/test/src/math/smoke/exp10f_test.cpp | 6 +- libc/test/src/math/smoke/exp10m1f16_test.cpp | 8 +- libc/test/src/math/smoke/exp10m1f_test.cpp | 8 +- libc/test/src/math/smoke/exp2_test.cpp | 2 +- libc/test/src/math/smoke/exp2f16_test.cpp | 8 +- libc/test/src/math/smoke/exp2f_test.cpp | 6 +- libc/test/src/math/smoke/exp2m1f16_test.cpp | 8 +- libc/test/src/math/smoke/exp2m1f_test.cpp | 8 +- libc/test/src/math/smoke/exp_test.cpp | 2 +- libc/test/src/math/smoke/expf16_test.cpp | 8 +- libc/test/src/math/smoke/expf_test.cpp | 6 +- libc/test/src/math/smoke/expm1_test.cpp | 2 +- libc/test/src/math/smoke/expm1f16_test.cpp | 8 +- libc/test/src/math/smoke/expm1f_test.cpp | 6 +- libc/test/src/math/smoke/log10_test.cpp | 2 +- libc/test/src/math/smoke/log10f16_test.cpp | 4 +- libc/test/src/math/smoke/log1p_test.cpp | 2 +- libc/test/src/math/smoke/log1pf_test.cpp | 2 +- libc/test/src/math/smoke/log2_test.cpp | 2 +- libc/test/src/math/smoke/log2f16_test.cpp | 4 +- libc/test/src/math/smoke/log2f_test.cpp | 2 +- libc/test/src/math/smoke/log_test.cpp | 2 +- libc/test/src/math/smoke/logf16_test.cpp | 4 +- libc/test/src/math/smoke/sincosf_test.cpp | 4 +- libc/test/src/math/smoke/sinf16_test.cpp | 4 +- libc/test/src/math/smoke/sinf_test.cpp | 4 +- libc/test/src/math/smoke/sinhf16_test.cpp | 6 +- libc/test/src/math/smoke/sinhf_test.cpp | 6 +- libc/test/src/math/smoke/sinpif16_test.cpp | 4 +- libc/test/src/math/smoke/sinpif_test.cpp | 4 +- libc/test/src/math/smoke/tanf16_test.cpp | 4 +- libc/test/src/math/smoke/tanf_test.cpp | 4 +- libc/test/src/math/smoke/tanhf16_test.cpp | 6 +- libc/test/src/math/smoke/tanhf_test.cpp | 4 +- libc/test/src/math/smoke/tanpif16_test.cpp | 4 +- libc/test/src/math/tanf_test.cpp | 4 +- libc/test/src/math/tanhf_test.cpp | 4 +- libc/test/src/poll/poll_test.cpp | 6 +- libc/test/src/sched/affinity_test.cpp | 10 +- libc/test/src/sched/cpu_count_test.cpp | 4 +- libc/test/src/sched/get_priority_test.cpp | 4 +- 
.../src/sched/param_and_scheduler_test.cpp | 49 ++++---- .../src/sched/sched_rr_get_interval_test.cpp | 10 +- libc/test/src/sched/yield_test.cpp | 4 +- libc/test/src/signal/sigaltstack_test.cpp | 4 +- libc/test/src/signal/signal_test.cpp | 4 +- libc/test/src/signal/sigprocmask_test.cpp | 4 +- .../spawn/posix_spawn_file_actions_test.cpp | 2 +- libc/test/src/stdio/fdopen_test.cpp | 10 +- libc/test/src/stdio/fgetc_test.cpp | 4 +- libc/test/src/stdio/fgetc_unlocked_test.cpp | 4 +- libc/test/src/stdio/fgets_test.cpp | 4 +- libc/test/src/stdio/fileop_test.cpp | 24 ++-- libc/test/src/stdio/fopencookie_test.cpp | 10 +- libc/test/src/stdio/remove_test.cpp | 6 +- libc/test/src/stdio/rename_test.cpp | 4 +- libc/test/src/stdio/setvbuf_test.cpp | 4 +- libc/test/src/stdio/sprintf_test.cpp | 76 ++++++------ libc/test/src/stdio/unlocked_fileop_test.cpp | 6 +- libc/test/src/stdlib/StrtolTest.h | 1 + libc/test/src/stdlib/strtoint32_test.cpp | 6 +- libc/test/src/stdlib/strtoint64_test.cpp | 6 +- libc/test/src/stdlib/strtold_test.cpp | 1 + libc/test/src/sys/mman/linux/mlock_test.cpp | 17 ++- .../src/sys/statvfs/linux/fstatvfs_test.cpp | 4 +- .../src/sys/statvfs/linux/statvfs_test.cpp | 4 +- libc/test/src/sys/time/setitimer_test.cpp | 2 +- libc/test/src/termios/termios_test.cpp | 12 +- libc/test/src/time/asctime_r_test.cpp | 2 +- libc/test/src/time/asctime_test.cpp | 2 +- libc/test/src/time/ctime_r_test.cpp | 2 +- libc/test/src/time/ctime_test.cpp | 2 +- libc/test/src/time/gmtime_test.cpp | 4 +- libc/test/src/time/nanosleep_test.cpp | 4 +- .../llvm-project-overlay/libc/BUILD.bazel | 3 +- 397 files changed, 829 insertions(+), 783 deletions(-) create mode 100644 libc/shared/libc_common.h create mode 100644 libc/src/__support/libc_errno.h delete mode 100644 libc/src/errno/libc_errno.h diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 0facb0b9be0c..a98e7276bef8 100644 --- 
a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -106,6 +106,10 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}") endif() + if(LIBC_CONF_ERRNO_MODE) + list(APPEND config_options "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}") + endif() + set(${output_var} ${config_options} PARENT_SCOPE) endfunction(_get_compile_options_from_config) diff --git a/libc/config/config.json b/libc/config/config.json index bfe956855cb5..d53b2936edb0 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -2,7 +2,7 @@ "errno": { "LIBC_CONF_ERRNO_MODE": { "value": "LIBC_ERRNO_MODE_DEFAULT", - "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM." + "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE." } }, "printf": { diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst index 0bd3a69ae3ff..86247966552f 100644 --- a/libc/docs/dev/code_style.rst +++ b/libc/docs/dev/code_style.rst @@ -101,7 +101,7 @@ test infrastructure itself can be affected. To avoid perturbing the unit test infrastructure around the setting of ``errno``, the following rules are to be followed: -#. A special macro named ``libc_errno`` defined in ``src/errno/libc_errno.h`` +#. A special macro named ``libc_errno`` defined in ``src/__support/libc_errno.h`` should be used when setting ``errno`` from libc runtime code. For example, code to set ``errno`` to ``EINVAL`` should be: @@ -117,7 +117,7 @@ followed: `ErrorOr `_ to return error values. -#. 
The header file ``src/errno/libc_errno.h`` is shipped as part of the target +#. The header file ``src/__support/libc_errno.h`` is shipped as part of the target corresponding to the ``errno`` entrypoint ``libc.src.errno.errno``. We do not in general allow dependencies between entrypoints. However, the ``errno`` entrypoint is the only exceptional entrypoint on which other entrypoints diff --git a/libc/shared/fp_bits.h b/libc/shared/fp_bits.h index 2898c508b777..e6bb1e17b80c 100644 --- a/libc/shared/fp_bits.h +++ b/libc/shared/fp_bits.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_FP_BITS_H #define LLVM_LIBC_SHARED_FP_BITS_H +#include "libc_common.h" #include "src/__support/FPUtil/FPBits.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/shared/libc_common.h b/libc/shared/libc_common.h new file mode 100644 index 000000000000..c4560bbb0276 --- /dev/null +++ b/libc/shared/libc_common.h @@ -0,0 +1,26 @@ +//===-- Common defines for sharing LLVM libc with LLVM projects -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_LIBC_COMMON_H +#define LLVM_LIBC_SHARED_LIBC_COMMON_H + +// Use system errno. +#ifdef LIBC_ERRNO_MODE +#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE +#error \ + "LIBC_ERRNO_MODE was set to something different from LIBC_ERRNO_MODE_SYSTEM_INLINE." 
+#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE +#else +#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM_INLINE +#endif // LIBC_ERRNO_MODE + +#ifndef LIBC_NAMESPACE +#define LIBC_NAMESPACE __llvm_libc +#endif // LIBC_NAMESPACE + +#endif // LLVM_LIBC_SHARED_LIBC_COMMON_H diff --git a/libc/shared/rpc_server.h b/libc/shared/rpc_server.h index 5509094b944a..46e35f13f0ea 100644 --- a/libc/shared/rpc_server.h +++ b/libc/shared/rpc_server.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_RPC_SERVER_H #define LLVM_LIBC_SHARED_RPC_SERVER_H +#include "libc_common.h" #include "src/__support/RPC/rpc_server.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/shared/str_to_float.h b/libc/shared/str_to_float.h index b133a28e26ef..dcc6027d6c77 100644 --- a/libc/shared/str_to_float.h +++ b/libc/shared/str_to_float.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_STR_TO_FLOAT_H #define LLVM_LIBC_SHARED_STR_TO_FLOAT_H +#include "libc_common.h" #include "src/__support/str_to_float.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/shared/str_to_integer.h b/libc/shared/str_to_integer.h index 15bee698d5a6..6ed38c932662 100644 --- a/libc/shared/str_to_integer.h +++ b/libc/shared/str_to_integer.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_STR_TO_INTEGER_H #define LLVM_LIBC_SHARED_STR_TO_INTEGER_H +#include "libc_common.h" #include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index f92499fdbf45..327ff5e0c6a3 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -1,6 +1,15 @@ add_subdirectory(CPP) add_subdirectory(macros) +add_header_library( + libc_errno + HDRS + libc_errno.h + DEPENDS + libc.hdr.errno_macros + libc.src.__support.macros.config +) + add_header_library( block HDRS diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h index 4c8f34a435bd..50a101f833c5 100644 --- a/libc/src/__support/FPUtil/FEnvImpl.h +++ 
b/libc/src/__support/FPUtil/FEnvImpl.h @@ -12,10 +12,10 @@ #include "hdr/fenv_macros.h" #include "hdr/math_macros.h" #include "hdr/types/fenv_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#include "src/errno/libc_errno.h" #if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP) #if defined(__APPLE__) diff --git a/libc/src/__support/File/dir.cpp b/libc/src/__support/File/dir.cpp index 21b0106f7010..aea8862c15f7 100644 --- a/libc/src/__support/File/dir.cpp +++ b/libc/src/__support/File/dir.cpp @@ -11,8 +11,8 @@ #include "src/__support/CPP/mutex.h" // lock_guard #include "src/__support/CPP/new.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" // For error macros namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp index 528542cccf32..303852dbbb71 100644 --- a/libc/src/__support/File/file.cpp +++ b/libc/src/__support/File/file.cpp @@ -13,8 +13,8 @@ #include "hdr/types/off_t.h" #include "src/__support/CPP/new.h" #include "src/__support/CPP/span.h" +#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" // For error macros namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp index 824c1f200e8c..761e352f74ea 100644 --- a/libc/src/__support/File/linux/file.cpp +++ b/libc/src/__support/File/linux/file.cpp @@ -15,8 +15,8 @@ #include "src/__support/File/linux/lseekImpl.h" #include "src/__support/OSUtil/fcntl.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
+#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" // For error macros #include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall #include // For S_IS*, S_IF*, and S_IR* flags. diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h index a034913d9f6e..300e5c5dd55b 100644 --- a/libc/src/__support/File/linux/lseekImpl.h +++ b/libc/src/__support/File/linux/lseekImpl.h @@ -13,8 +13,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For uint64_t. #include // For syscall numbers. diff --git a/libc/src/__support/HashTable/randomness.h b/libc/src/__support/HashTable/randomness.h index 244dd41be3ee..6b58a4125f78 100644 --- a/libc/src/__support/HashTable/randomness.h +++ b/libc/src/__support/HashTable/randomness.h @@ -14,7 +14,7 @@ #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" #if defined(LIBC_HASHTABLE_USE_GETRANDOM) -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sys/random/getrandom.h" #endif diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index 4742b2a00220..99e16ad58c91 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -15,8 +15,8 @@ #include "hdr/types/struct_flock64.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp index 8c9bd3e1bcc7..e4e53c3c2a0f 100644 --- a/libc/src/__support/OSUtil/linux/vdso.cpp +++ b/libc/src/__support/OSUtil/linux/vdso.cpp @@ -11,9 +11,9 @@ #include "src/__support/CPP/array.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/libc_errno.h" #include "src/__support/threads/callonce.h" #include "src/__support/threads/linux/futex_word.h" -#include "src/errno/libc_errno.h" #include "src/sys/auxv/getauxval.h" #include diff --git a/libc/src/__support/StringUtil/tables/linux_extension_errors.h b/libc/src/__support/StringUtil/tables/linux_extension_errors.h index 425590f6e91c..de637d60bea9 100644 --- a/libc/src/__support/StringUtil/tables/linux_extension_errors.h +++ b/libc/src/__support/StringUtil/tables/linux_extension_errors.h @@ -10,8 +10,8 @@ #define LLVM_LIBC_SRC___SUPPORT_STRINGUTIL_TABLES_LINUX_EXTENSION_ERRORS_H #include "src/__support/StringUtil/message_mapper.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/libc_errno.h b/libc/src/__support/libc_errno.h new file mode 100644 index 000000000000..ab5f6a9c4b9d --- /dev/null +++ b/libc/src/__support/libc_errno.h @@ -0,0 +1,108 @@ +//===-- Implementation header for libc_errno --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H +#define LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H + +// This header is to be consumed by internal implementations, in which all of +// them should refer to `libc_errno` instead of using `errno` directly from +// <errno.h> header. + +// Unit and hermetic tests should: +// - #include "src/__support/libc_errno.h" +// - NOT #include <errno.h> +// - Only use `libc_errno` in the code +// - Depend on libc.src.errno.errno + +// Integration tests should: +// - NOT #include "src/__support/libc_errno.h" +// - #include <errno.h> +// - Use regular `errno` in the code +// - Still depend on libc.src.errno.errno + +// libc uses a fallback default value, either system or thread local. +#define LIBC_ERRNO_MODE_DEFAULT 0 +// libc never stores a value; `errno` macro uses get a link-time failure. +#define LIBC_ERRNO_MODE_UNDEFINED 1 +// libc maintains per-thread state (requires C++ `thread_local` support). +#define LIBC_ERRNO_MODE_THREAD_LOCAL 2 +// libc maintains shared state used by all threads, contrary to standard C +// semantics unless always single-threaded; nothing prevents data races. +#define LIBC_ERRNO_MODE_SHARED 3 +// libc doesn't maintain any internal state, instead the embedder must define +// `int *__llvm_libc_errno(void);` C function. +#define LIBC_ERRNO_MODE_EXTERNAL 4 +// libc uses system `<errno.h>` `errno` macro directly in the overlay mode; in +// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`. +// In this mode, the public C++ symbol `LIBC_NAMESPACE::libc_errno` is still +// exported and gets redirected to the system `errno` inside its implementation. + +// TODO: Investigate deprecating LIBC_ERRNO_MODE_SYSTEM in favor of +// LIBC_ERRNO_MODE_SYSTEM_INLINE.
+// https://github.com/llvm/llvm-project/issues/143454 +#define LIBC_ERRNO_MODE_SYSTEM 5 +// In this mode, the libc_errno is simply a macro resolved to `errno` from the +// system header <errno.h>. There is no need to link against the +// `libc.src.errno.errno` object. +#define LIBC_ERRNO_MODE_SYSTEM_INLINE 6 + +#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT +#undef LIBC_ERRNO_MODE +#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING) +#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL +#else +#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM +#endif +#endif // LIBC_ERRNO_MODE + +#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE +#error LIBC_ERRNO_MODE must be one of the following values: \ +LIBC_ERRNO_MODE_DEFAULT, \ +LIBC_ERRNO_MODE_UNDEFINED, \ +LIBC_ERRNO_MODE_THREAD_LOCAL, \ +LIBC_ERRNO_MODE_SHARED, \ +LIBC_ERRNO_MODE_EXTERNAL, \ +LIBC_ERRNO_MODE_SYSTEM, \ +LIBC_ERRNO_MODE_SYSTEM_INLINE.
+#endif + +#if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_SYSTEM_INLINE + +#include <errno.h> + +#define libc_errno errno + +#else // !LIBC_ERRNO_MODE_SYSTEM_INLINE + +#include "hdr/errno_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +extern "C" int *__llvm_libc_errno() noexcept; + +struct Errno { + void operator=(int); + operator int(); +}; + +extern Errno libc_errno; + +} // namespace LIBC_NAMESPACE_DECL + +using LIBC_NAMESPACE::libc_errno; + +#endif // LIBC_ERRNO_MODE_SYSTEM_INLINE + +#endif // LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp index c531d74c5335..baad26aed685 100644 --- a/libc/src/__support/threads/linux/thread.cpp +++ b/libc/src/__support/threads/linux/thread.cpp @@ -14,9 +14,9 @@ #include "src/__support/OSUtil/syscall.h" // For syscall functions. #include "src/__support/common.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" #include "src/__support/threads/linux/futex_utils.h" // For FutexWordType -#include "src/errno/libc_errno.h" // For error macros #ifdef LIBC_TARGET_ARCH_IS_AARCH64 #include diff --git a/libc/src/dirent/closedir.cpp b/libc/src/dirent/closedir.cpp index 1249ef94cf41..2f8f6f0c044d 100644 --- a/libc/src/dirent/closedir.cpp +++ b/libc/src/dirent/closedir.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/dir.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/dirent/opendir.cpp b/libc/src/dirent/opendir.cpp index fee14ef0f558..bf47d0edac18 100644 --- a/libc/src/dirent/opendir.cpp +++ b/libc/src/dirent/opendir.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/dir.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include
"src/errno/libc_errno.h" #include diff --git a/libc/src/dirent/readdir.cpp b/libc/src/dirent/readdir.cpp index ad460b5e80b8..f95f7c1ae864 100644 --- a/libc/src/dirent/readdir.cpp +++ b/libc/src/dirent/readdir.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/dir.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/errno/CMakeLists.txt b/libc/src/errno/CMakeLists.txt index 1d78a5eedff9..2852044e9416 100644 --- a/libc/src/errno/CMakeLists.txt +++ b/libc/src/errno/CMakeLists.txt @@ -1,28 +1,16 @@ # If we are in full build mode, we will provide the errno definition ourselves, # and if we are in overlay mode, we will just re-use the system's errno. -# We are passing LIBC_FULL_BUILD flag in full build mode so that the -# implementation of libc_errno will know if we are in full build mode or not. - -# TODO: Move LIBC_FULL_BUILD flag to _get_common_compile_options. -set(full_build_flag "") -if(LLVM_LIBC_FULL_BUILD) - set(full_build_flag "-DLIBC_FULL_BUILD") -endif() - -if(LIBC_CONF_ERRNO_MODE) - set(errno_config_copts "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}") -endif() add_entrypoint_object( errno SRCS libc_errno.cpp HDRS - libc_errno.h # Include this - COMPILE_OPTIONS - ${full_build_flag} - ${errno_config_copts} + ../__support/libc_errno.h DEPENDS libc.hdr.errno_macros libc.src.__support.common + libc.src.__support.libc_errno + libc.src.__support.macros.attributes + libc.src.__support.macros.config ) diff --git a/libc/src/errno/libc_errno.cpp b/libc/src/errno/libc_errno.cpp index d1600d1b050e..8ff1eec1b103 100644 --- a/libc/src/errno/libc_errno.cpp +++ b/libc/src/errno/libc_errno.cpp @@ -6,51 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "libc_errno.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" -// libc 
uses a fallback default value, either system or thread local. -#define LIBC_ERRNO_MODE_DEFAULT 0 -// libc never stores a value; `errno` macro uses get link-time failure. -#define LIBC_ERRNO_MODE_UNDEFINED 1 -// libc maintains per-thread state (requires C++ `thread_local` support). -#define LIBC_ERRNO_MODE_THREAD_LOCAL 2 -// libc maintains shared state used by all threads, contrary to standard C -// semantics unless always single-threaded; nothing prevents data races. -#define LIBC_ERRNO_MODE_SHARED 3 -// libc doesn't maintain any internal state, instead the embedder must define -// `int *__llvm_libc_errno(void);` C function. -#define LIBC_ERRNO_MODE_EXTERNAL 4 -// libc uses system `` `errno` macro directly in the overlay mode; in -// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`. -#define LIBC_ERRNO_MODE_SYSTEM 5 - -#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT -#undef LIBC_ERRNO_MODE -#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING) -#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL -#else -#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM -#endif -#endif // LIBC_ERRNO_MODE - -#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM -#error LIBC_ERRNO_MODE must be one of the following values: \ -LIBC_ERRNO_MODE_DEFAULT, \ -LIBC_ERRNO_MODE_UNDEFINED, \ -LIBC_ERRNO_MODE_THREAD_LOCAL, \ -LIBC_ERRNO_MODE_SHARED, \ -LIBC_ERRNO_MODE_EXTERNAL, \ -LIBC_ERRNO_MODE_SYSTEM -#endif - namespace LIBC_NAMESPACE_DECL { +#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE + #if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_UNDEFINED void Errno::operator=(int) {} @@ -93,4 +56,6 @@ Errno::operator int() { return errno; } // Define the global `libc_errno` instance. 
Errno libc_errno; +#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE + } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/errno/libc_errno.h b/libc/src/errno/libc_errno.h deleted file mode 100644 index 44ee2714843b..000000000000 --- a/libc/src/errno/libc_errno.h +++ /dev/null @@ -1,47 +0,0 @@ -//===-- Implementation header for libc_errno --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H -#define LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H - -#include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/properties/architectures.h" - -#include "hdr/errno_macros.h" - -// This header is to be consumed by internal implementations, in which all of -// them should refer to `libc_errno` instead of using `errno` directly from -// header. 
- -// Unit and hermetic tests should: -// - #include "src/errno/libc_errno.h" -// - NOT #include -// - Only use `libc_errno` in the code -// - Depend on libc.src.errno.errno - -// Integration tests should: -// - NOT #include "src/errno/libc_errno.h" -// - #include -// - Use regular `errno` in the code -// - Still depend on libc.src.errno.errno - -namespace LIBC_NAMESPACE_DECL { - -extern "C" int *__llvm_libc_errno() noexcept; - -struct Errno { - void operator=(int); - operator int(); -}; - -extern Errno libc_errno; - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp index 23abae243aed..71412a8e68c5 100644 --- a/libc/src/fcntl/linux/creat.cpp +++ b/libc/src/fcntl/linux/creat.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp index 8b699ecdd204..a21a03788dea 100644 --- a/libc/src/fcntl/linux/open.cpp +++ b/libc/src/fcntl/linux/open.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include "hdr/types/mode_t.h" diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp index 6063d9c00ad6..b47ad1fb3bb0 100644 --- a/libc/src/fcntl/linux/openat.cpp +++ b/libc/src/fcntl/linux/openat.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/types/mode_t.h" #include diff --git a/libc/src/inttypes/strtoimax.cpp b/libc/src/inttypes/strtoimax.cpp index 85f197c75d90..6e55a4b56aac 100644 --- a/libc/src/inttypes/strtoimax.cpp +++ b/libc/src/inttypes/strtoimax.cpp @@ -8,9 +8,9 @@ #include "src/inttypes/strtoimax.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/inttypes/strtoumax.cpp b/libc/src/inttypes/strtoumax.cpp index 2e9cbc9acba7..ce5a0a782d97 100644 --- a/libc/src/inttypes/strtoumax.cpp +++ b/libc/src/inttypes/strtoumax.cpp @@ -8,9 +8,9 @@ #include "src/inttypes/strtoumax.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/exp10m1f.cpp b/libc/src/math/generic/exp10m1f.cpp index e973b2921c2e..27729104e038 100644 --- a/libc/src/math/generic/exp10m1f.cpp +++ b/libc/src/math/generic/exp10m1f.cpp @@ -14,9 +14,9 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" -#include "src/errno/libc_errno.h" #include "explogxf.h" diff --git a/libc/src/math/generic/exp2m1f.cpp b/libc/src/math/generic/exp2m1f.cpp index 4913a5e4277e..127c6eaa494d 100644 --- a/libc/src/math/generic/exp2m1f.cpp +++ b/libc/src/math/generic/exp2m1f.cpp @@ -14,10 +14,10 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" #include 
"src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/macros/properties/cpu_features.h" -#include "src/errno/libc_errno.h" #include "explogxf.h" diff --git a/libc/src/math/generic/nan.cpp b/libc/src/math/generic/nan.cpp index f92cd3ff5eb5..829a2ea435ac 100644 --- a/libc/src/math/generic/nan.cpp +++ b/libc/src/math/generic/nan.cpp @@ -8,9 +8,9 @@ #include "src/math/nan.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanf.cpp b/libc/src/math/generic/nanf.cpp index 7287182406ac..1cb66160e736 100644 --- a/libc/src/math/generic/nanf.cpp +++ b/libc/src/math/generic/nanf.cpp @@ -8,9 +8,9 @@ #include "src/math/nanf.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanf128.cpp b/libc/src/math/generic/nanf128.cpp index 3d8581afa037..4155c5333a9c 100644 --- a/libc/src/math/generic/nanf128.cpp +++ b/libc/src/math/generic/nanf128.cpp @@ -8,9 +8,9 @@ #include "src/math/nanf128.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanf16.cpp b/libc/src/math/generic/nanf16.cpp index 27d9d165f4a8..7b166400601b 100644 --- a/libc/src/math/generic/nanf16.cpp +++ b/libc/src/math/generic/nanf16.cpp @@ -8,9 +8,9 @@ #include "src/math/nanf16.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" 
#include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanl.cpp b/libc/src/math/generic/nanl.cpp index 4f698cb3c88d..58d638c4b531 100644 --- a/libc/src/math/generic/nanl.cpp +++ b/libc/src/math/generic/nanl.cpp @@ -8,9 +8,9 @@ #include "src/math/nanl.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/poll/linux/poll.cpp b/libc/src/poll/linux/poll.cpp index f82fcbcc6577..4cac75b9687c 100644 --- a/libc/src/poll/linux/poll.cpp +++ b/libc/src/poll/linux/poll.cpp @@ -13,8 +13,8 @@ #include "hdr/types/struct_timespec.h" #include "src/__support/OSUtil/syscall.h" // syscall_impl #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // SYS_poll, SYS_ppoll diff --git a/libc/src/pthread/pthread_atfork.cpp b/libc/src/pthread/pthread_atfork.cpp index b2c67c78e5d9..4cad16a02de7 100644 --- a/libc/src/pthread/pthread_atfork.cpp +++ b/libc/src/pthread/pthread_atfork.cpp @@ -9,9 +9,9 @@ #include "pthread_atfork.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/fork_callbacks.h" -#include "src/errno/libc_errno.h" #include // For pthread_* type definitions. 
diff --git a/libc/src/pthread/pthread_attr_setdetachstate.cpp b/libc/src/pthread/pthread_attr_setdetachstate.cpp index 872f694e01f3..c482d25610c2 100644 --- a/libc/src/pthread/pthread_attr_setdetachstate.cpp +++ b/libc/src/pthread/pthread_attr_setdetachstate.cpp @@ -9,8 +9,8 @@ #include "pthread_attr_setdetachstate.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_attr_setguardsize.cpp b/libc/src/pthread/pthread_attr_setguardsize.cpp index fa4375e915ab..c996210a61d8 100644 --- a/libc/src/pthread/pthread_attr_setguardsize.cpp +++ b/libc/src/pthread/pthread_attr_setguardsize.cpp @@ -9,8 +9,8 @@ #include "pthread_attr_setguardsize.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For EXEC_PAGESIZE. #include diff --git a/libc/src/pthread/pthread_attr_setstack.cpp b/libc/src/pthread/pthread_attr_setstack.cpp index 1154055a63a7..767f959b1400 100644 --- a/libc/src/pthread/pthread_attr_setstack.cpp +++ b/libc/src/pthread/pthread_attr_setstack.cpp @@ -10,9 +10,9 @@ #include "pthread_attr_setstacksize.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" // For STACK_ALIGNMENT -#include "src/errno/libc_errno.h" #include #include diff --git a/libc/src/pthread/pthread_attr_setstacksize.cpp b/libc/src/pthread/pthread_attr_setstacksize.cpp index 0a5d1af661ab..38c77ca761d6 100644 --- a/libc/src/pthread/pthread_attr_setstacksize.cpp +++ b/libc/src/pthread/pthread_attr_setstacksize.cpp @@ -9,8 +9,8 @@ #include "pthread_attr_setstacksize.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git 
a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp index 5e825d5ecea6..2f63d5e9d194 100644 --- a/libc/src/pthread/pthread_condattr_setclock.cpp +++ b/libc/src/pthread/pthread_condattr_setclock.cpp @@ -9,8 +9,8 @@ #include "pthread_condattr_setclock.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME #include // pthread_condattr_t diff --git a/libc/src/pthread/pthread_condattr_setpshared.cpp b/libc/src/pthread/pthread_condattr_setpshared.cpp index 433b2dc1d2d9..9c117499a559 100644 --- a/libc/src/pthread/pthread_condattr_setpshared.cpp +++ b/libc/src/pthread/pthread_condattr_setpshared.cpp @@ -9,8 +9,8 @@ #include "pthread_condattr_setpshared.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // pthread_condattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE diff --git a/libc/src/pthread/pthread_create.cpp b/libc/src/pthread/pthread_create.cpp index e1b1f3b325d1..45be2807fa83 100644 --- a/libc/src/pthread/pthread_create.cpp +++ b/libc/src/pthread/pthread_create.cpp @@ -16,10 +16,10 @@ #include "pthread_attr_getstack.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include // For pthread_* type definitions. 
diff --git a/libc/src/pthread/pthread_key_create.cpp b/libc/src/pthread/pthread_key_create.cpp index 383762f273e7..7253de14cc0d 100644 --- a/libc/src/pthread/pthread_key_create.cpp +++ b/libc/src/pthread/pthread_key_create.cpp @@ -9,9 +9,9 @@ #include "pthread_key_create.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_key_delete.cpp b/libc/src/pthread/pthread_key_delete.cpp index b54db821ab05..2b14d874fe31 100644 --- a/libc/src/pthread/pthread_key_delete.cpp +++ b/libc/src/pthread/pthread_key_delete.cpp @@ -9,9 +9,9 @@ #include "pthread_key_delete.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_mutexattr_setpshared.cpp b/libc/src/pthread/pthread_mutexattr_setpshared.cpp index deeae15be230..a87a08259c4b 100644 --- a/libc/src/pthread/pthread_mutexattr_setpshared.cpp +++ b/libc/src/pthread/pthread_mutexattr_setpshared.cpp @@ -10,8 +10,8 @@ #include "pthread_mutexattr.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_mutexattr_setrobust.cpp b/libc/src/pthread/pthread_mutexattr_setrobust.cpp index 9fd46f4c928d..fd7a8d7ce1d1 100644 --- a/libc/src/pthread/pthread_mutexattr_setrobust.cpp +++ b/libc/src/pthread/pthread_mutexattr_setrobust.cpp @@ -10,8 +10,8 @@ #include "pthread_mutexattr.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_mutexattr_settype.cpp 
b/libc/src/pthread/pthread_mutexattr_settype.cpp index c7e78271f9c3..5a65f031045d 100644 --- a/libc/src/pthread/pthread_mutexattr_settype.cpp +++ b/libc/src/pthread/pthread_mutexattr_settype.cpp @@ -10,8 +10,8 @@ #include "pthread_mutexattr.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp index 112ff5c9cdad..fcddfed22490 100644 --- a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp +++ b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp @@ -9,11 +9,11 @@ #include "src/pthread/pthread_rwlock_timedrdlock.h" #include "src/__support/common.h" #include "src/__support/libc_assert.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/threads/linux/rwlock.h" #include "src/__support/time/linux/abs_timeout.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlock_trywrlock.cpp b/libc/src/pthread/pthread_rwlock_trywrlock.cpp index a63dc893e716..660c15a87b36 100644 --- a/libc/src/pthread/pthread_rwlock_trywrlock.cpp +++ b/libc/src/pthread/pthread_rwlock_trywrlock.cpp @@ -9,9 +9,9 @@ #include "src/pthread/pthread_rwlock_trywrlock.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/linux/rwlock.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlock_unlock.cpp b/libc/src/pthread/pthread_rwlock_unlock.cpp index e61290179bd6..5496bea929c5 100644 --- a/libc/src/pthread/pthread_rwlock_unlock.cpp +++ b/libc/src/pthread/pthread_rwlock_unlock.cpp @@ -9,9 +9,9 @@ #include "src/pthread/pthread_rwlock_unlock.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include 
"src/__support/macros/config.h" #include "src/__support/threads/linux/rwlock.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp index 80d34a35c717..e6800311b858 100644 --- a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp +++ b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp @@ -9,8 +9,8 @@ #include "pthread_rwlockattr_setkind_np.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // pthread_rwlockattr_t diff --git a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp index 5a7191aefd3d..4fbd095ac2b4 100644 --- a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp +++ b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp @@ -9,8 +9,8 @@ #include "pthread_rwlockattr_setpshared.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // pthread_rwlockattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE diff --git a/libc/src/pthread/pthread_setspecific.cpp b/libc/src/pthread/pthread_setspecific.cpp index 70c29c167084..b147a66d2fad 100644 --- a/libc/src/pthread/pthread_setspecific.cpp +++ b/libc/src/pthread/pthread_setspecific.cpp @@ -9,9 +9,9 @@ #include "pthread_setspecific.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/sched/linux/sched_get_priority_max.cpp b/libc/src/sched/linux/sched_get_priority_max.cpp index 77a82c77405f..fb30b1e319e7 100644 --- a/libc/src/sched/linux/sched_get_priority_max.cpp +++ b/libc/src/sched/linux/sched_get_priority_max.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // 
For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_get_priority_min.cpp b/libc/src/sched/linux/sched_get_priority_min.cpp index fca66a15edb5..54f67e915fc1 100644 --- a/libc/src/sched/linux/sched_get_priority_min.cpp +++ b/libc/src/sched/linux/sched_get_priority_min.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_getaffinity.cpp b/libc/src/sched/linux/sched_getaffinity.cpp index 7b1fd8c5aa2a..e005819e2a97 100644 --- a/libc/src/sched/linux/sched_getaffinity.cpp +++ b/libc/src/sched/linux/sched_getaffinity.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include diff --git a/libc/src/sched/linux/sched_getparam.cpp b/libc/src/sched/linux/sched_getparam.cpp index 75756a65f0ed..b0576c3ac65b 100644 --- a/libc/src/sched/linux/sched_getparam.cpp +++ b/libc/src/sched/linux/sched_getparam.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
diff --git a/libc/src/sched/linux/sched_getscheduler.cpp b/libc/src/sched/linux/sched_getscheduler.cpp index 545cda8e7484..d8e02967a633 100644 --- a/libc/src/sched/linux/sched_getscheduler.cpp +++ b/libc/src/sched/linux/sched_getscheduler.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_rr_get_interval.cpp b/libc/src/sched/linux/sched_rr_get_interval.cpp index 1f0ef69dfc89..5668d596bce1 100644 --- a/libc/src/sched/linux/sched_rr_get_interval.cpp +++ b/libc/src/sched/linux/sched_rr_get_interval.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_setaffinity.cpp b/libc/src/sched/linux/sched_setaffinity.cpp index cad48c26bf93..93e930dcf2e3 100644 --- a/libc/src/sched/linux/sched_setaffinity.cpp +++ b/libc/src/sched/linux/sched_setaffinity.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_setparam.cpp b/libc/src/sched/linux/sched_setparam.cpp index e78e78a707e0..7875d9e2f19b 100644 --- a/libc/src/sched/linux/sched_setparam.cpp +++ b/libc/src/sched/linux/sched_setparam.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_setscheduler.cpp b/libc/src/sched/linux/sched_setscheduler.cpp index b6b6f667b3f9..232e5a59b185 100644 --- a/libc/src/sched/linux/sched_setscheduler.cpp +++ b/libc/src/sched/linux/sched_setscheduler.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_yield.cpp b/libc/src/sched/linux/sched_yield.cpp index 3de9d0ba3571..c1e9168f34d0 100644 --- a/libc/src/sched/linux/sched_yield.cpp +++ b/libc/src/sched/linux/sched_yield.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
diff --git a/libc/src/search/hcreate.cpp b/libc/src/search/hcreate.cpp index ac816a902e22..68bdb29e51df 100644 --- a/libc/src/search/hcreate.cpp +++ b/libc/src/search/hcreate.cpp @@ -9,8 +9,8 @@ #include "src/search/hcreate.h" #include "src/__support/HashTable/randomness.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/search/hsearch/global.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/search/hcreate_r.cpp b/libc/src/search/hcreate_r.cpp index 17acd808c19a..c89be803b4e1 100644 --- a/libc/src/search/hcreate_r.cpp +++ b/libc/src/search/hcreate_r.cpp @@ -9,8 +9,8 @@ #include "src/search/hcreate_r.h" #include "src/__support/HashTable/randomness.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, hcreate_r, diff --git a/libc/src/search/hdestroy_r.cpp b/libc/src/search/hdestroy_r.cpp index 7eff5bb6fff9..ba5476098be2 100644 --- a/libc/src/search/hdestroy_r.cpp +++ b/libc/src/search/hdestroy_r.cpp @@ -8,8 +8,8 @@ #include "src/search/hdestroy_r.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, hdestroy_r, (struct hsearch_data * htab)) { diff --git a/libc/src/search/hsearch.cpp b/libc/src/search/hsearch.cpp index c18b5d3d7f54..034333d17057 100644 --- a/libc/src/search/hsearch.cpp +++ b/libc/src/search/hsearch.cpp @@ -9,8 +9,8 @@ #include "src/search/hsearch.h" #include "src/__support/HashTable/randomness.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/search/hsearch/global.h" namespace 
LIBC_NAMESPACE_DECL { diff --git a/libc/src/search/hsearch_r.cpp b/libc/src/search/hsearch_r.cpp index f93e608a190b..323001e1b103 100644 --- a/libc/src/search/hsearch_r.cpp +++ b/libc/src/search/hsearch_r.cpp @@ -8,8 +8,8 @@ #include "src/search/hsearch_r.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, hsearch_r, diff --git a/libc/src/signal/linux/kill.cpp b/libc/src/signal/linux/kill.cpp index ed117858f51e..0f5e88757acb 100644 --- a/libc/src/signal/linux/kill.cpp +++ b/libc/src/signal/linux/kill.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include diff --git a/libc/src/signal/linux/sigaction.cpp b/libc/src/signal/linux/sigaction.cpp index 65ec36741683..43a3e195474e 100644 --- a/libc/src/signal/linux/sigaction.cpp +++ b/libc/src/signal/linux/sigaction.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigaddset.cpp b/libc/src/signal/linux/sigaddset.cpp index 628883e13b88..2091e8b51453 100644 --- a/libc/src/signal/linux/sigaddset.cpp +++ b/libc/src/signal/linux/sigaddset.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigaltstack.cpp 
b/libc/src/signal/linux/sigaltstack.cpp index c19394cd1791..990b841c6d90 100644 --- a/libc/src/signal/linux/sigaltstack.cpp +++ b/libc/src/signal/linux/sigaltstack.cpp @@ -8,8 +8,8 @@ #include "src/signal/sigaltstack.h" #include "hdr/types/stack_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/__support/common.h" diff --git a/libc/src/signal/linux/sigdelset.cpp b/libc/src/signal/linux/sigdelset.cpp index 2e964051ebde..6fce0d7a6e14 100644 --- a/libc/src/signal/linux/sigdelset.cpp +++ b/libc/src/signal/linux/sigdelset.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigemptyset.cpp b/libc/src/signal/linux/sigemptyset.cpp index d347477695e6..034a9e2cbe15 100644 --- a/libc/src/signal/linux/sigemptyset.cpp +++ b/libc/src/signal/linux/sigemptyset.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/signal/sigemptyset.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/__support/common.h" diff --git a/libc/src/signal/linux/sigfillset.cpp b/libc/src/signal/linux/sigfillset.cpp index 3e9897a03bb7..f0b499093b31 100644 --- a/libc/src/signal/linux/sigfillset.cpp +++ b/libc/src/signal/linux/sigfillset.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git 
a/libc/src/signal/linux/sigprocmask.cpp b/libc/src/signal/linux/sigprocmask.cpp index 8838379ae5d3..af3c424c5f34 100644 --- a/libc/src/signal/linux/sigprocmask.cpp +++ b/libc/src/signal/linux/sigprocmask.cpp @@ -11,8 +11,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include // For syscall numbers. diff --git a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp index bb8504f655c4..9a575bd59163 100644 --- a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp @@ -11,8 +11,8 @@ #include "file_actions.h" #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp index 710063d52e74..1ad45ed942bb 100644 --- a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp @@ -11,8 +11,8 @@ #include "file_actions.h" #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp index 028d6e895f3c..9977fc2d0a21 100644 --- a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp @@ -11,8 +11,8 @@ #include "file_actions.h" #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp index 168118da249d..affd338005cf 100644 --- a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp @@ -12,8 +12,8 @@ #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/stdio/fopencookie.cpp b/libc/src/stdio/fopencookie.cpp index 9f5694e8e058..da8a132a4db6 100644 --- a/libc/src/stdio/fopencookie.cpp +++ b/libc/src/stdio/fopencookie.cpp @@ -14,8 +14,8 @@ #include "src/__support/CPP/new.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fclose.cpp b/libc/src/stdio/generic/fclose.cpp index 388407a58d41..902b4cf97237 100644 --- a/libc/src/stdio/generic/fclose.cpp +++ b/libc/src/stdio/generic/fclose.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fflush.cpp b/libc/src/stdio/generic/fflush.cpp index 5bdf71ad3594..d0271d9154c8 100644 --- a/libc/src/stdio/generic/fflush.cpp +++ b/libc/src/stdio/generic/fflush.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fgetc.cpp b/libc/src/stdio/generic/fgetc.cpp index 
aa6660ca180c..e65ce2fda49b 100644 --- a/libc/src/stdio/generic/fgetc.cpp +++ b/libc/src/stdio/generic/fgetc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fgetc_unlocked.cpp b/libc/src/stdio/generic/fgetc_unlocked.cpp index 34a27f1d1c42..5c07d4feb513 100644 --- a/libc/src/stdio/generic/fgetc_unlocked.cpp +++ b/libc/src/stdio/generic/fgetc_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fgets.cpp b/libc/src/stdio/generic/fgets.cpp index de6474087a14..e0ad9b6e2f56 100644 --- a/libc/src/stdio/generic/fgets.cpp +++ b/libc/src/stdio/generic/fgets.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fopen.cpp b/libc/src/stdio/generic/fopen.cpp index d6e418bacf37..57c85c2e54e1 100644 --- a/libc/src/stdio/generic/fopen.cpp +++ b/libc/src/stdio/generic/fopen.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fputc.cpp b/libc/src/stdio/generic/fputc.cpp index 54a38aeb2f1e..6639f0687c87 100644 --- a/libc/src/stdio/generic/fputc.cpp +++ b/libc/src/stdio/generic/fputc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fputs.cpp b/libc/src/stdio/generic/fputs.cpp index 8aef7683b3ce..621b40f63c91 100644 --- a/libc/src/stdio/generic/fputs.cpp +++ b/libc/src/stdio/generic/fputs.cpp @@ -11,8 +11,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fread.cpp b/libc/src/stdio/generic/fread.cpp index 3a04094ea8b4..1b576ec34688 100644 --- a/libc/src/stdio/generic/fread.cpp +++ b/libc/src/stdio/generic/fread.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fread_unlocked.cpp b/libc/src/stdio/generic/fread_unlocked.cpp index 151f43c6bbeb..257f1a212add 100644 --- a/libc/src/stdio/generic/fread_unlocked.cpp +++ b/libc/src/stdio/generic/fread_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fseek.cpp b/libc/src/stdio/generic/fseek.cpp index 21820da18542..99191e7c4194 100644 --- a/libc/src/stdio/generic/fseek.cpp +++ b/libc/src/stdio/generic/fseek.cpp @@ -9,8 +9,8 @@ #include "src/stdio/fseek.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fseeko.cpp 
b/libc/src/stdio/generic/fseeko.cpp index 7456b4a21907..afcfc71c7c09 100644 --- a/libc/src/stdio/generic/fseeko.cpp +++ b/libc/src/stdio/generic/fseeko.cpp @@ -9,8 +9,8 @@ #include "src/stdio/fseeko.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/ftell.cpp b/libc/src/stdio/generic/ftell.cpp index ec15ca4e96ca..b55a806007af 100644 --- a/libc/src/stdio/generic/ftell.cpp +++ b/libc/src/stdio/generic/ftell.cpp @@ -9,8 +9,8 @@ #include "src/stdio/ftell.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/ftello.cpp b/libc/src/stdio/generic/ftello.cpp index e3d0726ec484..91031cb7fad7 100644 --- a/libc/src/stdio/generic/ftello.cpp +++ b/libc/src/stdio/generic/ftello.cpp @@ -9,8 +9,8 @@ #include "src/stdio/ftello.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fwrite.cpp b/libc/src/stdio/generic/fwrite.cpp index 66eb9a3c7185..b44ecb283811 100644 --- a/libc/src/stdio/generic/fwrite.cpp +++ b/libc/src/stdio/generic/fwrite.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fwrite_unlocked.cpp b/libc/src/stdio/generic/fwrite_unlocked.cpp index a0d9014cd68d..2f9ec26f2f80 100644 --- a/libc/src/stdio/generic/fwrite_unlocked.cpp +++ b/libc/src/stdio/generic/fwrite_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include 
"hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getc.cpp b/libc/src/stdio/generic/getc.cpp index e988468898c5..0ac010ebc599 100644 --- a/libc/src/stdio/generic/getc.cpp +++ b/libc/src/stdio/generic/getc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getc_unlocked.cpp b/libc/src/stdio/generic/getc_unlocked.cpp index 92d5092623ac..eee23a18d05d 100644 --- a/libc/src/stdio/generic/getc_unlocked.cpp +++ b/libc/src/stdio/generic/getc_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getchar.cpp b/libc/src/stdio/generic/getchar.cpp index 371fc70eb214..87d24a2b1f09 100644 --- a/libc/src/stdio/generic/getchar.cpp +++ b/libc/src/stdio/generic/getchar.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getchar_unlocked.cpp b/libc/src/stdio/generic/getchar_unlocked.cpp index b898f5cb2596..f321969483e3 100644 --- a/libc/src/stdio/generic/getchar_unlocked.cpp +++ b/libc/src/stdio/generic/getchar_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git 
a/libc/src/stdio/generic/putc.cpp b/libc/src/stdio/generic/putc.cpp index b5f008fdce44..83bc3d4131e7 100644 --- a/libc/src/stdio/generic/putc.cpp +++ b/libc/src/stdio/generic/putc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/putchar.cpp b/libc/src/stdio/generic/putchar.cpp index e86df23d6716..2b3509e5e414 100644 --- a/libc/src/stdio/generic/putchar.cpp +++ b/libc/src/stdio/generic/putchar.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/puts.cpp b/libc/src/stdio/generic/puts.cpp index 7dbe2c79f920..4267dd546c4d 100644 --- a/libc/src/stdio/generic/puts.cpp +++ b/libc/src/stdio/generic/puts.cpp @@ -11,8 +11,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp index 5b8f01d7d534..9877817d9209 100644 --- a/libc/src/stdio/gpu/fprintf.cpp +++ b/libc/src/stdio/gpu/fprintf.cpp @@ -12,7 +12,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/arg_list.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/stdio/gpu/vfprintf_utils.h" #include diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp index 53fe69d5e2eb..8a9174d7397a 100644 --- a/libc/src/stdio/gpu/printf.cpp +++ b/libc/src/stdio/gpu/printf.cpp @@ -11,7 +11,7 @@ #include "src/__support/CPP/string_view.h" 
#include "src/__support/arg_list.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/stdio/gpu/vfprintf_utils.h" #include diff --git a/libc/src/stdio/linux/fdopen.cpp b/libc/src/stdio/linux/fdopen.cpp index 7d72fdc88e9f..5623f06b7cff 100644 --- a/libc/src/stdio/linux/fdopen.cpp +++ b/libc/src/stdio/linux/fdopen.cpp @@ -9,8 +9,8 @@ #include "src/stdio/fdopen.h" #include "src/__support/File/linux/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/linux/remove.cpp b/libc/src/stdio/linux/remove.cpp index dbb4491d0e6c..ac755db0bc78 100644 --- a/libc/src/stdio/linux/remove.cpp +++ b/libc/src/stdio/linux/remove.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" // For AT_* macros. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp index fbcb29be48f4..426c8698e557 100644 --- a/libc/src/stdio/linux/rename.cpp +++ b/libc/src/stdio/linux/rename.cpp @@ -10,8 +10,8 @@ #include "hdr/fcntl_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index 89556f1a9e5f..cef9b1ae58fa 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -25,7 +25,7 @@ #include "src/__support/fixed_point/fx_rep.h" #endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT #ifndef LIBC_COPT_PRINTF_DISABLE_STRERROR -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #endif // LIBC_COPT_PRINTF_DISABLE_STRERROR namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/setbuf.cpp b/libc/src/stdio/setbuf.cpp index f3db97de5837..fcc6df12ddb0 100644 --- a/libc/src/stdio/setbuf.cpp +++ b/libc/src/stdio/setbuf.cpp @@ -9,8 +9,8 @@ #include "src/stdio/setbuf.h" #include "hdr/stdio_macros.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/setvbuf.cpp b/libc/src/stdio/setvbuf.cpp index 0a6b8cacb59c..9fc6cb040233 100644 --- a/libc/src/stdio/setvbuf.cpp +++ b/libc/src/stdio/setvbuf.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atof.cpp b/libc/src/stdlib/atof.cpp index 18a65c67705d..d0d8d211dea8 100644 --- a/libc/src/stdlib/atof.cpp +++ b/libc/src/stdlib/atof.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atof.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atoi.cpp b/libc/src/stdlib/atoi.cpp index 9e46b53b1aa0..420bbc8143d5 100644 --- a/libc/src/stdlib/atoi.cpp +++ b/libc/src/stdlib/atoi.cpp @@ -8,9 
+8,9 @@ #include "src/stdlib/atoi.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atol.cpp b/libc/src/stdlib/atol.cpp index 7f3414a4afdd..e1110ffa449b 100644 --- a/libc/src/stdlib/atol.cpp +++ b/libc/src/stdlib/atol.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atol.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atoll.cpp b/libc/src/stdlib/atoll.cpp index 4f1a02ad8315..063e817f9b79 100644 --- a/libc/src/stdlib/atoll.cpp +++ b/libc/src/stdlib/atoll.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atoll.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtod.cpp b/libc/src/stdlib/strtod.cpp index 2c6819163aa4..deb2390c7fcd 100644 --- a/libc/src/stdlib/strtod.cpp +++ b/libc/src/stdlib/strtod.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtod.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtod_l.cpp b/libc/src/stdlib/strtod_l.cpp index 247314398315..ad333b32d240 100644 --- a/libc/src/stdlib/strtod_l.cpp +++ b/libc/src/stdlib/strtod_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtod_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtof.cpp b/libc/src/stdlib/strtof.cpp index 351bf64ad4f7..fc52dc85ffc5 100644 --- a/libc/src/stdlib/strtof.cpp +++ b/libc/src/stdlib/strtof.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtof.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtof_l.cpp b/libc/src/stdlib/strtof_l.cpp index d54efa66e084..c6e03ff51fa2 100644 --- a/libc/src/stdlib/strtof_l.cpp +++ b/libc/src/stdlib/strtof_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtof_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtol.cpp b/libc/src/stdlib/strtol.cpp index 77f8712d7c13..42db36b2052b 100644 --- a/libc/src/stdlib/strtol.cpp +++ b/libc/src/stdlib/strtol.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtol.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtol_l.cpp b/libc/src/stdlib/strtol_l.cpp index f94aff1a0d7b..497a4403eff4 100644 --- a/libc/src/stdlib/strtol_l.cpp +++ b/libc/src/stdlib/strtol_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtol_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtold.cpp b/libc/src/stdlib/strtold.cpp index 88d29c9f3627..44046c2c6f61 100644 --- a/libc/src/stdlib/strtold.cpp +++ 
b/libc/src/stdlib/strtold.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtold.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtold_l.cpp b/libc/src/stdlib/strtold_l.cpp index d0c57f50246b..c3af30a1b9ec 100644 --- a/libc/src/stdlib/strtold_l.cpp +++ b/libc/src/stdlib/strtold_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtold_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoll.cpp b/libc/src/stdlib/strtoll.cpp index 8d1b3efdcf87..c1dca13112e0 100644 --- a/libc/src/stdlib/strtoll.cpp +++ b/libc/src/stdlib/strtoll.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoll.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoll_l.cpp b/libc/src/stdlib/strtoll_l.cpp index e82971d59c48..6f30d7794c5c 100644 --- a/libc/src/stdlib/strtoll_l.cpp +++ b/libc/src/stdlib/strtoll_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoll_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoul.cpp b/libc/src/stdlib/strtoul.cpp index 1d832318c448..d26ca5e5a10a 100644 --- a/libc/src/stdlib/strtoul.cpp +++ b/libc/src/stdlib/strtoul.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoul.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include 
"src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoul_l.cpp b/libc/src/stdlib/strtoul_l.cpp index 74fce00a0ac3..9a875ddee902 100644 --- a/libc/src/stdlib/strtoul_l.cpp +++ b/libc/src/stdlib/strtoul_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoul_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoull.cpp b/libc/src/stdlib/strtoull.cpp index dba22611cfb0..8f929f577311 100644 --- a/libc/src/stdlib/strtoull.cpp +++ b/libc/src/stdlib/strtoull.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoull.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoull_l.cpp b/libc/src/stdlib/strtoull_l.cpp index 2ea8a43a40ef..9eb056b0e59b 100644 --- a/libc/src/stdlib/strtoull_l.cpp +++ b/libc/src/stdlib/strtoull_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoull_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strdup.cpp b/libc/src/string/strdup.cpp index 4cf4173a27bf..dab0ab4288c9 100644 --- a/libc/src/string/strdup.cpp +++ b/libc/src/string/strdup.cpp @@ -8,8 +8,8 @@ #include "src/string/strdup.h" #include "hdr/stdlib_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/string/allocating_string_utils.h" #include "src/string/memory_utils/inline_memcpy.h" diff --git 
a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp index 236fd25698f6..f3ae7c5c4e07 100644 --- a/libc/src/sys/auxv/linux/getauxval.cpp +++ b/libc/src/sys/auxv/linux/getauxval.cpp @@ -9,8 +9,8 @@ #include "src/sys/auxv/getauxval.h" #include "config/app.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // for guarded initialization diff --git a/libc/src/sys/epoll/linux/epoll_create.cpp b/libc/src/sys/epoll/linux/epoll_create.cpp index 7196ac7410c3..2e44e883ddf0 100644 --- a/libc/src/sys/epoll/linux/epoll_create.cpp +++ b/libc/src/sys/epoll/linux/epoll_create.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/epoll/linux/epoll_create1.cpp b/libc/src/sys/epoll/linux/epoll_create1.cpp index efff282e2714..3c60090fb7b4 100644 --- a/libc/src/sys/epoll/linux/epoll_create1.cpp +++ b/libc/src/sys/epoll/linux/epoll_create1.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/epoll/linux/epoll_ctl.cpp b/libc/src/sys/epoll/linux/epoll_ctl.cpp index 5f7dbb77b1e5..079bd60403b0 100644 --- a/libc/src/sys/epoll/linux/epoll_ctl.cpp +++ b/libc/src/sys/epoll/linux/epoll_ctl.cpp @@ -11,8 +11,8 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp index d7836549928c..24fd1dbdc467 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp @@ -13,9 +13,9 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp index 14b419399fe9..219984528efd 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp @@ -14,9 +14,9 @@ #include "hdr/types/struct_timespec.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp index 1a63be5e260f..7fae7b55992f 100644 --- a/libc/src/sys/epoll/linux/epoll_wait.cpp +++ b/libc/src/sys/epoll/linux/epoll_wait.cpp @@ -13,9 +13,9 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
diff --git a/libc/src/sys/mman/linux/madvise.cpp b/libc/src/sys/mman/linux/madvise.cpp index 332d6c2db4ac..1bb284f62b89 100644 --- a/libc/src/sys/mman/linux/madvise.cpp +++ b/libc/src/sys/mman/linux/madvise.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mincore.cpp b/libc/src/sys/mman/linux/mincore.cpp index b5436fda3853..d583f1ef85f3 100644 --- a/libc/src/sys/mman/linux/mincore.cpp +++ b/libc/src/sys/mman/linux/mincore.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mlock.cpp b/libc/src/sys/mman/linux/mlock.cpp index be7eb28e29c4..8582eb7c0063 100644 --- a/libc/src/sys/mman/linux/mlock.cpp +++ b/libc/src/sys/mman/linux/mlock.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mlock2.cpp b/libc/src/sys/mman/linux/mlock2.cpp index 7bc557f9bf58..955cfe128de7 100644 --- a/libc/src/sys/mman/linux/mlock2.cpp +++ b/libc/src/sys/mman/linux/mlock2.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mlockall.cpp b/libc/src/sys/mman/linux/mlockall.cpp index eae3a9ea0a18..c3502fbb3af3 100644 --- a/libc/src/sys/mman/linux/mlockall.cpp +++ b/libc/src/sys/mman/linux/mlockall.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mmap.cpp b/libc/src/sys/mman/linux/mmap.cpp index ee9a0a32e8f5..33f9fe8ff370 100644 --- a/libc/src/sys/mman/linux/mmap.cpp +++ b/libc/src/sys/mman/linux/mmap.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For EXEC_PAGESIZE. #include // For syscall numbers. diff --git a/libc/src/sys/mman/linux/mprotect.cpp b/libc/src/sys/mman/linux/mprotect.cpp index e2351028e2c7..6b14915b60c9 100644 --- a/libc/src/sys/mman/linux/mprotect.cpp +++ b/libc/src/sys/mman/linux/mprotect.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mremap.cpp b/libc/src/sys/mman/linux/mremap.cpp index 38bcfce833d3..6cdda9435bb6 100644 --- a/libc/src/sys/mman/linux/mremap.cpp +++ b/libc/src/sys/mman/linux/mremap.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For EXEC_PAGESIZE. #include #include // For syscall numbers. diff --git a/libc/src/sys/mman/linux/msync.cpp b/libc/src/sys/mman/linux/msync.cpp index e2b4f81d616a..650678bcb36e 100644 --- a/libc/src/sys/mman/linux/msync.cpp +++ b/libc/src/sys/mman/linux/msync.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/munlock.cpp b/libc/src/sys/mman/linux/munlock.cpp index 93c25f844c6e..9638949f5fcb 100644 --- a/libc/src/sys/mman/linux/munlock.cpp +++ b/libc/src/sys/mman/linux/munlock.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/munlockall.cpp b/libc/src/sys/mman/linux/munlockall.cpp index f5911cb01bc2..f47eaece178e 100644 --- a/libc/src/sys/mman/linux/munlockall.cpp +++ b/libc/src/sys/mman/linux/munlockall.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/munmap.cpp b/libc/src/sys/mman/linux/munmap.cpp index 9c01b15ac8dc..61b1f1549dd1 100644 --- a/libc/src/sys/mman/linux/munmap.cpp +++ b/libc/src/sys/mman/linux/munmap.cpp @@ -11,9 +11,9 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" -#include // For syscall numbers. +#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/remap_file_pages.cpp b/libc/src/sys/mman/linux/remap_file_pages.cpp index f616e1915ecc..58ae4017f628 100644 --- a/libc/src/sys/mman/linux/remap_file_pages.cpp +++ b/libc/src/sys/mman/linux/remap_file_pages.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h index ce75c2b5b699..69911012ff7e 100644 --- a/libc/src/sys/mman/linux/shm_common.h +++ b/libc/src/sys/mman/linux/shm_common.h @@ -9,8 +9,8 @@ #include "src/__support/CPP/array.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/string/memory_utils/inline_memcpy.h" // TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121 diff --git a/libc/src/sys/prctl/linux/prctl.cpp b/libc/src/sys/prctl/linux/prctl.cpp index 5d4e9046b877..c726b0a53959 100644 --- a/libc/src/sys/prctl/linux/prctl.cpp +++ b/libc/src/sys/prctl/linux/prctl.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/random/linux/getrandom.cpp b/libc/src/sys/random/linux/getrandom.cpp index 9a8869a2d6d3..0b8471ed8b37 100644 --- a/libc/src/sys/random/linux/getrandom.cpp +++ b/libc/src/sys/random/linux/getrandom.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/resource/linux/getrlimit.cpp b/libc/src/sys/resource/linux/getrlimit.cpp index 30c2e91b036d..d27213419494 100644 --- a/libc/src/sys/resource/linux/getrlimit.cpp +++ b/libc/src/sys/resource/linux/getrlimit.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For struct rlimit #include // For syscall numbers. diff --git a/libc/src/sys/resource/linux/setrlimit.cpp b/libc/src/sys/resource/linux/setrlimit.cpp index 85f07900aaef..300bad75baa6 100644 --- a/libc/src/sys/resource/linux/setrlimit.cpp +++ b/libc/src/sys/resource/linux/setrlimit.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For struct rlimit #include // For syscall numbers. diff --git a/libc/src/sys/select/linux/select.cpp b/libc/src/sys/select/linux/select.cpp index 9ccb1e95f275..6c434eb58459 100644 --- a/libc/src/sys/select/linux/select.cpp +++ b/libc/src/sys/select/linux/select.cpp @@ -13,8 +13,8 @@ #include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For size_t #include // For syscall numbers. diff --git a/libc/src/sys/sendfile/linux/sendfile.cpp b/libc/src/sys/sendfile/linux/sendfile.cpp index 9d4174cb8c91..ec892323def5 100644 --- a/libc/src/sys/sendfile/linux/sendfile.cpp +++ b/libc/src/sys/sendfile/linux/sendfile.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/socket/linux/bind.cpp b/libc/src/sys/socket/linux/bind.cpp index 72a3307a91dd..83a3d06f5380 100644 --- a/libc/src/sys/socket/linux/bind.cpp +++ b/libc/src/sys/socket/linux/bind.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For SYS_SOCKET socketcall number. #include // For syscall numbers. diff --git a/libc/src/sys/socket/linux/recv.cpp b/libc/src/sys/socket/linux/recv.cpp index 5e9f2d3233fc..baf4de1b5eb5 100644 --- a/libc/src/sys/socket/linux/recv.cpp +++ b/libc/src/sys/socket/linux/recv.cpp @@ -16,8 +16,8 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/recvfrom.cpp b/libc/src/sys/socket/linux/recvfrom.cpp index 574e65f64a54..3d8397b478cc 100644 --- a/libc/src/sys/socket/linux/recvfrom.cpp +++ b/libc/src/sys/socket/linux/recvfrom.cpp @@ -16,8 +16,8 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/recvmsg.cpp b/libc/src/sys/socket/linux/recvmsg.cpp index e42b6346f330..bc6d072dbf9a 100644 --- a/libc/src/sys/socket/linux/recvmsg.cpp +++ b/libc/src/sys/socket/linux/recvmsg.cpp @@ -15,8 +15,8 @@ #include "hdr/types/struct_msghdr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/send.cpp b/libc/src/sys/socket/linux/send.cpp index cb3b4d5a9ece..43b01e7e6e0f 100644 --- a/libc/src/sys/socket/linux/send.cpp +++ b/libc/src/sys/socket/linux/send.cpp @@ -16,7 +16,7 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/sendmsg.cpp b/libc/src/sys/socket/linux/sendmsg.cpp index b4d9c9deda02..b04783ebfe7e 100644 --- a/libc/src/sys/socket/linux/sendmsg.cpp +++ b/libc/src/sys/socket/linux/sendmsg.cpp @@ -15,7 +15,7 @@ #include "hdr/types/struct_msghdr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/sendto.cpp b/libc/src/sys/socket/linux/sendto.cpp index 2fada192b086..9dda127f872d 100644 --- a/libc/src/sys/socket/linux/sendto.cpp +++ b/libc/src/sys/socket/linux/sendto.cpp @@ -16,7 +16,7 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/socket.cpp b/libc/src/sys/socket/linux/socket.cpp index 3e6df4d487a5..69eb6cfa01ce 100644 --- a/libc/src/sys/socket/linux/socket.cpp +++ b/libc/src/sys/socket/linux/socket.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For SYS_SOCKET socketcall number. #include // For syscall numbers. diff --git a/libc/src/sys/socket/linux/socketpair.cpp b/libc/src/sys/socket/linux/socketpair.cpp index 60612ac04d61..7ea8ca46cee5 100644 --- a/libc/src/sys/socket/linux/socketpair.cpp +++ b/libc/src/sys/socket/linux/socketpair.cpp @@ -10,9 +10,9 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For SYS_SOCKET socketcall number. #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/chmod.cpp b/libc/src/sys/stat/linux/chmod.cpp index 1b787e47e7c6..2bd0788ec1df 100644 --- a/libc/src/sys/stat/linux/chmod.cpp +++ b/libc/src/sys/stat/linux/chmod.cpp @@ -13,8 +13,8 @@ #include "hdr/fcntl_macros.h" #include "hdr/types/mode_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/fchmod.cpp b/libc/src/sys/stat/linux/fchmod.cpp index 0d6fd359169a..3dadfdd1d943 100644 --- a/libc/src/sys/stat/linux/fchmod.cpp +++ b/libc/src/sys/stat/linux/fchmod.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/types/mode_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/fchmodat.cpp b/libc/src/sys/stat/linux/fchmodat.cpp index e76db4d160fb..add2192a558a 100644 --- a/libc/src/sys/stat/linux/fchmodat.cpp +++ b/libc/src/sys/stat/linux/fchmodat.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/sys/stat/linux/fstat.cpp b/libc/src/sys/stat/linux/fstat.cpp index 35cf8f08f782..dea002c5e12a 100644 --- a/libc/src/sys/stat/linux/fstat.cpp +++ b/libc/src/sys/stat/linux/fstat.cpp @@ -8,8 +8,8 @@ #include "src/sys/stat/fstat.h" #include "kernel_statx.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/common.h" diff --git a/libc/src/sys/stat/linux/lstat.cpp b/libc/src/sys/stat/linux/lstat.cpp index 354c5b6e029a..5601dd5d78a9 100644 --- a/libc/src/sys/stat/linux/lstat.cpp +++ b/libc/src/sys/stat/linux/lstat.cpp @@ -8,8 +8,8 @@ #include "src/sys/stat/lstat.h" #include "kernel_statx.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" diff --git a/libc/src/sys/stat/linux/mkdir.cpp b/libc/src/sys/stat/linux/mkdir.cpp index b319b5c8393d..0829ff4f9432 100644 --- a/libc/src/sys/stat/linux/mkdir.cpp +++ b/libc/src/sys/stat/linux/mkdir.cpp @@ -13,8 +13,8 @@ #include "hdr/fcntl_macros.h" #include "hdr/types/mode_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/mkdirat.cpp b/libc/src/sys/stat/linux/mkdirat.cpp index 097fc158010d..8f4194dc3275 100644 --- a/libc/src/sys/stat/linux/mkdirat.cpp +++ b/libc/src/sys/stat/linux/mkdirat.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/sys/stat/linux/stat.cpp b/libc/src/sys/stat/linux/stat.cpp index de9cdb197d68..5553eaf00be2 100644 --- a/libc/src/sys/stat/linux/stat.cpp +++ b/libc/src/sys/stat/linux/stat.cpp @@ -8,8 +8,8 @@ #include "src/sys/stat/stat.h" #include "kernel_statx.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/common.h" diff --git a/libc/src/sys/statvfs/linux/statfs_utils.h b/libc/src/sys/statvfs/linux/statfs_utils.h index 1e5be5153101..8ee4de288ef6 100644 --- a/libc/src/sys/statvfs/linux/statfs_utils.h +++ b/libc/src/sys/statvfs/linux/statfs_utils.h @@ -12,9 +12,9 @@ #include "include/llvm-libc-types/struct_statvfs.h" #include "src/__support/CPP/optional.h" #include "src/__support/OSUtil/syscall.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/time/linux/getitimer.cpp b/libc/src/sys/time/linux/getitimer.cpp index fec06aa4086e..b87406679694 100644 --- a/libc/src/sys/time/linux/getitimer.cpp +++ b/libc/src/sys/time/linux/getitimer.cpp @@ -10,7 +10,7 @@ #include "hdr/types/struct_itimerval.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/time/linux/setitimer.cpp b/libc/src/sys/time/linux/setitimer.cpp index def04a474011..1de0d4329776 100644 --- a/libc/src/sys/time/linux/setitimer.cpp +++ b/libc/src/sys/time/linux/setitimer.cpp @@ -9,7 +9,7 @@ #include "hdr/types/struct_itimerval.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/time/linux/utimes.cpp 
b/libc/src/sys/time/linux/utimes.cpp index 76b69937a5f4..ed37b42aedf6 100644 --- a/libc/src/sys/time/linux/utimes.cpp +++ b/libc/src/sys/time/linux/utimes.cpp @@ -15,7 +15,7 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include diff --git a/libc/src/sys/uio/linux/readv.cpp b/libc/src/sys/uio/linux/readv.cpp index f1393a9749be..c9d8d87ddc72 100644 --- a/libc/src/sys/uio/linux/readv.cpp +++ b/libc/src/sys/uio/linux/readv.cpp @@ -10,7 +10,7 @@ #include "hdr/types/struct_iovec.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/uio/linux/writev.cpp b/libc/src/sys/uio/linux/writev.cpp index 8992bed95c98..b0b9e1520792 100644 --- a/libc/src/sys/uio/linux/writev.cpp +++ b/libc/src/sys/uio/linux/writev.cpp @@ -10,7 +10,7 @@ #include "hdr/types/struct_iovec.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/utsname/linux/uname.cpp b/libc/src/sys/utsname/linux/uname.cpp index 7bb227e801e3..b47ba964faf0 100644 --- a/libc/src/sys/utsname/linux/uname.cpp +++ b/libc/src/sys/utsname/linux/uname.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
#include diff --git a/libc/src/sys/wait/wait4Impl.h b/libc/src/sys/wait/wait4Impl.h index f2bdeb02f866..77ed3ad22f14 100644 --- a/libc/src/sys/wait/wait4Impl.h +++ b/libc/src/sys/wait/wait4Impl.h @@ -12,8 +12,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/termios/linux/cfsetispeed.cpp b/libc/src/termios/linux/cfsetispeed.cpp index 9656b714a8ed..47b19974d21b 100644 --- a/libc/src/termios/linux/cfsetispeed.cpp +++ b/libc/src/termios/linux/cfsetispeed.cpp @@ -9,8 +9,8 @@ #include "src/termios/cfsetispeed.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/termios/linux/cfsetospeed.cpp b/libc/src/termios/linux/cfsetospeed.cpp index 6130d266dbff..d2f138257a47 100644 --- a/libc/src/termios/linux/cfsetospeed.cpp +++ b/libc/src/termios/linux/cfsetospeed.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/termios/cfsetospeed.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/common.h" diff --git a/libc/src/termios/linux/tcdrain.cpp b/libc/src/termios/linux/tcdrain.cpp index 116e3f0e0cbc..570b15c24fe7 100644 --- a/libc/src/termios/linux/tcdrain.cpp +++ b/libc/src/termios/linux/tcdrain.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. 
#include // For syscall numbers diff --git a/libc/src/termios/linux/tcflow.cpp b/libc/src/termios/linux/tcflow.cpp index d229230b5d13..714ef6aa7129 100644 --- a/libc/src/termios/linux/tcflow.cpp +++ b/libc/src/termios/linux/tcflow.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcflush.cpp b/libc/src/termios/linux/tcflush.cpp index 028a5414b196..4c7b9fadc446 100644 --- a/libc/src/termios/linux/tcflush.cpp +++ b/libc/src/termios/linux/tcflush.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcgetattr.cpp b/libc/src/termios/linux/tcgetattr.cpp index 63c096ff88eb..2e768269c874 100644 --- a/libc/src/termios/linux/tcgetattr.cpp +++ b/libc/src/termios/linux/tcgetattr.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. 
#include // For syscall numbers diff --git a/libc/src/termios/linux/tcgetsid.cpp b/libc/src/termios/linux/tcgetsid.cpp index c283d0e4fda9..7487816cf274 100644 --- a/libc/src/termios/linux/tcgetsid.cpp +++ b/libc/src/termios/linux/tcgetsid.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcsendbreak.cpp b/libc/src/termios/linux/tcsendbreak.cpp index 30bc91cf3de0..1d546c1d5953 100644 --- a/libc/src/termios/linux/tcsendbreak.cpp +++ b/libc/src/termios/linux/tcsendbreak.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcsetattr.cpp b/libc/src/termios/linux/tcsetattr.cpp index 8aa1e5c57b34..8a2c7290217b 100644 --- a/libc/src/termios/linux/tcsetattr.cpp +++ b/libc/src/termios/linux/tcsetattr.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. 
#include // For syscall numbers diff --git a/libc/src/threads/thrd_create.cpp b/libc/src/threads/thrd_create.cpp index 4680944c2eee..67e22e72fd0e 100644 --- a/libc/src/threads/thrd_create.cpp +++ b/libc/src/threads/thrd_create.cpp @@ -8,9 +8,9 @@ #include "src/threads/thrd_create.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include // For thrd_* type definitions. diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp index ee4fa82b4f89..c38697cd0668 100644 --- a/libc/src/time/linux/clock.cpp +++ b/libc/src/time/linux/clock.cpp @@ -10,10 +10,10 @@ #include "hdr/time_macros.h" #include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" #include "src/__support/time/units.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp index 743c644d65d0..b3fcd2b22f9d 100644 --- a/libc/src/time/linux/clock_gettime.cpp +++ b/libc/src/time/linux/clock_gettime.cpp @@ -8,9 +8,9 @@ #include "src/time/clock_gettime.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/linux/gettimeofday.cpp b/libc/src/time/linux/gettimeofday.cpp index e8ddf482fc98..237b05903c70 100644 --- a/libc/src/time/linux/gettimeofday.cpp +++ b/libc/src/time/linux/gettimeofday.cpp @@ -10,10 +10,10 @@ #include "hdr/time_macros.h" #include "hdr/types/suseconds_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include 
"src/__support/time/clock_gettime.h" #include "src/__support/time/units.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp index 7a856376ffb2..6b9704126a0a 100644 --- a/libc/src/time/linux/nanosleep.cpp +++ b/libc/src/time/linux/nanosleep.cpp @@ -10,8 +10,8 @@ #include "hdr/time_macros.h" #include "src/__support/OSUtil/syscall.h" // For syscall functions. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For int64_t. #include // For syscall numbers. diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp index cf5174523aa4..a4d437233273 100644 --- a/libc/src/time/linux/timespec_get.cpp +++ b/libc/src/time/linux/timespec_get.cpp @@ -9,9 +9,9 @@ #include "src/time/timespec_get.h" #include "hdr/time_macros.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp index 860909af7488..2a81f0182c31 100644 --- a/libc/src/time/time.cpp +++ b/libc/src/time/time.cpp @@ -10,9 +10,9 @@ #include "hdr/time_macros.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { // avoid inconsitent clang-format behavior diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h index bbbb1c08a475..0541c24ece82 100644 --- a/libc/src/time/time_utils.h +++ b/libc/src/time/time_utils.h @@ -15,8 +15,8 @@ #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" #include "src/__support/common.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "time_constants.h" #include diff --git a/libc/src/time/windows/clock_getres.cpp b/libc/src/time/windows/clock_getres.cpp index b8c0c82aa641..969bb66be2d2 100644 --- a/libc/src/time/windows/clock_getres.cpp +++ b/libc/src/time/windows/clock_getres.cpp @@ -13,10 +13,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/optimization.h" #include "src/__support/time/units.h" #include "src/__support/time/windows/performance_counter.h" -#include "src/errno/libc_errno.h" #include "src/time/clock_getres.h" #define WIN32_LEAN_AND_MEAN diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp index 2f7ebbcdf9e8..55cd6adca779 100644 --- a/libc/src/unistd/linux/access.cpp +++ b/libc/src/unistd/linux/access.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/chdir.cpp b/libc/src/unistd/linux/chdir.cpp index a30d1dc883be..04ba509b49a5 100644 --- a/libc/src/unistd/linux/chdir.cpp +++ b/libc/src/unistd/linux/chdir.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/close.cpp b/libc/src/unistd/linux/close.cpp index 58d42a9673fb..b5842f2b64d2 100644 --- a/libc/src/unistd/linux/close.cpp +++ b/libc/src/unistd/linux/close.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/dup.cpp b/libc/src/unistd/linux/dup.cpp index c1710a37f611..81d30c6cdbc4 100644 --- a/libc/src/unistd/linux/dup.cpp +++ b/libc/src/unistd/linux/dup.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp index 7ffc151a053c..0a0e86573b34 100644 --- a/libc/src/unistd/linux/dup2.cpp +++ b/libc/src/unistd/linux/dup2.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/dup3.cpp b/libc/src/unistd/linux/dup3.cpp index c096ba73c96b..770fb73515b2 100644 --- a/libc/src/unistd/linux/dup3.cpp +++ b/libc/src/unistd/linux/dup3.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/execv.cpp b/libc/src/unistd/linux/execv.cpp index a3f2525ed7ca..d4f2bd9a5165 100644 --- a/libc/src/unistd/linux/execv.cpp +++ b/libc/src/unistd/linux/execv.cpp @@ -13,7 +13,7 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/execve.cpp b/libc/src/unistd/linux/execve.cpp index 37162c412178..2214b6df493b 100644 --- a/libc/src/unistd/linux/execve.cpp +++ b/libc/src/unistd/linux/execve.cpp @@ -13,7 +13,7 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/fchdir.cpp b/libc/src/unistd/linux/fchdir.cpp index 8196dc63ab1e..f7a7422363e6 100644 --- a/libc/src/unistd/linux/fchdir.cpp +++ b/libc/src/unistd/linux/fchdir.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/fork.cpp b/libc/src/unistd/linux/fork.cpp index 8aa0477a15d5..75a76fdea50b 100644 --- a/libc/src/unistd/linux/fork.cpp +++ b/libc/src/unistd/linux/fork.cpp @@ -15,7 +15,7 @@ #include "src/__support/threads/identifier.h" #include "src/__support/threads/thread.h" // For thread self object -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For SIGCHLD #include // For syscall numbers. diff --git a/libc/src/unistd/linux/fsync.cpp b/libc/src/unistd/linux/fsync.cpp index ae3895bab15f..fe08aed61e25 100644 --- a/libc/src/unistd/linux/fsync.cpp +++ b/libc/src/unistd/linux/fsync.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/ftruncate.cpp b/libc/src/unistd/linux/ftruncate.cpp index ccbb0634664a..f6aa6f8b48cc 100644 --- a/libc/src/unistd/linux/ftruncate.cpp +++ b/libc/src/unistd/linux/ftruncate.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/unistd_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For uint64_t. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/getcwd.cpp b/libc/src/unistd/linux/getcwd.cpp index 1bb11a7c8e7b..c0e475dd3e8f 100644 --- a/libc/src/unistd/linux/getcwd.cpp +++ b/libc/src/unistd/linux/getcwd.cpp @@ -13,7 +13,7 @@ #include "src/__support/macros/config.h" #include "src/string/allocating_string_utils.h" // For strdup. -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // This is safe to include without any name pollution. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/getentropy.cpp b/libc/src/unistd/linux/getentropy.cpp index 168a1197734e..65bcbf27601d 100644 --- a/libc/src/unistd/linux/getentropy.cpp +++ b/libc/src/unistd/linux/getentropy.cpp @@ -10,7 +10,7 @@ #include "hdr/errno_macros.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/getsid.cpp b/libc/src/unistd/linux/getsid.cpp index 5977c5bf10e9..025b8d1691ac 100644 --- a/libc/src/unistd/linux/getsid.cpp +++ b/libc/src/unistd/linux/getsid.cpp @@ -11,8 +11,8 @@ #include "hdr/types/pid_t.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/isatty.cpp b/libc/src/unistd/linux/isatty.cpp index e6ea22a714c7..a4d17912b57b 100644 --- a/libc/src/unistd/linux/isatty.cpp +++ b/libc/src/unistd/linux/isatty.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For ioctl numbers. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/link.cpp b/libc/src/unistd/linux/link.cpp index 477806a70df7..205cf8a84a5c 100644 --- a/libc/src/unistd/linux/link.cpp +++ b/libc/src/unistd/linux/link.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/linkat.cpp b/libc/src/unistd/linux/linkat.cpp index 40f68cc90c48..ea5bc48cbedc 100644 --- a/libc/src/unistd/linux/linkat.cpp +++ b/libc/src/unistd/linux/linkat.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/lseek.cpp b/libc/src/unistd/linux/lseek.cpp index 0e957498da74..26a08269fd8d 100644 --- a/libc/src/unistd/linux/lseek.cpp +++ b/libc/src/unistd/linux/lseek.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/unistd/lseek.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/File/linux/lseekImpl.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. diff --git a/libc/src/unistd/linux/pathconf.cpp b/libc/src/unistd/linux/pathconf.cpp index ca1c10bb9f7f..7dde857c1cfd 100644 --- a/libc/src/unistd/linux/pathconf.cpp +++ b/libc/src/unistd/linux/pathconf.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/unistd/pathconf.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/statvfs/linux/statfs_utils.h" #include "src/unistd/linux/pathconf_utils.h" diff --git a/libc/src/unistd/linux/pathconf_utils.cpp b/libc/src/unistd/linux/pathconf_utils.cpp index 035e628dff25..9a62e31fd188 100644 --- a/libc/src/unistd/linux/pathconf_utils.cpp +++ b/libc/src/unistd/linux/pathconf_utils.cpp @@ -14,8 +14,8 @@ #include "hdr/unistd_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/statvfs/linux/statfs_utils.h" // other linux specific includes diff --git a/libc/src/unistd/linux/pipe.cpp b/libc/src/unistd/linux/pipe.cpp index dfcd5bfdaf53..b9943c833805 100644 --- a/libc/src/unistd/linux/pipe.cpp +++ b/libc/src/unistd/linux/pipe.cpp @@ -10,10 +10,10 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON -#include "src/errno/libc_errno.h" -#include // For syscall numbers. +#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/pipe2.cpp b/libc/src/unistd/linux/pipe2.cpp index ebe7e0114ae9..d30f3b37a1ad 100644 --- a/libc/src/unistd/linux/pipe2.cpp +++ b/libc/src/unistd/linux/pipe2.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/pread.cpp b/libc/src/unistd/linux/pread.cpp index 3e27857f9a2b..2f86e397feef 100644 --- a/libc/src/unistd/linux/pread.cpp +++ b/libc/src/unistd/linux/pread.cpp @@ -10,11 +10,11 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON -#include "src/errno/libc_errno.h" -#include // For uint64_t. -#include // For syscall numbers. +#include // For uint64_t. +#include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/pwrite.cpp b/libc/src/unistd/linux/pwrite.cpp index 1b81b2a05949..f4cf8e16d766 100644 --- a/libc/src/unistd/linux/pwrite.cpp +++ b/libc/src/unistd/linux/pwrite.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For uint64_t. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/read.cpp b/libc/src/unistd/linux/read.cpp index 4419900f2330..55676f3f7010 100644 --- a/libc/src/unistd/linux/read.cpp +++ b/libc/src/unistd/linux/read.cpp @@ -10,10 +10,10 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON -#include "src/errno/libc_errno.h" -#include // For syscall numbers. +#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/readlink.cpp b/libc/src/unistd/linux/readlink.cpp index 2055e6b3400f..b297a41ca37b 100644 --- a/libc/src/unistd/linux/readlink.cpp +++ b/libc/src/unistd/linux/readlink.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/readlinkat.cpp b/libc/src/unistd/linux/readlinkat.cpp index e5e4d0d39bc9..cd0dcb8e0ff0 100644 --- a/libc/src/unistd/linux/readlinkat.cpp +++ b/libc/src/unistd/linux/readlinkat.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/rmdir.cpp b/libc/src/unistd/linux/rmdir.cpp index 075af12af64c..eca6e954ef89 100644 --- a/libc/src/unistd/linux/rmdir.cpp +++ b/libc/src/unistd/linux/rmdir.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/symlink.cpp b/libc/src/unistd/linux/symlink.cpp index 9e1b2886ea0f..3f43de19d2f4 100644 --- a/libc/src/unistd/linux/symlink.cpp +++ b/libc/src/unistd/linux/symlink.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/symlinkat.cpp b/libc/src/unistd/linux/symlinkat.cpp index bcf2d0f8cc05..8cee172f39df 100644 --- a/libc/src/unistd/linux/symlinkat.cpp +++ b/libc/src/unistd/linux/symlinkat.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/syscall.cpp b/libc/src/unistd/linux/syscall.cpp index 5394bff46adf..0f7b3da88d62 100644 --- a/libc/src/unistd/linux/syscall.cpp +++ b/libc/src/unistd/linux/syscall.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/sysconf.cpp b/libc/src/unistd/linux/sysconf.cpp index f785ff321c7d..03f224b15027 100644 --- a/libc/src/unistd/linux/sysconf.cpp +++ b/libc/src/unistd/linux/sysconf.cpp @@ -11,8 +11,8 @@ #include "src/__support/common.h" #include "hdr/unistd_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/auxv/getauxval.h" #include diff --git a/libc/src/unistd/linux/truncate.cpp b/libc/src/unistd/linux/truncate.cpp index 8236edb480d1..6103d4b51350 100644 --- a/libc/src/unistd/linux/truncate.cpp +++ b/libc/src/unistd/linux/truncate.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/unistd_macros.h" #include // For uint64_t. diff --git a/libc/src/unistd/linux/unlink.cpp b/libc/src/unistd/linux/unlink.cpp index 72d8e2398e3d..5fde2600937b 100644 --- a/libc/src/unistd/linux/unlink.cpp +++ b/libc/src/unistd/linux/unlink.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/unlinkat.cpp b/libc/src/unistd/linux/unlinkat.cpp index 4ed20f542f17..b2012c52b885 100644 --- a/libc/src/unistd/linux/unlinkat.cpp +++ b/libc/src/unistd/linux/unlinkat.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/write.cpp b/libc/src/unistd/linux/write.cpp index 99d5ab7e480b..eecb74429182 100644 --- a/libc/src/unistd/linux/write.cpp +++ b/libc/src/unistd/linux/write.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/unistd/windows/getentropy.cpp b/libc/src/unistd/windows/getentropy.cpp index bfaec723ac63..e25a7a8fed40 100644 --- a/libc/src/unistd/windows/getentropy.cpp +++ b/libc/src/unistd/windows/getentropy.cpp @@ -9,7 +9,7 @@ #include "src/unistd/getentropy.h" #include "hdr/errno_macros.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #define WIN32_LEAN_AND_MEAN #include diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h index 5be66d9edff0..24c007d2e12e 100644 --- a/libc/test/IntegrationTest/test.h +++ b/libc/test/IntegrationTest/test.h @@ -68,12 +68,9 @@ //////////////////////////////////////////////////////////////////////////////// // Errno checks. 
-#define ASSERT_ERRNO_EQ(VAL) \ - ASSERT_EQ(VAL, static_cast(LIBC_NAMESPACE::libc_errno)) -#define ASSERT_ERRNO_SUCCESS() \ - ASSERT_EQ(0, static_cast(LIBC_NAMESPACE::libc_errno)) -#define ASSERT_ERRNO_FAILURE() \ - ASSERT_NE(0, static_cast(LIBC_NAMESPACE::libc_errno)) +#define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast(libc_errno)) +#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) +#define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast(libc_errno)) // Integration tests are compiled with -ffreestanding which stops treating // the main function as a non-overloadable special function. Hence, we use a diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h index 3d3b72f80544..4b7ff452f409 100644 --- a/libc/test/UnitTest/ErrnoCheckingTest.h +++ b/libc/test/UnitTest/ErrnoCheckingTest.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H #define LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" namespace LIBC_NAMESPACE_DECL { @@ -25,7 +25,7 @@ class ErrnoCheckingTest : public Test { public: void SetUp() override { Test::SetUp(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } void TearDown() override { diff --git a/libc/test/UnitTest/ErrnoSetterMatcher.h b/libc/test/UnitTest/ErrnoSetterMatcher.h index c6eadd25858e..212b7a8f83e7 100644 --- a/libc/test/UnitTest/ErrnoSetterMatcher.h +++ b/libc/test/UnitTest/ErrnoSetterMatcher.h @@ -12,9 +12,9 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/fpbits_str.h" #include "src/__support/StringUtil/error_to_string.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" namespace LIBC_NAMESPACE_DECL { @@ -114,8 +114,8 @@ public: bool 
match(T got) { actual_return = got; - actual_errno = LIBC_NAMESPACE::libc_errno; - LIBC_NAMESPACE::libc_errno = 0; + actual_errno = libc_errno; + libc_errno = 0; if constexpr (ignore_errno()) return return_cmp.compare(actual_return); else diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 21b8a45b0726..da15cf2907f7 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -279,8 +279,8 @@ private: #define EXPECT_MATH_ERRNO(expected) \ do { \ if (math_errhandling & MATH_ERRNO) { \ - int actual = LIBC_NAMESPACE::libc_errno; \ - LIBC_NAMESPACE::libc_errno = 0; \ + int actual = libc_errno; \ + libc_errno = 0; \ EXPECT_EQ(actual, expected); \ } \ } while (0) @@ -288,8 +288,8 @@ private: #define ASSERT_MATH_ERRNO(expected) \ do { \ if (math_errhandling & MATH_ERRNO) { \ - int actual = LIBC_NAMESPACE::libc_errno; \ - LIBC_NAMESPACE::libc_errno = 0; \ + int actual = libc_errno; \ + libc_errno = 0; \ ASSERT_EQ(actual, expected); \ } \ } while (0) diff --git a/libc/test/UnitTest/Test.h b/libc/test/UnitTest/Test.h index 95d48f40914e..a5a2a3c7cf58 100644 --- a/libc/test/UnitTest/Test.h +++ b/libc/test/UnitTest/Test.h @@ -42,15 +42,14 @@ #define ASSERT_ERRNO_EQ(VAL) \ do { \ - ASSERT_EQ(VAL, static_cast(LIBC_NAMESPACE::libc_errno)); \ - LIBC_NAMESPACE::libc_errno = 0; \ + ASSERT_EQ(VAL, static_cast(libc_errno)); \ + libc_errno = 0; \ } while (0) -#define ASSERT_ERRNO_SUCCESS() \ - ASSERT_EQ(0, static_cast(LIBC_NAMESPACE::libc_errno)) +#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) #define ASSERT_ERRNO_FAILURE() \ do { \ - ASSERT_NE(0, static_cast(LIBC_NAMESPACE::libc_errno)); \ - LIBC_NAMESPACE::libc_errno = 0; \ + ASSERT_NE(0, static_cast(libc_errno)); \ + libc_errno = 0; \ } while (0) #endif // LLVM_LIBC_TEST_UNITTEST_TEST_H diff --git a/libc/test/integration/src/pthread/pthread_create_test.cpp b/libc/test/integration/src/pthread/pthread_create_test.cpp index 29da4d5c3c8d..aecbad6514aa 100644 --- 
a/libc/test/integration/src/pthread/pthread_create_test.cpp +++ b/libc/test/integration/src/pthread/pthread_create_test.cpp @@ -29,7 +29,7 @@ #include "src/__support/CPP/new.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/IntegrationTest/test.h" @@ -332,7 +332,7 @@ static void run_failure_tests() { } TEST_MAIN() { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; run_success_tests(); run_failure_tests(); return 0; diff --git a/libc/test/integration/src/pthread/pthread_join_test.cpp b/libc/test/integration/src/pthread/pthread_join_test.cpp index 994fa57a6b33..5d0bcd8e2365 100644 --- a/libc/test/integration/src/pthread/pthread_join_test.cpp +++ b/libc/test/integration/src/pthread/pthread_join_test.cpp @@ -9,7 +9,7 @@ #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_join.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/IntegrationTest/test.h" #include @@ -25,7 +25,7 @@ static void nullJoinTest() { } TEST_MAIN() { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; nullJoinTest(); return 0; } diff --git a/libc/test/integration/src/pthread/pthread_name_test.cpp b/libc/test/integration/src/pthread/pthread_name_test.cpp index 37ceceee880d..35dd3b165e0e 100644 --- a/libc/test/integration/src/pthread/pthread_name_test.cpp +++ b/libc/test/integration/src/pthread/pthread_name_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_getname_np.h" #include "src/pthread/pthread_join.h" diff --git a/libc/test/integration/src/unistd/getcwd_test.cpp b/libc/test/integration/src/unistd/getcwd_test.cpp index 551768187bf0..1b321b01e931 100644 --- a/libc/test/integration/src/unistd/getcwd_test.cpp +++ 
b/libc/test/integration/src/unistd/getcwd_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/stdlib/getenv.h" #include "src/unistd/getcwd.h" @@ -31,12 +31,12 @@ TEST_MAIN(int argc, char **argv, char **envp) { cwd = LIBC_NAMESPACE::getcwd(buffer, 0); ASSERT_TRUE(cwd == nullptr); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Insufficient size cwd = LIBC_NAMESPACE::getcwd(buffer, 2); ASSERT_TRUE(cwd == nullptr); - int err = LIBC_NAMESPACE::libc_errno; + int err = libc_errno; ASSERT_EQ(err, ERANGE); return 0; diff --git a/libc/test/integration/startup/linux/tls_test.cpp b/libc/test/integration/startup/linux/tls_test.cpp index ef9fd9fcb7ff..de3bd06c39cf 100644 --- a/libc/test/integration/startup/linux/tls_test.cpp +++ b/libc/test/integration/startup/linux/tls_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sys/mman/mmap.h" #include "test/IntegrationTest/test.h" diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h index d349192f107c..9b4844d410db 100644 --- a/libc/test/src/__support/str_to_fp_test.h +++ b/libc/test/src/__support/str_to_fp_test.h @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" #include "src/__support/uint128.h" diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 1ec882b212b8..40cb76a8bd6a 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ 
b/libc/test/src/__support/str_to_integer_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/libc_errno.h" #include "src/__support/str_to_integer.h" #include diff --git a/libc/test/src/dirent/dirent_test.cpp b/libc/test/src/dirent/dirent_test.cpp index 41f522a6a75f..3f0095ca5ebe 100644 --- a/libc/test/src/dirent/dirent_test.cpp +++ b/libc/test/src/dirent/dirent_test.cpp @@ -7,11 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" +#include "src/__support/libc_errno.h" #include "src/dirent/closedir.h" #include "src/dirent/dirfd.h" #include "src/dirent/opendir.h" #include "src/dirent/readdir.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" @@ -55,17 +55,17 @@ TEST(LlvmLibcDirentTest, SimpleOpenAndRead) { } TEST(LlvmLibcDirentTest, OpenNonExistentDir) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ::DIR *dir = LIBC_NAMESPACE::opendir("___xyz123__.non_existent__"); ASSERT_TRUE(dir == nullptr); ASSERT_ERRNO_EQ(ENOENT); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } TEST(LlvmLibcDirentTest, OpenFile) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ::DIR *dir = LIBC_NAMESPACE::opendir("testdata/file1.txt"); ASSERT_TRUE(dir == nullptr); ASSERT_ERRNO_EQ(ENOTDIR); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } diff --git a/libc/test/src/errno/errno_test.cpp b/libc/test/src/errno/errno_test.cpp index b0db22a85f3b..de82b0077f17 100644 --- a/libc/test/src/errno/errno_test.cpp +++ b/libc/test/src/errno/errno_test.cpp @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcErrnoTest, Basic) { int test_val = 123; - LIBC_NAMESPACE::libc_errno = test_val; + libc_errno = test_val; ASSERT_ERRNO_EQ(test_val); } diff 
--git a/libc/test/src/fcntl/creat_test.cpp b/libc/test/src/fcntl/creat_test.cpp index 4c9d2cbc33f4..d60c98493470 100644 --- a/libc/test/src/fcntl/creat_test.cpp +++ b/libc/test/src/fcntl/creat_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/creat.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 1a21afe51085..082c42481777 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -9,7 +9,7 @@ #include "hdr/fcntl_macros.h" #include "hdr/stdio_macros.h" #include "hdr/types/struct_flock.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/fcntl.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" @@ -166,7 +166,7 @@ TEST(LlvmLibcFcntlTest, UseAfterClose) { } TEST(LlvmLibcFcntlTest, SetGetOwnerTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t pid = LIBC_NAMESPACE::getpid(); ASSERT_GT(pid, -1); diff --git a/libc/test/src/fcntl/openat_test.cpp b/libc/test/src/fcntl/openat_test.cpp index 213b074799c8..1997476f16a6 100644 --- a/libc/test/src/fcntl/openat_test.cpp +++ b/libc/test/src/fcntl/openat_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/fcntl/openat.h" #include "src/unistd/close.h" diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h index 77b465a3a0e6..6af9cfea0e0a 100644 --- a/libc/test/src/math/RoundToIntegerTest.h +++ b/libc/test/src/math/RoundToIntegerTest.h @@ -55,7 +55,7 @@ private: void test_one_input(RoundToIntegerFunc func, 
FloatType input, IntType expected, bool expectError) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); ASSERT_EQ(func(input), expected); diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp index 2e4c8eb2ab96..aa0128fee999 100644 --- a/libc/test/src/math/acosf_test.cpp +++ b/libc/test/src/math/acosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acosf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/acoshf16_test.cpp b/libc/test/src/math/acoshf16_test.cpp index 7348018396bd..2eb95215e4e8 100644 --- a/libc/test/src/math/acoshf16_test.cpp +++ b/libc/test/src/math/acoshf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp index 18ed5a11d50a..3d3b827411a4 100644 --- a/libc/test/src/math/acoshf_test.cpp +++ b/libc/test/src/math/acoshf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest; 
namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acoshf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/asin_test.cpp b/libc/test/src/math/asin_test.cpp index 385e341318ae..03ae963e9f92 100644 --- a/libc/test/src/math/asin_test.cpp +++ b/libc/test/src/math/asin_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::asin(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp index 5197810d8bd5..1eaa6b8a5135 100644 --- a/libc/test/src/math/asinf_test.cpp +++ b/libc/test/src/math/asinf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAsinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp index ac125c3520c4..8c78f939cabf 100644 --- a/libc/test/src/math/asinhf_test.cpp +++ b/libc/test/src/math/asinhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using 
LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp index 331f4281af83..50ab38208089 100644 --- a/libc/test/src/math/atan2f_test.cpp +++ b/libc/test/src/math/atan2f_test.cpp @@ -81,7 +81,7 @@ TEST_F(LlvmLibcAtan2fTest, InFloatRange) { if (FPBits(w).is_nan() || FPBits(w).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::atan2f(x, y); ++total_count; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/atan_test.cpp b/libc/test/src/math/atan_test.cpp index 7f52578b9efe..7fa0dffd607e 100644 --- a/libc/test/src/math/atan_test.cpp +++ b/libc/test/src/math/atan_test.cpp @@ -39,7 +39,7 @@ TEST_F(LlvmLibcAtanTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::atan(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp index 575ec89bd493..a4bdf1867c39 100644 --- a/libc/test/src/math/atanf_test.cpp +++ b/libc/test/src/math/atanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +23,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; // TODO: This test needs to have its checks for exceptions, errno // tightened TEST_F(LlvmLibcAtanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; 
LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanf(aNaN)); // TODO: Uncomment these checks later, RoundingMode affects running diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp index 8b9db1dfdd97..32272ef482ab 100644 --- a/libc/test/src/math/atanhf_test.cpp +++ b/libc/test/src/math/atanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -25,7 +25,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; // tightened https://github.com/llvm/llvm-project/issues/88819. TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(aNaN)); // TODO: Uncomment these checks later, RoundingMode affects running diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp index 2143c36f3d30..90dc8ff6a0ea 100644 --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +23,7 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cosf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp index 0d1c322b8e62..bdaba50f1f14 100644 --- a/libc/test/src/math/coshf_test.cpp +++ 
b/libc/test/src/math/coshf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/coshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::coshf(aNaN)); EXPECT_MATH_ERRNO(0); @@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { } TEST_F(LlvmLibcCoshfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp index 37ec2516f6a3..cb88bfcade0d 100644 --- a/libc/test/src/math/cospif_test.cpp +++ b/libc/test/src/math/cospif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cospif.h" #include "test/UnitTest/FPMatcher.h" #include "test/src/math/sdcomp26094.h" @@ -19,7 +19,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcCospifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp index 6fb1d2d9d925..6126e5f211ff 100644 --- a/libc/test/src/math/exp10_test.cpp +++ b/libc/test/src/math/exp10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include 
"src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -105,7 +105,7 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::exp10(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp index 001b37809d93..89915961c9b9 100644 --- a/libc/test/src/math/exp10f_test.cpp +++ b/libc/test/src/math/exp10f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp10f(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp10fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -55,7 +55,7 @@ TEST_F(LlvmLibcExp10fTest, Overflow) { } TEST_F(LlvmLibcExp10fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( 0.0f, LIBC_NAMESPACE::exp10f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); @@ -97,7 +97,7 @@ TEST_F(LlvmLibcExp10fTest, TrickyInputs) { 0x41200000, // x = 10.0f }; for (int i = 0; i < N; ++i) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float x = FPBits(INPUTS[i]).get_val(); 
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); @@ -113,15 +113,14 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp10f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. - if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp index aee273384f1a..01802bd68f7e 100644 --- a/libc/test/src/math/exp10m1f_test.cpp +++ b/libc/test/src/math/exp10m1f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -69,7 +69,7 @@ TEST_F(LlvmLibcExp10m1fTest, TrickyInputs) { }; for (float x : INPUTS) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, LIBC_NAMESPACE::exp10m1f(x), 0.5); } @@ -82,14 +82,14 @@ TEST_F(LlvmLibcExp10m1fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_inf_or_nan()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp10m1f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid 
results because of its // wider precision. - if (FPBits(result).is_inf_or_nan() || LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_inf_or_nan() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, LIBC_NAMESPACE::exp10m1f(x), 0.5); diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp index adfceceeef4b..4cd95dd5486e 100644 --- a/libc/test/src/math/exp2_test.cpp +++ b/libc/test/src/math/exp2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::exp2(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp index 0c4c82153439..aeecb3e74b07 100644 --- a/libc/test/src/math/exp2f_test.cpp +++ b/libc/test/src/math/exp2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp2f(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp2fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, 
LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -71,7 +71,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { 0xc3150000U, /*-0x1.2ap+7f*/ }; for (int i = 0; i < N; ++i) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); @@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { } TEST_F(LlvmLibcExp2fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( 0.0f, LIBC_NAMESPACE::exp2f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -108,15 +108,14 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp2f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp index 793cf0cc2cbb..0c87657abc08 100644 --- a/libc/test/src/math/exp2m1f_test.cpp +++ b/libc/test/src/math/exp2m1f_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -38,7 +38,7 @@ TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) { }; for (float x : INPUTS) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, LIBC_NAMESPACE::exp2m1f(x), 0.5); } @@ -51,15 +51,14 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp2m1f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, LIBC_NAMESPACE::exp2m1f(x), 0.5); diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp index 0ab3a4e54346..83addaeb943d 100644 --- a/libc/test/src/math/exp_test.cpp +++ b/libc/test/src/math/exp_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -78,7 +78,7 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::exp(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp index 26a0bca4ce25..3c10812ff5bc 100644 --- a/libc/test/src/math/expf_test.cpp +++ b/libc/test/src/math/expf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExpfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expf(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) { } TEST_F(LlvmLibcExpfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, 
LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpfTest, Overflow) { } TEST_F(LlvmLibcExpfTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( 0.0f, LIBC_NAMESPACE::expf(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -76,7 +76,7 @@ TEST_F(LlvmLibcExpfTest, Underflow) { TEST_F(LlvmLibcExpfTest, Borderline) { float x; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; x = FPBits(0x42affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); @@ -110,15 +110,14 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::expf(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp index 9720773d9f96..0cf07e2e4973 100644 --- a/libc/test/src/math/expm1_test.cpp +++ b/libc/test/src/math/expm1_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -64,7 +64,7 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::expm1(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp index 274fe3bb7afb..cf3fe9c26ae1 100644 --- a/libc/test/src/math/expm1f_test.cpp +++ b/libc/test/src/math/expm1f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expm1f(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExpm1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, 
LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpm1fTest, Overflow) { } TEST_F(LlvmLibcExpm1fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(FPBits(0xff7fffffU).get_val())); float x = FPBits(0xc2cffff8U).get_val(); @@ -70,7 +70,7 @@ TEST_F(LlvmLibcExpm1fTest, Underflow) { TEST_F(LlvmLibcExpm1fTest, Borderline) { float x; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; x = FPBits(0x42affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); @@ -119,15 +119,14 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::expm1f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp index 01aa1f82ae5d..e9529d87c388 100644 --- a/libc/test/src/math/log10_test.cpp +++ b/libc/test/src/math/log10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -101,7 +101,7 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log10(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp index 107e965a0d3a..e5747b7e5ec0 100644 --- a/libc/test/src/math/log1p_test.cpp +++ b/libc/test/src/math/log1p_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1p.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -102,7 +102,7 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log1p(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp index bb181dc5e43b..ffe2dd2c33dd 100644 --- a/libc/test/src/math/log1pf_test.cpp +++ 
b/libc/test/src/math/log1pf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1pf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -75,7 +75,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x, LIBC_NAMESPACE::log1pf(x), 0.5); } diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp index 8a07991a6888..fc440c09b42b 100644 --- a/libc/test/src/math/log2_test.cpp +++ b/libc/test/src/math/log2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -100,7 +100,7 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log2(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp index 83691fb75300..92226c763f45 100644 --- a/libc/test/src/math/log2f_test.cpp +++ b/libc/test/src/math/log2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -52,14 +52,13 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 
0; + libc_errno = 0; float result = LIBC_NAMESPACE::log2f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. - if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x, LIBC_NAMESPACE::log2f(x), 0.5); diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp index 969a469b2e1c..54afaa33d135 100644 --- a/libc/test/src/math/log_test.cpp +++ b/libc/test/src/math/log_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -99,7 +99,7 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp index 448dcc0035e9..4d189d813e58 100644 --- a/libc/test/src/math/powf_test.cpp +++ b/libc/test/src/math/powf_test.cpp @@ -78,7 +78,7 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) { if (FPBits(w).is_nan() || FPBits(w).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::powf(x, y); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index d4c6bd416a40..4d5d9ddf464b 100644 --- a/libc/test/src/math/sin_test.cpp +++ 
b/libc/test/src/math/sin_test.cpp @@ -71,7 +71,7 @@ TEST_F(LlvmLibcSinTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::sin(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index 2823110331f3..ad2155f329cd 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sincosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float sin, cos; LIBC_NAMESPACE::sincosf(aNaN, &sin, &cos); diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp index 8fd3ed1577ce..e0357e6157fd 100644 --- a/libc/test/src/math/sinf_test.cpp +++ b/libc/test/src/math/sinf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp index 6867c7aec57d..74f906ebaa98 100644 --- a/libc/test/src/math/sinhf_test.cpp +++ 
b/libc/test/src/math/sinhf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinhf(aNaN)); EXPECT_MATH_ERRNO(0); @@ -65,7 +65,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) { } TEST_F(LlvmLibcSinhfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp index d00fd77d288c..986c676761f0 100644 --- a/libc/test/src/math/sinpif_test.cpp +++ b/libc/test/src/math/sinpif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinpif.h" #include "test/UnitTest/FPMatcher.h" #include "test/src/math/sdcomp26094.h" @@ -21,7 +21,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinpifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h index 8fbcc2a27654..04cbc659ece5 100644 --- a/libc/test/src/math/smoke/FModTest.h +++ b/libc/test/src/math/smoke/FModTest.h @@ -10,7 +10,7 @@ #define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H #include "src/__support/FPUtil/FEnvImpl.h" -#include 
"src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 6ae97ce35a0d..745ccbc748ec 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -40,7 +40,7 @@ private: void test_one_input(RoundToIntegerFunc func, F input, I expected, bool expectError) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); ASSERT_EQ(func(input), expected); diff --git a/libc/test/src/math/smoke/acos_test.cpp b/libc/test/src/math/smoke/acos_test.cpp index 3a59bce26407..fe2caefb52ab 100644 --- a/libc/test/src/math/smoke/acos_test.cpp +++ b/libc/test/src/math/smoke/acos_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/fenv_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acos.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ TEST_F(LlvmLibcAcosTest, SpecialNumbers) { EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(zero)); EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(neg_zero)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acos(inf), FE_INVALID); EXPECT_MATH_ERRNO(EDOM); diff --git a/libc/test/src/math/smoke/acosf16_test.cpp b/libc/test/src/math/smoke/acosf16_test.cpp index c4274b824509..7103dc33fec3 100644 --- a/libc/test/src/math/smoke/acosf16_test.cpp +++ b/libc/test/src/math/smoke/acosf16_test.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acosf16.h" #include 
"test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAcosf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acosf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp index 74f68e00011a..257c6a3d1d22 100644 --- a/libc/test/src/math/smoke/acosf_test.cpp +++ b/libc/test/src/math/smoke/acosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acosf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acoshf16_test.cpp b/libc/test/src/math/smoke/acoshf16_test.cpp index 7681c2a4e7fb..6b9c995cf992 100644 --- a/libc/test/src/math/smoke/acoshf16_test.cpp +++ b/libc/test/src/math/smoke/acoshf16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAcoshf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcoshf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acoshf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp index c5ba88055ac5..b6abfab99929 
100644 --- a/libc/test/src/math/smoke/acoshf_test.cpp +++ b/libc/test/src/math/smoke/acoshf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acoshf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acospif16_test.cpp b/libc/test/src/math/smoke/acospif16_test.cpp index 66b94706eab9..4b2f6de3f7e3 100644 --- a/libc/test/src/math/smoke/acospif16_test.cpp +++ b/libc/test/src/math/smoke/acospif16_test.cpp @@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acospif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" using LlvmLibcAcospif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcospif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acospif16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinf16_test.cpp b/libc/test/src/math/smoke/asinf16_test.cpp index 9f675b08319c..b03f0a420a49 100644 --- a/libc/test/src/math/smoke/asinf16_test.cpp +++ b/libc/test/src/math/smoke/asinf16_test.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAsinf16Test = LIBC_NAMESPACE::testing::FPTest; 
TEST_F(LlvmLibcAsinf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp index d817d2b36619..2615a8ddd16b 100644 --- a/libc/test/src/math/smoke/asinf_test.cpp +++ b/libc/test/src/math/smoke/asinf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinhf16_test.cpp b/libc/test/src/math/smoke/asinhf16_test.cpp index dcaab217331c..7f612ce3c467 100644 --- a/libc/test/src/math/smoke/asinhf16_test.cpp +++ b/libc/test/src/math/smoke/asinhf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcAsinhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinhf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp index 4a8743c50075..d812a2dffe8a 100644 --- a/libc/test/src/math/smoke/asinhf_test.cpp +++ b/libc/test/src/math/smoke/asinhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include 
"src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp index 1fbcfbe96b2d..7f8cfb9830d2 100644 --- a/libc/test/src/math/smoke/atan2f_test.cpp +++ b/libc/test/src/math/smoke/atan2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atan2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(sNaN, sNaN), FE_INVALID); diff --git a/libc/test/src/math/smoke/atanf16_test.cpp b/libc/test/src/math/smoke/atanf16_test.cpp index af50287d9b22..ba1e3b2fc8be 100644 --- a/libc/test/src/math/smoke/atanf16_test.cpp +++ b/libc/test/src/math/smoke/atanf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcAtanf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::atanf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git 
a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp index 7d09a28beaa3..b56b9d0162b9 100644 --- a/libc/test/src/math/smoke/atanf_test.cpp +++ b/libc/test/src/math/smoke/atanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atanhf16_test.cpp b/libc/test/src/math/smoke/atanhf16_test.cpp index 81df6da8cee2..c2a520f7638f 100644 --- a/libc/test/src/math/smoke/atanhf16_test.cpp +++ b/libc/test/src/math/smoke/atanhf16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAtanhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp index 73a5b81b0240..038cb30d89a4 100644 --- a/libc/test/src/math/smoke/atanhf_test.cpp +++ b/libc/test/src/math/smoke/atanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include 
"src/math/atanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -20,7 +20,7 @@ using LIBC_NAMESPACE::Sign; using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); // TODO: Strengthen errno,exception checks and remove these assert macros diff --git a/libc/test/src/math/smoke/cosf16_test.cpp b/libc/test/src/math/smoke/cosf16_test.cpp index 2638551fb1d1..4362a5a3a4bd 100644 --- a/libc/test/src/math/smoke/cosf16_test.cpp +++ b/libc/test/src/math/smoke/cosf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cosf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcCosf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCosf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp index 99773583dcb1..470a876c63a7 100644 --- a/libc/test/src/math/smoke/cosf_test.cpp +++ b/libc/test/src/math/smoke/cosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git 
a/libc/test/src/math/smoke/coshf16_test.cpp b/libc/test/src/math/smoke/coshf16_test.cpp index 08d05ecce86b..7bf62afa24c4 100644 --- a/libc/test/src/math/smoke/coshf16_test.cpp +++ b/libc/test/src/math/smoke/coshf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/coshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) { } TEST_F(LlvmLibcCoshf16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(max_normal), FE_OVERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp index 1611ea1b9292..ee8f0199df3b 100644 --- a/libc/test/src/math/smoke/coshf_test.cpp +++ b/libc/test/src/math/smoke/coshf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/coshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { } TEST_F(LlvmLibcCoshfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, 
LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/cospif16_test.cpp b/libc/test/src/math/smoke/cospif16_test.cpp index edd8ed97b30f..fcde0cc79e35 100644 --- a/libc/test/src/math/smoke/cospif16_test.cpp +++ b/libc/test/src/math/smoke/cospif16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cospif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcCospif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCospif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp index 20153897dc45..3d48909cca93 100644 --- a/libc/test/src/math/smoke/cospif_test.cpp +++ b/libc/test/src/math/smoke/cospif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cospif.h" #include "test/UnitTest/FPMatcher.h" @@ -15,7 +15,7 @@ using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCospifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp index baf8a7681097..50d3de0c7fe7 100644 --- a/libc/test/src/math/smoke/exp10_test.cpp +++ b/libc/test/src/math/smoke/exp10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include 
"src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/exp10f16_test.cpp b/libc/test/src/math/smoke/exp10f16_test.cpp index 1c4ef2aa08a7..bda40348f883 100644 --- a/libc/test/src/math/smoke/exp10f16_test.cpp +++ b/libc/test/src/math/smoke/exp10f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp10f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp10f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10f16(max_normal), FE_OVERFLOW); @@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp10f16Test, Overflow) { } TEST_F(LlvmLibcExp10f16Test, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp10f16(neg_max_normal), FE_UNDERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp index bf39e2cc12d0..fcd334bb9e36 100644 --- a/libc/test/src/math/smoke/exp10f_test.cpp +++ b/libc/test/src/math/smoke/exp10f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest; 
TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -44,7 +44,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp10fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp index dfa7fa477d3d..ed2d5a48b316 100644 --- a/libc/test/src/math/smoke/exp10m1f16_test.cpp +++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10m1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp10m1f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal), FE_OVERFLOW | FE_INEXACT); @@ -67,7 +67,7 @@ TEST_F(LlvmLibcExp10m1f16Test, Overflow) { } TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::exp10m1f16(neg_max_normal), diff --git a/libc/test/src/math/smoke/exp10m1f_test.cpp b/libc/test/src/math/smoke/exp10m1f_test.cpp index 2c2cfdbb08a3..19369a897aaa 100644 --- a/libc/test/src/math/smoke/exp10m1f_test.cpp +++ 
b/libc/test/src/math/smoke/exp10m1f_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp10m1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f(0x1.fffffep+127f), FE_OVERFLOW); @@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp10m1fTest, Overflow) { } TEST_F(LlvmLibcExp10m1fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp10m1f(-max_normal), FE_UNDERFLOW); diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp index 9ab9129416da..aebf80835072 100644 --- a/libc/test/src/math/smoke/exp2_test.cpp +++ b/libc/test/src/math/smoke/exp2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/exp2f16_test.cpp b/libc/test/src/math/smoke/exp2f16_test.cpp index f69b33a3cf37..1eb7343dcd22 100644 --- a/libc/test/src/math/smoke/exp2f16_test.cpp +++ b/libc/test/src/math/smoke/exp2f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2f16.h" 
#include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp2f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp2f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2f16(max_normal), FE_OVERFLOW); @@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp2f16Test, Overflow) { } TEST_F(LlvmLibcExp2f16Test, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp2f16(neg_max_normal), FE_UNDERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp index a928389cc41b..c5243273d9ed 100644 --- a/libc/test/src/math/smoke/exp2f_test.cpp +++ b/libc/test/src/math/smoke/exp2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -45,7 +45,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp2fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/exp2m1f16_test.cpp b/libc/test/src/math/smoke/exp2m1f16_test.cpp index 
f423196a7036..635b7a6e187d 100644 --- a/libc/test/src/math/smoke/exp2m1f16_test.cpp +++ b/libc/test/src/math/smoke/exp2m1f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2m1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp2m1f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -39,7 +39,7 @@ TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp2m1f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f16(max_normal), FE_OVERFLOW | FE_INEXACT); @@ -65,7 +65,7 @@ TEST_F(LlvmLibcExp2m1f16Test, Overflow) { } TEST_F(LlvmLibcExp2m1f16Test, ResultNearNegOne) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(-1.0, LIBC_NAMESPACE::exp2m1f16(neg_max_normal), FE_INEXACT); diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp index 99bdf0035df0..63852e11655a 100644 --- a/libc/test/src/math/smoke/exp2m1f_test.cpp +++ b/libc/test/src/math/smoke/exp2m1f_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode; using LIBC_NAMESPACE::fputil::testing::RoundingMode; TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2m1f(sNaN), FE_INVALID); 
EXPECT_MATH_ERRNO(0); @@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp2m1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.fffffep+127), FE_OVERFLOW); @@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp2m1fTest, Overflow) { } TEST_F(LlvmLibcExp2m1fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.fffffep+127), FE_UNDERFLOW); diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp index f86243092f1f..c3b2ae70e1d9 100644 --- a/libc/test/src/math/smoke/exp_test.cpp +++ b/libc/test/src/math/smoke/exp_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp index ab745a3cf6f5..863f694ffc41 100644 --- a/libc/test/src/math/smoke/expf16_test.cpp +++ b/libc/test/src/math/smoke/expf16_test.cpp @@ -9,7 +9,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -17,7 +17,7 @@ using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -41,7 +41,7 @@ TEST_F(LlvmLibcExpf16Test, SpecialNumbers) { } TEST_F(LlvmLibcExpf16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expf16(max_normal), 
FE_OVERFLOW); @@ -54,7 +54,7 @@ TEST_F(LlvmLibcExpf16Test, Overflow) { } TEST_F(LlvmLibcExpf16Test, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::expf16(neg_max_normal), FE_UNDERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp index eee830499927..d34151735afa 100644 --- a/libc/test/src/math/smoke/expf_test.cpp +++ b/libc/test/src/math/smoke/expf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) { } TEST_F(LlvmLibcExpfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp index bc71c53abc7a..c842fe3c45fe 100644 --- a/libc/test/src/math/smoke/expm1_test.cpp +++ b/libc/test/src/math/smoke/expm1_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/expm1f16_test.cpp b/libc/test/src/math/smoke/expm1f16_test.cpp index f297c5dfc3c7..4d19a9bac5eb 100644 --- a/libc/test/src/math/smoke/expm1f16_test.cpp +++ 
b/libc/test/src/math/smoke/expm1f16_test.cpp @@ -9,7 +9,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -17,7 +17,7 @@ using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expm1f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExpm1f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expm1f16(max_normal), FE_OVERFLOW | FE_INEXACT); @@ -67,7 +67,7 @@ TEST_F(LlvmLibcExpm1f16Test, Overflow) { } TEST_F(LlvmLibcExpm1f16Test, ResultNearNegOne) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(neg_max_normal), diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp index dfb474d70fb6..214bfe8abd4d 100644 --- a/libc/test/src/math/smoke/expm1f_test.cpp +++ b/libc/test/src/math/smoke/expm1f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExpm1fTest, Overflow) { - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp index ff73850c5210..49cfda85111a 100644 --- a/libc/test/src/math/smoke/log10_test.cpp +++ b/libc/test/src/math/smoke/log10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log10f16_test.cpp b/libc/test/src/math/smoke/log10f16_test.cpp index 471e19893332..53f5ac46aa60 100644 --- a/libc/test/src/math/smoke/log10f16_test.cpp +++ b/libc/test/src/math/smoke/log10f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log10f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog10f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log10f16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp index 631c24b8abcf..61c56cd2c6dd 100644 --- a/libc/test/src/math/smoke/log1p_test.cpp +++ b/libc/test/src/math/smoke/log1p_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1p.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log1pf_test.cpp 
b/libc/test/src/math/smoke/log1pf_test.cpp index bd828ad58c4c..dc3489fddf99 100644 --- a/libc/test/src/math/smoke/log1pf_test.cpp +++ b/libc/test/src/math/smoke/log1pf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1pf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp index 9993d442967c..0534d00b1f40 100644 --- a/libc/test/src/math/smoke/log2_test.cpp +++ b/libc/test/src/math/smoke/log2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log2f16_test.cpp b/libc/test/src/math/smoke/log2f16_test.cpp index 6d98482aa449..fd20652d2f00 100644 --- a/libc/test/src/math/smoke/log2f16_test.cpp +++ b/libc/test/src/math/smoke/log2f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog2f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log2f16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp index 8648b75b88b8..53d54ac36763 100644 --- a/libc/test/src/math/smoke/log2f_test.cpp +++ b/libc/test/src/math/smoke/log2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include 
"src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp index d31eb0c1db73..09e9ab0a9a4d 100644 --- a/libc/test/src/math/smoke/log_test.cpp +++ b/libc/test/src/math/smoke/log_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/logf16_test.cpp b/libc/test/src/math/smoke/logf16_test.cpp index c7232aa1c1e3..2784f3d5fa54 100644 --- a/libc/test/src/math/smoke/logf16_test.cpp +++ b/libc/test/src/math/smoke/logf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/logf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLogf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::logf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp index 5f66868f12a1..8ba0d04347bb 100644 --- a/libc/test/src/math/smoke/sincosf_test.cpp +++ b/libc/test/src/math/smoke/sincosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sincosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) 
{ - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float sin, cos; LIBC_NAMESPACE::sincosf(sNaN, &sin, &cos); diff --git a/libc/test/src/math/smoke/sinf16_test.cpp b/libc/test/src/math/smoke/sinf16_test.cpp index a0e7a7ba321f..6b168ac040db 100644 --- a/libc/test/src/math/smoke/sinf16_test.cpp +++ b/libc/test/src/math/smoke/sinf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp index de504b4f5335..8173969fb256 100644 --- a/libc/test/src/math/smoke/sinf_test.cpp +++ b/libc/test/src/math/smoke/sinf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinhf16_test.cpp b/libc/test/src/math/smoke/sinhf16_test.cpp index 4f21d33ba78e..d52739a9adb3 100644 --- a/libc/test/src/math/smoke/sinhf16_test.cpp +++ b/libc/test/src/math/smoke/sinhf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" 
+#include "src/__support/libc_errno.h" #include "src/math/sinhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sinhf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -38,7 +38,7 @@ TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) { } TEST_F(LlvmLibcSinhf16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::sinhf16(max_normal), FE_OVERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp index e22cfc7ea14d..ea6a4474a780 100644 --- a/libc/test/src/math/smoke/sinhf_test.cpp +++ b/libc/test/src/math/smoke/sinhf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -52,7 +52,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) { } TEST_F(LlvmLibcSinhfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp index b2db6fb9f862..9edf2cc663d4 100644 --- a/libc/test/src/math/smoke/sinpif16_test.cpp +++ b/libc/test/src/math/smoke/sinpif16_test.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinpif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinpif_test.cpp b/libc/test/src/math/smoke/sinpif_test.cpp index 1ba5c1d2b720..b840f3980eda 100644 --- a/libc/test/src/math/smoke/sinpif_test.cpp +++ b/libc/test/src/math/smoke/sinpif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinpif.h" #include "test/UnitTest/FPMatcher.h" @@ -15,7 +15,7 @@ using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinpifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp index f65b9fced72c..95d200cf5591 100644 --- a/libc/test/src/math/smoke/tanf16_test.cpp +++ b/libc/test/src/math/smoke/tanf16_test.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; 
EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp index 178e9065f430..12deca5cf941 100644 --- a/libc/test/src/math/smoke/tanf_test.cpp +++ b/libc/test/src/math/smoke/tanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanhf16_test.cpp b/libc/test/src/math/smoke/tanhf16_test.cpp index fa6328e9ef0a..eb90f02a8d7c 100644 --- a/libc/test/src/math/smoke/tanhf16_test.cpp +++ b/libc/test/src/math/smoke/tanhf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tanhf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) { } TEST_F(LlvmLibcTanhf16Test, ResultNearBounds) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(1.0), LIBC_NAMESPACE::tanhf16(max_normal), FE_INEXACT); diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp index 
c09761ef531f..b12a331b3190 100644 --- a/libc/test/src/math/smoke/tanhf_test.cpp +++ b/libc/test/src/math/smoke/tanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanpif16_test.cpp b/libc/test/src/math/smoke/tanpif16_test.cpp index 74797d1649b1..ea896d7bb3e5 100644 --- a/libc/test/src/math/smoke/tanpif16_test.cpp +++ b/libc/test/src/math/smoke/tanpif16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanpif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcTanpif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanpif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanpif16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp index 9061cf6fb30b..ecc70194b649 100644 --- a/libc/test/src/math/tanf_test.cpp +++ b/libc/test/src/math/tanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = 
LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcTanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp index 389abe4d8589..966ce649e2b3 100644 --- a/libc/test/src/math/tanhf_test.cpp +++ b/libc/test/src/math/tanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcTanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/poll/poll_test.cpp b/libc/test/src/poll/poll_test.cpp index 30f5e41c61ec..97b7b0271817 100644 --- a/libc/test/src/poll/poll_test.cpp +++ b/libc/test/src/poll/poll_test.cpp @@ -7,18 +7,18 @@ //===----------------------------------------------------------------------===// #include "hdr/limits_macros.h" // UINT_MAX -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/poll/poll.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcPollTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int ret = LIBC_NAMESPACE::poll(nullptr, 0, 0); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(0, ret); } TEST(LlvmLibcPollTest, SmokeFailureTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int ret = LIBC_NAMESPACE::poll(nullptr, UINT_MAX, 0); ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(-1, ret); diff --git a/libc/test/src/sched/affinity_test.cpp b/libc/test/src/sched/affinity_test.cpp index b5085203e5ce..b77f22f8e60d 100644 --- a/libc/test/src/sched/affinity_test.cpp +++ 
b/libc/test/src/sched/affinity_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_getaffinity.h" #include "src/sched/sched_setaffinity.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -17,7 +17,7 @@ TEST(LlvmLibcSchedAffinityTest, SmokeTest) { cpu_set_t mask; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); ASSERT_GT(tid, pid_t(0)); @@ -32,15 +32,15 @@ TEST(LlvmLibcSchedAffinityTest, BadMask) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT( LIBC_NAMESPACE::sched_getaffinity(tid, sizeof(cpu_set_t), nullptr), Fails(EFAULT)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT( LIBC_NAMESPACE::sched_setaffinity(tid, sizeof(cpu_set_t), nullptr), Fails(EFAULT)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } diff --git a/libc/test/src/sched/cpu_count_test.cpp b/libc/test/src/sched/cpu_count_test.cpp index 5250368a2616..919f1475e1d4 100644 --- a/libc/test/src/sched/cpu_count_test.cpp +++ b/libc/test/src/sched/cpu_count_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_getaffinity.h" #include "src/sched/sched_getcpucount.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -17,7 +17,7 @@ TEST(LlvmLibcSchedCpuCountTest, SmokeTest) { cpu_set_t mask; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); 
ASSERT_GT(tid, pid_t(0)); diff --git a/libc/test/src/sched/get_priority_test.cpp b/libc/test/src/sched/get_priority_test.cpp index 59205c51e4a1..bb41dc0be201 100644 --- a/libc/test/src/sched/get_priority_test.cpp +++ b/libc/test/src/sched/get_priority_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_max.h" #include "src/sched/sched_get_priority_min.h" #include "test/UnitTest/Test.h" @@ -58,7 +58,7 @@ TEST(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) { } TEST(LlvmLibcSchedGetPriorityTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // We Test: // SCHED_OTHER, SCHED_FIFO, SCHED_RR diff --git a/libc/test/src/sched/param_and_scheduler_test.cpp b/libc/test/src/sched/param_and_scheduler_test.cpp index 747c7e3409e4..4f2b6e412a4b 100644 --- a/libc/test/src/sched/param_and_scheduler_test.cpp +++ b/libc/test/src/sched/param_and_scheduler_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_max.h" #include "src/sched/sched_get_priority_min.h" #include "src/sched/sched_getparam.h" @@ -37,7 +37,7 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { public: void testSched(int policy, bool is_mandatory) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int init_policy = LIBC_NAMESPACE::sched_getscheduler(0); ASSERT_GE(init_policy, 0); @@ -55,30 +55,29 @@ public: // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(-1, policy, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(-1), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Invalid Policy 
ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy | 128, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Out of bounds priority param.sched_priority = min_priority - 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; param.sched_priority = max_priority + 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, ¶m), -1); // A bit hard to test as depending on user privileges we can run into // different issues. - ASSERT_TRUE(LIBC_NAMESPACE::libc_errno == EINVAL || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + ASSERT_TRUE(libc_errno == EINVAL || libc_errno == EPERM); + libc_errno = 0; param.sched_priority = min_priority; // Success/unsupported policy/missing permissions. @@ -87,10 +86,9 @@ public: ASSERT_TRUE(setscheduler_result == 0 || setscheduler_result == -1); ASSERT_TRUE( setscheduler_result != -1 - ? (LIBC_NAMESPACE::libc_errno == 0) - : ((!is_mandatory && LIBC_NAMESPACE::libc_errno == EINVAL) || - LIBC_NAMESPACE::libc_errno == EPERM)); - LIBC_NAMESPACE::libc_errno = 0; + ? (libc_errno == 0) + : ((!is_mandatory && libc_errno == EINVAL) || libc_errno == EPERM)); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(0), setscheduler_result != -1 ? 
policy : init_policy); @@ -100,12 +98,12 @@ public: param.sched_priority = -1; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; param.sched_priority = max_priority + 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; for (int priority = min_priority; priority <= max_priority; ++priority) { ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, ¶m), 0); @@ -117,21 +115,20 @@ public: // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(-1, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(-1, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Success/unsupported policy/missing permissions int setparam_result = LIBC_NAMESPACE::sched_setparam(0, ¶m); ASSERT_TRUE(setparam_result == 0 || setparam_result == -1); ASSERT_TRUE(setparam_result != -1 - ? (LIBC_NAMESPACE::libc_errno == 0) - : ((setscheduler_result == -1 && - LIBC_NAMESPACE::libc_errno == EINVAL) || - LIBC_NAMESPACE::libc_errno == EPERM)); - LIBC_NAMESPACE::libc_errno = 0; + ? 
(libc_errno == 0) + : ((setscheduler_result == -1 && libc_errno == EINVAL) || + libc_errno == EPERM)); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, ¶m), 0); ASSERT_ERRNO_SUCCESS(); @@ -143,7 +140,7 @@ public: // Null test ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } }; @@ -161,13 +158,13 @@ LIST_SCHED_TESTS(SCHED_BATCH, true) LIST_SCHED_TESTS(SCHED_IDLE, true) TEST(LlvmLibcSchedParamAndSchedulerTest, NullParamTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } diff --git a/libc/test/src/sched/sched_rr_get_interval_test.cpp b/libc/test/src/sched/sched_rr_get_interval_test.cpp index c22a2c76d743..a0fe5edbe014 100644 --- a/libc/test/src/sched/sched_rr_get_interval_test.cpp +++ b/libc/test/src/sched/sched_rr_get_interval_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_min.h" #include "src/sched/sched_getscheduler.h" #include "src/sched/sched_rr_get_interval.h" @@ -17,7 +17,7 @@ #include TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; auto SetSched = [&](int policy) { int min_priority = LIBC_NAMESPACE::sched_get_priority_min(policy); ASSERT_GE(min_priority, 0); @@ -58,19 +58,19 @@ TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { // Null timespec ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, nullptr), -1); ASSERT_ERRNO_EQ(EFAULT); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Negative pid 
ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(-1, &ts), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } // Negative tests don't have SCHED_RR set SetSched(SCHED_OTHER); ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, &ts), 0); ASSERT_ERRNO_SUCCESS(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // TODO: Missing unkown pid -> ESRCH. This is read only so safe to try a few // unlikely values. diff --git a/libc/test/src/sched/yield_test.cpp b/libc/test/src/sched/yield_test.cpp index f1627a71fa9a..4d13d50e25eb 100644 --- a/libc/test/src/sched/yield_test.cpp +++ b/libc/test/src/sched/yield_test.cpp @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_yield.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcSchedYieldTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // sched_yield() always succeeds, just do a basic test that errno/ret are // properly 0. ASSERT_EQ(LIBC_NAMESPACE::sched_yield(), 0); diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp index cc392da8f473..ce4dfddae248 100644 --- a/libc/test/src/signal/sigaltstack_test.cpp +++ b/libc/test/src/signal/sigaltstack_test.cpp @@ -8,7 +8,7 @@ #include "hdr/signal_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
-#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/signal/raise.h" #include "src/signal/sigaction.h" @@ -46,7 +46,7 @@ static void handler(int) { TEST(LlvmLibcSignalTest, SigaltstackRunOnAltStack) { struct sigaction action; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGUSR1, nullptr, &action), Succeeds(0)); action.sa_handler = handler; diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index bac9c3b8b68b..62b86bf44029 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/signal/raise.h" #include "src/signal/signal.h" @@ -17,7 +17,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcSignal, Invalid) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; auto *valid = +[](int) {}; EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid), Fails(EINVAL, (void *)SIG_ERR)); diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp index 12403f68b593..891eac0f5bf7 100644 --- a/libc/test/src/signal/sigprocmask_test.cpp +++ b/libc/test/src/signal/sigprocmask_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/signal/raise.h" #include "src/signal/sigaddset.h" #include "src/signal/sigemptyset.h" @@ -33,7 +33,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; // This tests for invalid input. 
TEST_F(LlvmLibcSignalTest, SigprocmaskInvalid) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; sigset_t valid; // 17 and -4 are out of the range for sigprocmask's how paramater. diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp index c1edf56bdbd8..01ccb8218ee2 100644 --- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp +++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/spawn/file_actions.h" #include "src/spawn/posix_spawn_file_actions_addclose.h" #include "src/spawn/posix_spawn_file_actions_adddup2.h" diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index ef36cff2ffbd..104fc478b100 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,7 +9,7 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" @@ -22,7 +22,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -53,7 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { } TEST(LlvmLibcStdioFdopenTest, InvalidFd) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -65,7 +65,7 @@ 
TEST(LlvmLibcStdioFdopenTest, InvalidFd) { } TEST(LlvmLibcStdioFdopenTest, InvalidMode) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -83,7 +83,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 2cc8436bd66f..56bde5f0099a 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -17,7 +17,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: @@ -33,7 +33,7 @@ public: // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 46cf12c2c253..90429ecf4e82 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -20,7 +20,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: @@ -36,7 +36,7 @@ public: // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index a8a2c62f07b5..abed3d405293 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -14,7 +14,7 @@ #include "src/stdio/fwrite.h" #include "test/UnitTest/Test.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; @@ -35,7 +35,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index a0368d701a67..e624181c795b 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -21,7 +21,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; @@ -41,7 +41,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,7 +72,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -80,15 +80,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -103,10 +103,10 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -121,15 +121,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. 
- // LIBC_NAMESPACE::libc_errno = 0; + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); - // LIBC_NAMESPACE::libc_errno = 0; + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); - // LIBC_NAMESPACE::libc_errno = 0; + // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); @@ -165,7 +165,7 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index 61ce2a207fa1..03e1ac286b64 100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -20,7 +20,7 @@ #include "hdr/stdio_macros.h" #include "hdr/types/size_t.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" using MemoryView = LIBC_NAMESPACE::testing::MemoryView; @@ -67,7 +67,7 @@ int seek_ss(void *cookie, off64_t *offset, int whence) { } else if (whence == SEEK_END) { new_offset = *offset + ss->endpos; } else { - LIBC_NAMESPACE::libc_errno = EINVAL; + libc_errno = EINVAL; return -1; } if (new_offset < 0 || size_t(new_offset) > ss->bufsize) @@ -115,7 +115,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -149,7 +149,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { 
LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -178,7 +178,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 72875600903a..84984e26398c 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -14,13 +14,13 @@ #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to create a file and remove it, and also verify that // it was removed. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -39,7 +39,7 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. 
- LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index a5dd734c6361..ac494a4ecaf8 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,7 +8,7 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" @@ -19,7 +19,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index a1e1fee25db3..5872943c1bb4 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -14,7 +14,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and @@ -102,6 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f6af6ad3e364..f1b545ba546f 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -10,7 +10,7 @@ #include 
"src/stdio/sprintf.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/UnitTest/RoundingModeUtils.h" #include "test/UnitTest/Test.h" #include @@ -3228,46 +3228,46 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) { char buff[1000]; int written; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%m"); ASSERT_STREQ_LEN(written, buff, "Success"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%m"); ASSERT_STREQ_LEN(written, buff, "Numerical result out of range"); // Check that it correctly consumes no arguments. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%m %d", 1); ASSERT_STREQ_LEN(written, buff, "Success 1"); // Width Tests - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%10m"); ASSERT_STREQ_LEN(written, buff, " Success"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%10m"); ASSERT_STREQ_LEN(written, buff, "Numerical result out of range"); // Precision Tests - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%.10m"); ASSERT_STREQ_LEN(written, buff, "Success"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%.10m"); ASSERT_STREQ_LEN(written, buff, "Numerical "); // Flag Tests (Only '-' since the others only affect ints) - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%-10m"); ASSERT_STREQ_LEN(written, buff, "Success "); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%-10m"); ASSERT_STREQ_LEN(written, buff, "Numerical result out of range"); @@ -3275,93 +3275,93 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) { // Since alt mode here is effectively a 
completely separate conversion, it // gets separate tests. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#m"); ASSERT_STREQ_LEN(written, buff, "0"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // Alt Mode Width - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#10m"); ASSERT_STREQ_LEN(written, buff, " 0"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#10m"); ASSERT_STREQ_LEN(written, buff, " ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#10m"); ASSERT_STREQ_LEN(written, buff, " -9999"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#3m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#3m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // Alt Mode Precision - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#.10m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#.10m"); ASSERT_STREQ_LEN(written, buff, "-0000009999"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#.3m"); ASSERT_STREQ_LEN(written, buff, "ERA"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#.3m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // We don't test precision (or int flags) on errno = 0 because it behaves // weirdly, 
see the docs for more information. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#.1m"); ASSERT_STREQ_LEN(written, buff, "0"); // Alt Mode Flags // '-' flag - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#-10m"); ASSERT_STREQ_LEN(written, buff, "0 "); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#-10m"); ASSERT_STREQ_LEN(written, buff, "ERANGE "); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#-10m"); ASSERT_STREQ_LEN(written, buff, "-9999 "); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#-3m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#-3m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // '+' flag - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#+m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#+m"); ASSERT_STREQ_LEN(written, buff, "-9999"); @@ -3370,38 +3370,38 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) { // come up, but I've avoided it for the other %m tests for ease of // refactoring if necessary. Here it needs to be positive to test that the // flags that only affect positive signed integers are properly passed along. 
- LIBC_NAMESPACE::libc_errno = 9999; + libc_errno = 9999; written = LIBC_NAMESPACE::sprintf(buff, "%#+m"); ASSERT_STREQ_LEN(written, buff, "+9999"); // ' ' flag - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%# m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%# m"); ASSERT_STREQ_LEN(written, buff, "-9999"); - LIBC_NAMESPACE::libc_errno = 9999; + libc_errno = 9999; written = LIBC_NAMESPACE::sprintf(buff, "%# m"); ASSERT_STREQ_LEN(written, buff, " 9999"); // '0' flag - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#010m"); ASSERT_STREQ_LEN(written, buff, " ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#010m"); ASSERT_STREQ_LEN(written, buff, "-000009999"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#03m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#03m"); ASSERT_STREQ_LEN(written, buff, "-9999"); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index 67f1b0ff513b..5d482b70064b 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -17,7 +17,7 @@ #include "src/stdio/fwrite_unlocked.h" #include "test/UnitTest/Test.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; @@ -36,7 +36,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,7 +57,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c78..3eeccc5727e7 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,6 +9,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/ctype_utils.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtoint32_test.cpp b/libc/test/src/stdlib/strtoint32_test.cpp index 17df432fc8e6..e6da692714d2 100644 --- a/libc/test/src/stdlib/strtoint32_test.cpp +++ b/libc/test/src/stdlib/strtoint32_test.cpp @@ -8,9 +8,9 @@ #include +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" #include "StrtolTest.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ int32_t strtoint32(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); @@ -33,7 +33,7 @@ uint32_t strtouint32(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = 
result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); diff --git a/libc/test/src/stdlib/strtoint64_test.cpp b/libc/test/src/stdlib/strtoint64_test.cpp index b5fe69dfaa70..2c5d948f5fae 100644 --- a/libc/test/src/stdlib/strtoint64_test.cpp +++ b/libc/test/src/stdlib/strtoint64_test.cpp @@ -8,9 +8,9 @@ #include +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" #include "StrtolTest.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ int64_t strtoint64(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); @@ -33,7 +33,7 @@ uint64_t strtouint64(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index eb4056dc7ba6..c2f2b9c9a11c 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" diff --git a/libc/test/src/sys/mman/linux/mlock_test.cpp b/libc/test/src/sys/mman/linux/mlock_test.cpp index 88abacad554e..6b81411ca604 100644 --- a/libc/test/src/sys/mman/linux/mlock_test.cpp +++ b/libc/test/src/sys/mman/linux/mlock_test.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" // For internal syscall function. -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sys/mman/madvise.h" #include "src/sys/mman/mincore.h" #include "src/sys/mman/mlock.h" @@ -149,9 +149,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) { Succeeds()); auto retval = LIBC_NAMESPACE::mlockall(MCL_CURRENT); if (retval == -1) { - EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM); + libc_errno = 0; return; } unsigned char vec; @@ -163,9 +162,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) { { auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE); if (retval == -1) { - EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM); + libc_errno = 0; return; } PageHolder holder; @@ -180,9 +178,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) { { auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE | MCL_ONFAULT); if (retval == -1) { - EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM); + libc_errno = 0; return; } PageHolder holder; diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp index 455a82678e18..ba0ee4f09109 100644 --- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/fcntl/open.h" 
#include "src/sys/stat/mkdirat.h" #include "src/sys/statvfs/fstatvfs.h" @@ -41,7 +41,7 @@ TEST_F(LlvmLibcSysFStatvfsTest, FStatvfsInvalidPath) { // Always delete the folder so that we start in a consistent state. LIBC_NAMESPACE::rmdir(TEST_DIR); - LIBC_NAMESPACE::libc_errno = 0; // Reset errno + libc_errno = 0; // Reset errno ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), Succeeds(0)); diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp index f356bb3d277b..327dec07a1b7 100644 --- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/stat/mkdirat.h" #include "src/sys/statvfs/statvfs.h" #include "src/unistd/rmdir.h" @@ -37,7 +37,7 @@ TEST_F(LlvmLibcSysStatvfsTest, StatvfsInvalidPath) { // Always delete the folder so that we start in a consistent state. 
LIBC_NAMESPACE::rmdir(TEST_DIR); - LIBC_NAMESPACE::libc_errno = 0; // Reset errno + libc_errno = 0; // Reset errno ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), Succeeds(0)); diff --git a/libc/test/src/sys/time/setitimer_test.cpp b/libc/test/src/sys/time/setitimer_test.cpp index 16d33fdf1e4f..115f9e662ed4 100644 --- a/libc/test/src/sys/time/setitimer_test.cpp +++ b/libc/test/src/sys/time/setitimer_test.cpp @@ -24,7 +24,7 @@ static bool timer_fired(false); extern "C" void handle_sigalrm(int) { timer_fired = true; } TEST_F(LlvmLibcSysTimeSetitimerTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; struct sigaction sa; sa.sa_handler = handle_sigalrm; LIBC_NAMESPACE::sigemptyset(&sa.sa_mask); diff --git a/libc/test/src/termios/termios_test.cpp b/libc/test/src/termios/termios_test.cpp index f8fc09a8bbf0..5ec169a886b1 100644 --- a/libc/test/src/termios/termios_test.cpp +++ b/libc/test/src/termios/termios_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/termios/cfgetispeed.h" #include "src/termios/cfgetospeed.h" @@ -30,21 +30,21 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcTermiosTest, SpeedSmokeTest) { struct termios t; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, B50), Succeeds(0)); ASSERT_EQ(LIBC_NAMESPACE::cfgetispeed(&t), speed_t(B50)); ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, B75), Succeeds(0)); ASSERT_EQ(LIBC_NAMESPACE::cfgetospeed(&t), speed_t(B75)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, ~CBAUD), Fails(EINVAL)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, ~CBAUD), Fails(EINVAL)); } TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) { struct termios t; - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY); if (fd < 0) return; // When /dev/tty is not available, no point continuing. @@ -54,7 +54,7 @@ TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) { } TEST(LlvmLibcTermiosTest, TcGetSidSmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY); if (fd < 0) return; // When /dev/tty is not available, no point continuing. diff --git a/libc/test/src/time/asctime_r_test.cpp b/libc/test/src/time/asctime_r_test.cpp index b595cfe02486..d840248b7df4 100644 --- a/libc/test/src/time/asctime_r_test.cpp +++ b/libc/test/src/time/asctime_r_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/asctime_r.h" #include "src/time/time_constants.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/time/asctime_test.cpp b/libc/test/src/time/asctime_test.cpp index 169a7463a303..cad25fffc65a 100644 --- a/libc/test/src/time/asctime_test.cpp +++ b/libc/test/src/time/asctime_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/asctime.h" #include "test/UnitTest/Test.h" #include "test/src/time/TmHelper.h" diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp index 27011b7e0fbd..fe43877aa499 100644 --- a/libc/test/src/time/ctime_r_test.cpp +++ b/libc/test/src/time/ctime_r_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/ctime_r.h" #include "src/time/time_constants.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/time/ctime_test.cpp 
b/libc/test/src/time/ctime_test.cpp index 6f1168f0b668..5ff69f6619b4 100644 --- a/libc/test/src/time/ctime_test.cpp +++ b/libc/test/src/time/ctime_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/ctime.h" #include "test/UnitTest/Test.h" #include "test/src/time/TmHelper.h" diff --git a/libc/test/src/time/gmtime_test.cpp b/libc/test/src/time/gmtime_test.cpp index 6af5a18d3699..41236665d2ea 100644 --- a/libc/test/src/time/gmtime_test.cpp +++ b/libc/test/src/time/gmtime_test.cpp @@ -8,7 +8,7 @@ #include "hdr/types/struct_tm.h" #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/gmtime.h" #include "src/time/time_constants.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -30,7 +30,7 @@ TEST(LlvmLibcGmTime, OutOfRange) { EXPECT_TRUE(tm_data == nullptr); ASSERT_ERRNO_EQ(EOVERFLOW); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; seconds = INT_MIN * static_cast( diff --git a/libc/test/src/time/nanosleep_test.cpp b/libc/test/src/time/nanosleep_test.cpp index d4f98e29bd98..e0200ff3aaa2 100644 --- a/libc/test/src/time/nanosleep_test.cpp +++ b/libc/test/src/time/nanosleep_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/types/struct_timespec.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/nanosleep.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" @@ -17,7 +17,7 @@ namespace cpp = LIBC_NAMESPACE::cpp; TEST(LlvmLibcNanosleep, SmokeTest) { // TODO: When we have the code to read clocks, test that time has passed. 
using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; struct timespec tim = {1, 500}; struct timespec tim2 = {0, 0}; diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index b86d2f27e516..123d9ccc8310 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1601,6 +1601,7 @@ libc_support_library( libc_header_library( name = "libcxx_shared_headers", hdrs = [ + "shared/libc_common.h", "shared/fp_bits.h", "shared/str_to_float.h", "shared/str_to_integer.h", @@ -1618,7 +1619,7 @@ libc_header_library( libc_support_library( name = "errno", srcs = ["src/errno/libc_errno.cpp"], - hdrs = ["src/errno/libc_errno.h"], + hdrs = ["src/__support/libc_errno.h"], deps = [ ":__support_common", ":__support_cpp_atomic", From 79108da325daec08f5b50169a9c35e03ea0645a3 Mon Sep 17 00:00:00 2001 From: sribee8 <145801438+sribee8@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:28:55 +0000 Subject: [PATCH 0056/1322] [libc][obvious] Changed incorrect type (#143780) After changing mbstate_t to mbstate we forgot to change the character_converter files to reflect it. 
Co-authored-by: Sriya Pratipati --- libc/src/__support/wchar/character_converter.cpp | 2 +- libc/src/__support/wchar/character_converter.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 0afc2a6f59e6..3cdb8ca83b7f 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; } +CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; } bool CharacterConverter::isComplete() {} diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index a6bac4380537..d0602d2defe2 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -19,10 +19,10 @@ namespace internal { class CharacterConverter { private: - mbstate_t *state; + mbstate *state; public: - CharacterConverter(mbstate_t *mbstate); + CharacterConverter(mbstate *mbstate); bool isComplete(); From c0c0f60ca14422dfbfe27fddd8d47faa596165d8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Jun 2025 22:09:55 +0100 Subject: [PATCH 0057/1322] [GlobalOpt] Bail out on non-ConstExprs in isSimpleEnoughValueToCommitHelper.
(#143400) Bail out for non-ConstantExpr constants in isSimpleEnoughValueToCommitHelper to prevent a crash when such constants are cast to ConstantExpr. PR: https://github.com/llvm/llvm-project/pull/143400 --- llvm/lib/Transforms/Utils/Evaluator.cpp | 4 +- .../global-constructor-complex-constants.ll | 64 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index 2af447aadce2..d1db2ee29f3a 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -77,7 +77,9 @@ isSimpleEnoughValueToCommitHelper(Constant *C, // We don't know exactly what relocations are allowed in constant expressions, // so we allow &global+constantoffset, which is safe and uniformly supported // across targets. - ConstantExpr *CE = cast<ConstantExpr>(C); + ConstantExpr *CE = dyn_cast<ConstantExpr>(C); + if (!CE) + return false; switch (CE->getOpcode()) { case Instruction::BitCast: // Bitcast is fine if the casted value is fine. diff --git a/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll new file mode 100644 index 000000000000..6d9bdc41a004 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -p globalopt -S %s | FileCheck %s + +@llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }] + +@foo = internal global ptr null + +declare void @user(ptr) + +;.
+; CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }] +; CHECK: @foo = internal global ptr null +;. +define void @ctor() { +; CHECK-LABEL: define void @ctor() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr ptrauth (ptr @foo, i32 0), ptr [[DST]], align 8 +; CHECK-NEXT: call void @user(ptr [[DST]]) +; CHECK-NEXT: ret void +; +entry: + %dst = alloca ptr, align 8 + store ptr ptrauth (ptr @foo, i32 0), ptr %dst, align 8 + call void @user(ptr %dst) + ret void +} + +define void @ctor_nocfi() { +; CHECK-LABEL: define void @ctor_nocfi() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr no_cfi @foo, ptr [[DST]], align 8 +; CHECK-NEXT: call void @user(ptr [[DST]]) +; CHECK-NEXT: ret void +; +entry: + %dst = alloca ptr, align 8 + store ptr no_cfi @foo, ptr %dst, align 8 + call void @user(ptr %dst) + ret void +} + +define void @fn() { +; CHECK-LABEL: define void @fn() { +; CHECK-NEXT: ret void +; + ret void +} + +define void @ctor_dso_local_equivalent() { +; CHECK-LABEL: define void @ctor_dso_local_equivalent() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr dso_local_equivalent @fn, ptr [[DST]], align 8 +; CHECK-NEXT: call void @user(ptr [[DST]]) +; CHECK-NEXT: ret void +; +entry: + %dst = alloca ptr, align 8 + store ptr dso_local_equivalent @fn, ptr %dst, align 8 + call void @user(ptr %dst) + ret void +} From f39f53e569f92987683626d910e9dbcbd59ff410 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Wed, 11 Jun 2025 14:11:19 -0700 Subject: [PATCH 0058/1322] [Clang][NFC] Move HeadingAndSpellings to avoid copying (#143611) Static analysis flagged that we could move HeadingAndSpellings and avoid a copy of 
a large object. --- clang/utils/TableGen/ClangAttrEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 21d76c12a3cc..42627f02cf35 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -5405,7 +5405,7 @@ void EmitClangAttrDocs(const RecordKeeper &Records, raw_ostream &OS) { // Handle Undocumented category separately - no content merging if (Cat == "Undocumented" && UndocumentedCategory) { UndocumentedDocs.push_back( - DocumentationData(Doc, Attr, HeadingAndSpellings)); + DocumentationData(Doc, Attr, std::move(HeadingAndSpellings))); continue; } From d7e7f22626f214766f3592341dd1737fd232c6a5 Mon Sep 17 00:00:00 2001 From: "Oleksandr T." Date: Thu, 12 Jun 2025 00:19:25 +0300 Subject: [PATCH 0059/1322] [Clang] fix missing source location for errors in macro-expanded (#143460) Fixes #143216 --- This patch fixes diagnostic locations for tokens from macro expansions. --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Parse/Parser.h | 4 +--- clang/lib/Parse/ParseExprCXX.cpp | 4 ++-- clang/lib/Parse/ParseStmt.cpp | 7 ++++-- clang/lib/Parse/Parser.cpp | 5 +++++ .../test/Parser/macro-expansion-recovery.cpp | 22 +++++++++++++++++++ clang/test/Parser/switch-recovery.cpp | 13 +++++++++++ 7 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 clang/test/Parser/macro-expansion-recovery.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8043ab48f0b4..b42d5f8425af 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -694,6 +694,7 @@ Bug Fixes in This Version - Constant evaluation now correctly runs the destructor of a variable declared in the second clause of a C-style ``for`` loop. (#GH139818) - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. 
(#GH143168) +- Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 0b2fab4a45c9..d99de77a5291 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -290,9 +290,7 @@ public: return ConsumeToken(); } - SourceLocation getEndOfPreviousToken() { - return PP.getLocForEndOfToken(PrevTokLocation); - } + SourceLocation getEndOfPreviousToken() const; /// GetLookAheadToken - This peeks ahead N tokens and returns that token /// without consuming any tokens. LookAhead(0) returns 'Tok', LookAhead(1) diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index d95260829e4a..55ad7f256fa8 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -421,8 +421,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier( // like we never saw it. Token Identifier = Tok; // Stash away the identifier. ConsumeToken(); // Eat the identifier, current token is now '::'. - Diag(PP.getLocForEndOfToken(ConsumeToken()), diag::err_expected) - << tok::identifier; + ConsumeToken(); + Diag(getEndOfPreviousToken(), diag::err_expected) << tok::identifier; UnconsumeToken(Identifier); // Stick the identifier back. Next = NextToken(); // Point Next at the '{' token. 
} diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index c788723023c8..c00759893b0c 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -832,10 +832,13 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx, << "'case'" << tok::colon << FixItHint::CreateReplacement(ColonLoc, ":"); } else { - SourceLocation ExpectedLoc = PP.getLocForEndOfToken(PrevTokLocation); + SourceLocation ExpectedLoc = getEndOfPreviousToken(); + Diag(ExpectedLoc, diag::err_expected_after) << "'case'" << tok::colon - << FixItHint::CreateInsertion(ExpectedLoc, ":"); + << FixItHint::CreateInsertion(ExpectedLoc, + tok::getTokenName(tok::colon)); + ColonLoc = ExpectedLoc; } diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index db65c05cc114..788ed79e0c1f 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1873,6 +1873,11 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC, return AnnotatedNameKind::Unresolved; } +SourceLocation Parser::getEndOfPreviousToken() const { + SourceLocation TokenEndLoc = PP.getLocForEndOfToken(PrevTokLocation); + return TokenEndLoc.isValid() ? 
TokenEndLoc : Tok.getLocation(); +} + bool Parser::TryKeywordIdentFallback(bool DisableKeyword) { assert(Tok.isNot(tok::identifier)); Diag(Tok, diag::ext_keyword_as_ident) diff --git a/clang/test/Parser/macro-expansion-recovery.cpp b/clang/test/Parser/macro-expansion-recovery.cpp new file mode 100644 index 000000000000..6826cc04e4df --- /dev/null +++ b/clang/test/Parser/macro-expansion-recovery.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +namespace GH143216 { +#define A x y +enum { A }; // expected-error {{missing ',' between enumerators}} + +#define B x y +void f() { + int a[2]; + auto [B] = a; // expected-error {{expected ','}} +} + +#define C class D; +D C; // expected-error {{expected unqualified-id}} \ + // expected-error {{expected '>'}} \ + // expected-note {{to match this '<'}} + +#define E F::{ +class F { E }}; // expected-error {{expected identifier}} \ + // expected-error {{expected member name or ';' after declaration specifiers}} +} diff --git a/clang/test/Parser/switch-recovery.cpp b/clang/test/Parser/switch-recovery.cpp index baf703cd03ae..7b3909e3b0d3 100644 --- a/clang/test/Parser/switch-recovery.cpp +++ b/clang/test/Parser/switch-recovery.cpp @@ -229,3 +229,16 @@ void fn1() { } } // expected-error{{expected statement}} } + +namespace GH143216 { +#define FOO 1 case 3: + +int f(int x) { + switch (x) { + case FOO // expected-error {{expected ':' after 'case'}} + return 0; + default: + return 1; + } +} +} From 625bfb7179ad1acab2aba1023095826628275a60 Mon Sep 17 00:00:00 2001 From: Jiachen Yuan Date: Wed, 11 Jun 2025 14:23:41 -0700 Subject: [PATCH 0060/1322] Workaround MSVC Linker Issue when Cross-Compiling for ARM64EC (#143659) This MR presents a temporary workaround for the issue described at https://github.com/llvm/llvm-project/issues/143575. 
While an [upstream MSVC bug](https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross-Compiling-L/10920141) is reported, it makes sense to apply a workaround in LLVM code to quickly unblock anyone affected. --- llvm/include/llvm/IR/Mangler.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h index e3dfe1eac618..232101a8926b 100644 --- a/llvm/include/llvm/IR/Mangler.h +++ b/llvm/include/llvm/IR/Mangler.h @@ -26,7 +26,16 @@ class Triple; class Twine; class raw_ostream; -constexpr std::string_view HybridPatchableTargetSuffix = "$hp_target"; +// TODO: The weird assignment of HybridPatchableTargetSuffix below is a +// temporary workaround for a linker failure that is only hit when compiling +// llvm for arm64ec on windows. The description and context of the issue is at +// https://github.com/llvm/llvm-project/issues/143575. +// An upstream MSVC bug is filed at +// https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross- +// Compiling-L/10920141. +constexpr char HybridPatchableTargetSuffixArr[] = "$hp_target"; +constexpr std::string_view HybridPatchableTargetSuffix = + HybridPatchableTargetSuffixArr; class Mangler { /// We need to give global values the same name every time they are mangled. From 7838fc0cd3fbe578d9554fdcd3198c2ba3616bcc Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 11 Jun 2025 23:24:33 +0200 Subject: [PATCH 0061/1322] [Clang] [NFC] Move diagnostics emitting code from `DiagnosticIDs` into `DiagnosticsEngine` (#143517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes more sense for this functionality to be all in one place rather than split up across two files—at least it caused me a bit of a headache to try and find all places where we were actually forwarding the diagnostic to the `DiagnosticConsumer`. 
Moreover, moving these functions into `DiagnosticsEngine` simplifies the code quite a bit since we access members of `DiagnosticsEngine` more frequently than those of `DiagnosticIDs`. There was also a duplicated code snippet that I’ve moved out into a new function. --- clang/include/clang/Basic/Diagnostic.h | 23 +++--- clang/include/clang/Basic/DiagnosticIDs.h | 12 --- clang/lib/Basic/Diagnostic.cpp | 98 ++++++++++++++++++++--- clang/lib/Basic/DiagnosticIDs.cpp | 97 ---------------------- 4 files changed, 102 insertions(+), 128 deletions(-) diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index e9c54c3c487c..efee8302e750 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -18,6 +18,7 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/UnsignedOrNone.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FunctionExtras.h" @@ -49,6 +50,7 @@ class FileSystem; namespace clang { class DeclContext; +class Diagnostic; class DiagnosticBuilder; class DiagnosticConsumer; class IdentifierInfo; @@ -228,6 +230,8 @@ public: class DiagnosticsEngine : public RefCountedBase { public: /// The level of the diagnostic, after it has been through mapping. + // FIXME: Make this an alias for DiagnosticIDs::Level as soon as + // we can use 'using enum'. enum Level { Ignored = DiagnosticIDs::Ignored, Note = DiagnosticIDs::Note, @@ -532,7 +536,7 @@ private: /// /// This is used to emit continuation diagnostics with the same level as the /// diagnostic that they follow. - DiagnosticIDs::Level LastDiagLevel; + Level LastDiagLevel; /// Number of warnings reported unsigned NumWarnings; @@ -777,18 +781,16 @@ public: /// the middle of another diagnostic. /// /// This can be used by clients who suppress diagnostics themselves. 
- void setLastDiagnosticIgnored(bool Ignored) { - if (LastDiagLevel == DiagnosticIDs::Fatal) + void setLastDiagnosticIgnored(bool IsIgnored) { + if (LastDiagLevel == Fatal) FatalErrorOccurred = true; - LastDiagLevel = Ignored ? DiagnosticIDs::Ignored : DiagnosticIDs::Warning; + LastDiagLevel = IsIgnored ? Ignored : Warning; } /// Determine whether the previous diagnostic was ignored. This can /// be used by clients that want to determine whether notes attached to a /// diagnostic will be suppressed. - bool isLastDiagnosticIgnored() const { - return LastDiagLevel == DiagnosticIDs::Ignored; - } + bool isLastDiagnosticIgnored() const { return LastDiagLevel == Ignored; } /// Controls whether otherwise-unmapped extension diagnostics are /// mapped onto ignore/warning/error. @@ -1024,9 +1026,10 @@ private: /// Used to report a diagnostic that is finally fully formed. /// /// \returns true if the diagnostic was emitted, false if it was suppressed. - bool ProcessDiag(const DiagnosticBuilder &DiagBuilder) { - return Diags->ProcessDiag(*this, DiagBuilder); - } + bool ProcessDiag(const DiagnosticBuilder &DiagBuilder); + + /// Forward a diagnostic to the DiagnosticConsumer. + void Report(Level DiagLevel, const Diagnostic &Info); /// @name Diagnostic Emission /// @{ diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 80d52a0d0111..2b095f0fd674 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -483,18 +483,6 @@ private: Class getDiagClass(unsigned DiagID) const; - /// Used to report a diagnostic that is finally fully formed. - /// - /// \returns \c true if the diagnostic was emitted, \c false if it was - /// suppressed. - bool ProcessDiag(DiagnosticsEngine &Diag, - const DiagnosticBuilder &DiagBuilder) const; - - /// Used to emit a diagnostic that is finally fully formed, - /// ignoring suppression. 
- void EmitDiag(DiagnosticsEngine &Diag, const DiagnosticBuilder &DiagBuilder, - Level DiagLevel) const; - /// Whether the diagnostic may leave the AST in a state where some /// invariants can break. bool isUnrecoverable(unsigned DiagID) const; diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 694224071347..95d86cb153b4 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -130,7 +130,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) { TrapNumErrorsOccurred = 0; TrapNumUnrecoverableErrorsOccurred = 0; - LastDiagLevel = DiagnosticIDs::Ignored; + LastDiagLevel = Ignored; if (!soft) { // Clear state related to #pragma diagnostic. @@ -658,13 +658,95 @@ void DiagnosticsEngine::Report(const StoredDiagnostic &storedDiag) { Level DiagLevel = storedDiag.getLevel(); Diagnostic Info(this, storedDiag.getLocation(), storedDiag.getID(), DiagStorage, storedDiag.getMessage()); + Report(DiagLevel, Info); +} + +void DiagnosticsEngine::Report(Level DiagLevel, const Diagnostic &Info) { + assert(DiagLevel != Ignored && "Cannot emit ignored diagnostics!"); Client->HandleDiagnostic(DiagLevel, Info); if (Client->IncludeInDiagnosticCounts()) { - if (DiagLevel == DiagnosticsEngine::Warning) + if (DiagLevel == Warning) ++NumWarnings; } } +/// ProcessDiag - This is the method used to report a diagnostic that is +/// finally fully formed. +bool DiagnosticsEngine::ProcessDiag(const DiagnosticBuilder &DiagBuilder) { + Diagnostic Info(this, DiagBuilder); + + assert(getClient() && "DiagnosticClient not set!"); + + // Figure out the diagnostic level of this message. + unsigned DiagID = Info.getID(); + Level DiagLevel = getDiagnosticLevel(DiagID, Info.getLocation()); + + // Update counts for DiagnosticErrorTrap even if a fatal error occurred + // or diagnostics are suppressed. 
+ if (DiagLevel >= Error) { + ++TrapNumErrorsOccurred; + if (Diags->isUnrecoverable(DiagID)) + ++TrapNumUnrecoverableErrorsOccurred; + } + + if (SuppressAllDiagnostics) + return false; + + if (DiagLevel != Note) { + // Record that a fatal error occurred only when we see a second + // non-note diagnostic. This allows notes to be attached to the + // fatal error, but suppresses any diagnostics that follow those + // notes. + if (LastDiagLevel == Fatal) + FatalErrorOccurred = true; + + LastDiagLevel = DiagLevel; + } + + // If a fatal error has already been emitted, silence all subsequent + // diagnostics. + if (FatalErrorOccurred) { + if (DiagLevel >= Error && Client->IncludeInDiagnosticCounts()) + ++NumErrors; + + return false; + } + + // If the client doesn't care about this message, don't issue it. If this is + // a note and the last real diagnostic was ignored, ignore it too. + if (DiagLevel == Ignored || (DiagLevel == Note && LastDiagLevel == Ignored)) + return false; + + if (DiagLevel >= Error) { + if (Diags->isUnrecoverable(DiagID)) + UnrecoverableErrorOccurred = true; + + // Warnings which have been upgraded to errors do not prevent compilation. + if (Diags->isDefaultMappingAsError(DiagID)) + UncompilableErrorOccurred = true; + + ErrorOccurred = true; + if (Client->IncludeInDiagnosticCounts()) + ++NumErrors; + + // If we've emitted a lot of errors, emit a fatal error instead of it to + // stop a flood of bogus errors. + if (ErrorLimit && NumErrors > ErrorLimit && DiagLevel == Error) { + Report(diag::fatal_too_many_errors); + return false; + } + } + + // Make sure we set FatalErrorOccurred to ensure that the notes from the + // diagnostic that caused `fatal_too_many_errors` won't be emitted. + if (Info.getID() == diag::fatal_too_many_errors) + FatalErrorOccurred = true; + + // Finally, report it. 
+ Report(DiagLevel, Info); + return true; +} + bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB, bool Force) { assert(getClient() && "DiagnosticClient not set!"); @@ -674,14 +756,12 @@ bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB, Diagnostic Info(this, DB); // Figure out the diagnostic level of this message. - DiagnosticIDs::Level DiagLevel = - Diags->getDiagnosticLevel(Info.getID(), Info.getLocation(), *this); + Level DiagLevel = getDiagnosticLevel(Info.getID(), Info.getLocation()); - Emitted = (DiagLevel != DiagnosticIDs::Ignored); - if (Emitted) { - // Emit the diagnostic regardless of suppression level. - Diags->EmitDiag(*this, DB, DiagLevel); - } + // Emit the diagnostic regardless of suppression level. + Emitted = DiagLevel != Ignored; + if (Emitted) + Report(DiagLevel, Info); } else { // Process the diagnostic, sending the accumulated information to the // DiagnosticConsumer. diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index 3e90b2d80477..dcf0c6cb5428 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -823,103 +823,6 @@ unsigned DiagnosticIDs::getCXXCompatDiagId(const LangOptions &LangOpts, return StdVer >= D.StdVer ? D.DiagId : D.PreDiagId; } -/// ProcessDiag - This is the method used to report a diagnostic that is -/// finally fully formed. -bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag, - const DiagnosticBuilder &DiagBuilder) const { - Diagnostic Info(&Diag, DiagBuilder); - - assert(Diag.getClient() && "DiagnosticClient not set!"); - - // Figure out the diagnostic level of this message. - unsigned DiagID = Info.getID(); - DiagnosticIDs::Level DiagLevel - = getDiagnosticLevel(DiagID, Info.getLocation(), Diag); - - // Update counts for DiagnosticErrorTrap even if a fatal error occurred - // or diagnostics are suppressed. 
- if (DiagLevel >= DiagnosticIDs::Error) { - ++Diag.TrapNumErrorsOccurred; - if (isUnrecoverable(DiagID)) - ++Diag.TrapNumUnrecoverableErrorsOccurred; - } - - if (Diag.SuppressAllDiagnostics) - return false; - - if (DiagLevel != DiagnosticIDs::Note) { - // Record that a fatal error occurred only when we see a second - // non-note diagnostic. This allows notes to be attached to the - // fatal error, but suppresses any diagnostics that follow those - // notes. - if (Diag.LastDiagLevel == DiagnosticIDs::Fatal) - Diag.FatalErrorOccurred = true; - - Diag.LastDiagLevel = DiagLevel; - } - - // If a fatal error has already been emitted, silence all subsequent - // diagnostics. - if (Diag.FatalErrorOccurred) { - if (DiagLevel >= DiagnosticIDs::Error && - Diag.Client->IncludeInDiagnosticCounts()) { - ++Diag.NumErrors; - } - - return false; - } - - // If the client doesn't care about this message, don't issue it. If this is - // a note and the last real diagnostic was ignored, ignore it too. - if (DiagLevel == DiagnosticIDs::Ignored || - (DiagLevel == DiagnosticIDs::Note && - Diag.LastDiagLevel == DiagnosticIDs::Ignored)) - return false; - - if (DiagLevel >= DiagnosticIDs::Error) { - if (isUnrecoverable(DiagID)) - Diag.UnrecoverableErrorOccurred = true; - - // Warnings which have been upgraded to errors do not prevent compilation. - if (isDefaultMappingAsError(DiagID)) - Diag.UncompilableErrorOccurred = true; - - Diag.ErrorOccurred = true; - if (Diag.Client->IncludeInDiagnosticCounts()) { - ++Diag.NumErrors; - } - - // If we've emitted a lot of errors, emit a fatal error instead of it to - // stop a flood of bogus errors. - if (Diag.ErrorLimit && Diag.NumErrors > Diag.ErrorLimit && - DiagLevel == DiagnosticIDs::Error) { - Diag.Report(diag::fatal_too_many_errors); - return false; - } - } - - // Make sure we set FatalErrorOccurred to ensure that the notes from the - // diagnostic that caused `fatal_too_many_errors` won't be emitted. 
- if (Info.getID() == diag::fatal_too_many_errors) - Diag.FatalErrorOccurred = true; - // Finally, report it. - EmitDiag(Diag, DiagBuilder, DiagLevel); - return true; -} - -void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag, - const DiagnosticBuilder &DiagBuilder, - Level DiagLevel) const { - Diagnostic Info(&Diag, DiagBuilder); - assert(DiagLevel != DiagnosticIDs::Ignored && "Cannot emit ignored diagnostics!"); - - Diag.Client->HandleDiagnostic((DiagnosticsEngine::Level)DiagLevel, Info); - if (Diag.Client->IncludeInDiagnosticCounts()) { - if (DiagLevel == DiagnosticIDs::Warning) - ++Diag.NumWarnings; - } -} - bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const { // Only errors may be unrecoverable. if (getDiagClass(DiagID) < CLASS_ERROR) From 6f2ba4712f17d7c82228a5b705570571e13a3832 Mon Sep 17 00:00:00 2001 From: Ian Wood Date: Wed, 11 Jun 2025 14:34:02 -0700 Subject: [PATCH 0062/1322] [mlir] Fix ComposeExpandOfCollapseOp for dynamic case (#142663) Changes `findCollapsingReassociation` to return nullopt in all cases where source shape has `>=2` dynamic dims. `expand(collapse)` can reshape to any valid output shape but a collapse can only collapse contiguous dimensions. When there are `>=2` dynamic dimensions it is impossible to determine if it can be simplified to a collapse or if it is performing a more advanced reassociation. 
This problem was uncovered by https://github.com/llvm/llvm-project/pull/137963 --------- Signed-off-by: Ian Wood --- mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h | 9 ++++++--- mlir/test/Dialect/Tensor/canonicalize.mlir | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h index af575e10acc8..61c2a50e514c 100644 --- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h @@ -387,11 +387,14 @@ private: auto resultSubShape = resultShape.slice(resultIndices.front(), resultIndices.size()); + if (llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2 && + llvm::count_if(resultSubShape, ShapedType::isDynamic) >= 2) + return std::nullopt; + if (srcSubShape.size() == resultSubShape.size()) { - if (srcSubShape != resultSubShape || - llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2) { + if (srcSubShape != resultSubShape) return std::nullopt; - } + for (auto index : llvm::seq(0, srcSubShape.size())) { composedReassociation.emplace_back(1, srcIndices.front() + index); } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 65c5b3e8602e..67b03b0a3485 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -1272,6 +1272,20 @@ func.func @compose_expand_of_collapse_dynamic(%arg0 : tensor<4x?x10x64x2xf16>, % // ----- +func.func @no_compose_collapse_of_expand_dynamic(%arg0 : tensor, %arg1: index) -> tensor { + %collapse = tensor.collapse_shape %arg0 [[0, 1, 2, 3]] : tensor into tensor + %expanded_19 = tensor.expand_shape %collapse [[0, 1, 2]] output_shape [%arg1, 8, %arg1] : tensor into tensor + return %expanded_19 : tensor +} +// CHECK-LABEL: func @no_compose_collapse_of_expand_dynamic +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[ARG1:.+]]: index +// CHECK: %[[COLLAPSE:.+]] = 
tensor.collapse_shape %[[ARG0]] +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]] +// CHECK: return %[[EXPAND]] + +// ----- + // CHECK-LABEL: func @zero_rank_reshape_multi func.func @zero_rank_reshape_multi(%arg0: tensor) -> tensor { // CHECK: return %arg0 From 9c9a4a284e95ea5e27617af7235e3ab049bae680 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Wed, 11 Jun 2025 14:54:30 -0700 Subject: [PATCH 0063/1322] [LOH] Don't emit AdrpAddStr when register could be clobbered (#142849) https://github.com/llvm/llvm-project/commit/b783aa89795635cbe7b25b4143b562931fcec9f6 added a check to ensure an `AdrpAddLdr` LOH isn't created when there is an instruction between the `add` and `ldr` https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp#L419-L431 We need a similar check for `AdrpAddStr`. Although this technically isn't implemented in LLD, it could be in the future. https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/lld/MachO/Arch/ARM64.cpp#L699-L702 --- llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 37 +++++++++++------- .../AArch64/loh-adrp-add-ldr-clobber.mir | 39 +++++++++++++------ 2 files changed, 49 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index 53e8e438c5e5..064716216d1c 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -247,6 +247,17 @@ static bool supportLoadFromLiteral(const MachineInstr &MI) { } } +/// Returns \p true if there are no non-debug instructions between \p First and +/// \p Second +static bool areInstructionsConsecutive(const MachineInstr *First, + const MachineInstr *Second) { + auto It = First->getIterator(); + auto EndIt = First->getParent()->instr_end(); + if (It == EndIt) + return false; + return next_nodbg(It, EndIt) == Second->getIterator(); +} + /// Number of GPR 
registers tracked by mapRegToGPRIndex() static const unsigned N_GPR_REGS = 31; /// Map register number to index from 0-30. @@ -415,7 +426,7 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, ++NumADRPToLDR; } break; - case MCLOH_AdrpAddLdr: { + case MCLOH_AdrpAddLdr: // There is a possibility that the linker may try to rewrite: // adrp x0, @sym@PAGE // add x1, x0, @sym@PAGEOFF @@ -432,28 +443,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, // FIXME: Implement proper liveness tracking for all registers. For now, // don't emit the LOH if there are any instructions between the add and // the ldr. - MachineInstr *AddMI = const_cast(Info.MI1); - const MachineInstr *LdrMI = Info.MI0; - auto AddIt = MachineBasicBlock::iterator(AddMI); - auto EndIt = AddMI->getParent()->end(); - if (AddMI->getIterator() == EndIt || LdrMI != &*next_nodbg(AddIt, EndIt)) + if (!areInstructionsConsecutive(Info.MI1, Info.MI0)) break; - LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t' << *Info.MI1 << '\t' << *Info.MI0); AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0}); ++NumADDToLDR; break; - } case MCLOH_AdrpAddStr: - if (Info.MI1 != nullptr) { - LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" - << '\t' << MI << '\t' << *Info.MI1 << '\t' - << *Info.MI0); - AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); - ++NumADDToSTR; - } + if (!Info.MI1) + break; + if (!areInstructionsConsecutive(Info.MI1, Info.MI0)) + break; + LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" + << '\t' << MI << '\t' << *Info.MI1 << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToSTR; break; case MCLOH_AdrpLdrGotLdr: LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" diff --git a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir index ce2d8f02f4cc..a1d8bf375a19 100644 --- 
a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir +++ b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir @@ -1,16 +1,34 @@ -# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s +# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s --implicit-check-not=MCLOH_ # REQUIRES: asserts + +# Check that we don't emit LOHs when there is a clobbering def of x8. --- | @sym2 = local_unnamed_addr global [10000000 x i32] zeroinitializer, align 8 @sym = local_unnamed_addr global i32 zeroinitializer, align 8 - define i32 @main() { - ret i32 0 - } - + define i32 @adrp_add_ldr() { ret i32 0 } + define i32 @adrp_add_str() { ret i32 0 } ... + --- -name: main +name: adrp_add_ldr +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x21', virtual-reg: '' } +body: | + bb.0: + liveins: $x21 + renamable $x8 = ADRP target-flags(aarch64-page) @sym + renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0 + renamable $x8 = ADDXri killed renamable $x21, 1, 0 + $x9 = LDRXui $x9, 0 + + RET undef $lr +... + +--- +name: adrp_add_str alignment: 4 tracksRegLiveness: true liveins: @@ -19,13 +37,10 @@ liveins: body: | bb.0: liveins: $x21, $x22 - ; Check we don't emit an loh here because there's a clobbering def of x8 before the ldr. - ; CHECK-LABEL: main - ; CHECK-NOT: MCLOH_AdrpAddLdr renamable $x8 = ADRP target-flags(aarch64-page) @sym renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0 - renamable $x8 = ADDXri killed renamable $x22, 1, 0 - $x9 = LDRXui $x9, 0 - RET undef $lr + renamable $x8 = ADDXri killed renamable $x21, 1, 0 + STRXui $x22, $x9, 0 + RET undef $lr ... 
From 74172add65aa14e77e98b048db0074c3f273057f Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 11 Jun 2025 18:18:22 -0400 Subject: [PATCH 0064/1322] [mlir][generate-test-checks] Do not emit the autogenerated note if it exists (#143750) Prior to this PR, the script removed the already existing autogenerated note if we came across a line that was equal to the note. But the default note is multiple lines, so there would never be a match. Instead, check to see if the current line is a substring of the autogenerated note. Co-authored-by: Michael Maitland --- mlir/utils/generate-test-checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index 11fb4e40072e..f77c9688d931 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -208,7 +208,7 @@ def process_source_lines(source_lines, note, args): source_segments = [[]] for line in source_lines: # Remove previous note. - if line == note: + if line in note: continue # Remove previous CHECK lines. if line.find(args.check_prefix) != -1: From 0e457315f55889878ccbc3e35d4beb04e277733f Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 11 Jun 2025 18:19:15 -0400 Subject: [PATCH 0065/1322] [mlir][generate-test-checks] Emit attributes with rest of CHECK lines (#143759) Prior to this patch, generating test checks in place put the ATTR definitions at the very top of the file, above the RUN lines and autogenerated note. All CHECK lines should be below the RUN lines and autogenerated note. This change ensures that the attribute definitions are emitted with the rest of the CHECK lines. 
--------- Co-authored-by: Michael Maitland --- mlir/utils/generate-test-checks.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index f77c9688d931..14a790e6d0e6 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -220,12 +220,19 @@ def process_source_lines(source_lines, note, args): source_segments[-1].append(line + "\n") return source_segments -def process_attribute_definition(line, attribute_namer, output): + +def process_attribute_definition(line, attribute_namer): m = ATTR_DEF_RE.match(line) if m: attribute_name = attribute_namer.generate_name(m.group(1)) - line = '// CHECK: #[[' + attribute_name + ':.+]] =' + line[len(m.group(0)):] + '\n' - output.write(line) + return ( + "// CHECK: #[[" + + attribute_name + + ":.+]] =" + + line[len(m.group(0)) :] + + "\n" + ) + return None def process_attribute_references(line, attribute_namer): @@ -340,6 +347,9 @@ def main(): variable_namer = VariableNamer(args.variable_names) attribute_namer = AttributeNamer(args.attribute_names) + # Store attribute definitions to emit at appropriate scope + pending_attr_defs = [] + # Process lines for input_line in input_lines: if not input_line: @@ -350,8 +360,9 @@ def main(): if input_line.startswith("// -----"): continue - # Check if this is an attribute definition and process it - process_attribute_definition(input_line, attribute_namer, output) + if ATTR_DEF_RE.match(input_line): + pending_attr_defs.append(input_line) + continue # Lines with blocks begin with a ^. These lines have a trailing comment # that needs to be stripped. 
@@ -407,6 +418,13 @@ def main(): output_line += process_line(ssa_split[1:], variable_namer) else: + # Emit any pending attribute definitions at the start of this scope + for attr in pending_attr_defs: + attr_line = process_attribute_definition(attr, attribute_namer) + if attr_line: + output_segments[-1].append(attr_line) + pending_attr_defs.clear() + # Output the first line chunk that does not contain an SSA name for the # label. output_line = "// " + args.check_prefix + "-LABEL: " + ssa_split[0] + "\n" From ee35e342945d6825c9b2b004fd135cf16c84ea0e Mon Sep 17 00:00:00 2001 From: Nikolay Panchenko Date: Wed, 11 Jun 2025 19:00:29 -0400 Subject: [PATCH 0066/1322] [ConstantFolding] Add folding for [de]interleave2, insert and extract (#141301) The change adds folding for 4 vector intrinsics: `interleave2`, `deinterleave2`, `vector_extract` and `vector_insert`. For the last 2 intrinsics the change does not use `ShuffleVector` fold mechanism as it's much simpler to construct result vector explicitly. 
--- llvm/lib/Analysis/ConstantFolding.cpp | 97 +++++++++++++++++++ .../InstSimplify/ConstProp/vector-calls.ll | 68 +++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 1ef0badd2375..139a0b81e299 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1635,6 +1635,10 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_umin: case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_extract: + case Intrinsic::vector_insert: + case Intrinsic::vector_interleave2: + case Intrinsic::vector_deinterleave2: // Target intrinsics case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_wave_reduce_umin: @@ -3758,6 +3762,72 @@ static Constant *ConstantFoldFixedVectorCall( } return nullptr; } + case Intrinsic::vector_extract: { + auto *Idx = dyn_cast(Operands[1]); + Constant *Vec = Operands[0]; + if (!Idx || !isa(Vec->getType())) + return nullptr; + + unsigned NumElements = FVTy->getNumElements(); + unsigned VecNumElements = + cast(Vec->getType())->getNumElements(); + unsigned StartingIndex = Idx->getZExtValue(); + + // Extracting entire vector is nop + if (NumElements == VecNumElements && StartingIndex == 0) + return Vec; + + for (unsigned I = StartingIndex, E = StartingIndex + NumElements; I < E; + ++I) { + Constant *Elt = Vec->getAggregateElement(I); + if (!Elt) + return nullptr; + Result[I - StartingIndex] = Elt; + } + + return ConstantVector::get(Result); + } + case Intrinsic::vector_insert: { + Constant *Vec = Operands[0]; + Constant *SubVec = Operands[1]; + auto *Idx = dyn_cast(Operands[2]); + if (!Idx || !isa(Vec->getType())) + return nullptr; + + unsigned SubVecNumElements = + cast(SubVec->getType())->getNumElements(); + unsigned VecNumElements = + 
cast(Vec->getType())->getNumElements(); + unsigned IdxN = Idx->getZExtValue(); + // Replacing entire vector with a subvec is nop + if (SubVecNumElements == VecNumElements && IdxN == 0) + return SubVec; + + for (unsigned I = 0; I < VecNumElements; ++I) { + Constant *Elt; + if (I < IdxN + SubVecNumElements) + Elt = SubVec->getAggregateElement(I - IdxN); + else + Elt = Vec->getAggregateElement(I); + if (!Elt) + return nullptr; + Result[I] = Elt; + } + return ConstantVector::get(Result); + } + case Intrinsic::vector_interleave2: { + unsigned NumElements = + cast(Operands[0]->getType())->getNumElements(); + for (unsigned I = 0; I < NumElements; ++I) { + Constant *Elt0 = Operands[0]->getAggregateElement(I); + Constant *Elt1 = Operands[1]->getAggregateElement(I); + if (!Elt0 || !Elt1) + return nullptr; + Result[2 * I] = Elt0; + Result[2 * I + 1] = Elt1; + } + return ConstantVector::get(Result); + } default: break; } @@ -3919,6 +3989,33 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID, return nullptr; return ConstantStruct::get(StTy, SinResult, CosResult); } + case Intrinsic::vector_deinterleave2: { + auto *Vec = dyn_cast(Operands[0]); + if (!Vec) + return nullptr; + + auto *VecTy = cast(Vec->getType()); + unsigned NumElements = VecTy->getElementCount().getKnownMinValue() / 2; + if (isa(Vec)) { + auto *HalfVecTy = VectorType::getHalfElementsVectorType(VecTy); + return ConstantStruct::get(StTy, ConstantAggregateZero::get(HalfVecTy), + ConstantAggregateZero::get(HalfVecTy)); + } + if (isa(Vec->getType())) { + SmallVector Res0(NumElements), Res1(NumElements); + for (unsigned I = 0; I < NumElements; ++I) { + Constant *Elt0 = Vec->getAggregateElement(2 * I); + Constant *Elt1 = Vec->getAggregateElement(2 * I + 1); + if (!Elt0 || !Elt1) + return nullptr; + Res0[I] = Elt0; + Res1[I] = Elt1; + } + return ConstantStruct::get(StTy, ConstantVector::get(Res0), + ConstantVector::get(Res1)); + } + return nullptr; + } default: // TODO: Constant folding of vector 
intrinsics that fall through here does // not work (e.g. overflow intrinsics) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll new file mode 100644 index 000000000000..9dbe3d4e50ee --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instsimplify,verify -S | FileCheck %s + +define <3 x i32> @fold_vector_extract() { +; CHECK-LABEL: define <3 x i32> @fold_vector_extract() { +; CHECK-NEXT: ret <3 x i32> +; + %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> , i64 3) + ret <3 x i32> %1 +} + +@a = external global i16, align 1 + +define <3 x i32> @fold_vector_extract_constexpr() { +; CHECK-LABEL: define <3 x i32> @fold_vector_extract_constexpr() { +; CHECK-NEXT: ret <3 x i32> +; + %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> , i64 0) + ret <3 x i32> %1 +} + +define <8 x i32> @fold_vector_extract_nop() { +; CHECK-LABEL: define <8 x i32> @fold_vector_extract_nop() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> , i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @fold_vector_insert() { +; CHECK-LABEL: define <8 x i32> @fold_vector_insert() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> , <4 x i32> , i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @fold_vector_insert_nop() { +; CHECK-LABEL: define <8 x i32> @fold_vector_insert_nop() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> , <8 x i32> , i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @fold_vector_interleave2() { +; CHECK-LABEL: define <8 x i32> @fold_vector_interleave2() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> , <4 x i32> ) + ret <8 x i32> %1 +} 
+ +define {<4 x i32>, <4 x i32>} @fold_vector_deinterleave2() { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @fold_vector_deinterleave2() { +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } { <4 x i32> , <4 x i32> } +; + %1 = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<8 x i32> ) + ret {<4 x i32>, <4 x i32>} %1 +} + +define {, } @fold_scalable_vector_deinterleave2() { +; CHECK-LABEL: define { , } @fold_scalable_vector_deinterleave2() { +; CHECK-NEXT: ret { , } zeroinitializer +; + %1 = call {, } @llvm.vector.deinterleave2.v4i32.v8i32( zeroinitializer) + ret {, } %1 +} From dc4335a2bf75c7b9928a72a7f15df0276120d7ed Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 11 Jun 2025 18:22:05 -0500 Subject: [PATCH 0067/1322] [libc] Perform bitfield zero initialization wave-parallel (#143607) Summary: We need to set the bitfield memory to zero because the system does not guarantee zeroed out memory. Even if fresh pages are zero, the system allows re-use so we would need a `kfd` level API to skip this step. Because we can't, this patch updates the logic to perform the zero initialization wave-parallel. This reduces the amount of time it takes to allocate a fresh slab by up to a tenth. This has the unfortunate side effect that the control flow is more convoluted and we waste some extra registers, but it's worth it to reduce the slab allocation latency. --- libc/src/__support/GPU/allocator.cpp | 46 +++++++++++++++++++++------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp index ecc0de1cb6ec..66ab155e5c29 100644 --- a/libc/src/__support/GPU/allocator.cpp +++ b/libc/src/__support/GPU/allocator.cpp @@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) { return (x + N) & ~(N - 1); } +// Perform a lane parallel memset on a uint32_t pointer. 
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) { + uint64_t mask = gpu::get_lane_mask(); + uint32_t workers = cpp::popcount(uniform); + for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers) + s[i] = c; +} + } // namespace impl /// A slab allocator used to hand out identically sized slabs of memory. @@ -157,10 +165,15 @@ struct Slab { Header *header = reinterpret_cast
(memory); header->chunk_size = chunk_size; header->global_index = global_index; + } - // This memset is expensive and likely not necessary for the current 'kfd' - // driver. Until zeroed pages are exposed by the API we must be careful. - __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size)); + // Set the necessary bitfield bytes to zero in parallel using many lanes. This + // must be called before the bitfield can be accessed safely, memory is not + // guaranteed to be zero initialized in the current implementation. + void initialize(uint64_t uniform) { + uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) / + sizeof(uint32_t); + impl::uniform_memset(get_bitfield(), 0, size, uniform); } // Get the number of chunks that can theoretically fit inside this slab. @@ -354,14 +367,7 @@ private: void *raw = impl::rpc_allocate(sizeof(Slab)); if (!raw) return nullptr; - Slab *mem = new (raw) Slab(cpp::forward(args)...); - - cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); - ptr.store(mem, cpp::MemoryOrder::RELAXED); - cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); - if (!ref.acquire(n, count)) - ref.reset(n, count); - return mem; + return new (raw) Slab(cpp::forward(args)...); } if (!expected || expected == reinterpret_cast(SENTINEL)) @@ -374,6 +380,16 @@ private: return ptr.load(cpp::MemoryOrder::RELAXED); } + // Finalize the associated memory and signal that it is ready to use by + // resetting the counter. + void finalize(Slab *mem, uint32_t n, uint64_t &count) { + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + ptr.store(mem, cpp::MemoryOrder::RELAXED); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); + if (!ref.acquire(n, count)) + ref.reset(n, count); + } + public: // Attempt to lock access to the pointer, potentially creating it if empty. // The uniform mask represents which lanes share the same pointer. 
For each @@ -392,6 +408,14 @@ public: if (!result) return nullptr; + // We defer storing the newly allocated slab until now so that we can use + // multiple lanes to initialize it and release it for use. + if (count == cpp::numeric_limits::max()) { + result->initialize(uniform); + if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) + finalize(result, cpp::popcount(uniform), count); + } + if (count != cpp::numeric_limits::max()) count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1; From 1ecd108cb7ceda2b11281b5d173e2827feb60c55 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 11 Jun 2025 16:22:17 -0700 Subject: [PATCH 0068/1322] [libc] Migrate stdio tests to ErrnoCheckingTest. (#143802) Reduce the direct use of libc_errno in stdio unit tests by adopting ErrnoCheckingTest where appropriate. Also removes the libc_errno.h inclusions from stdlib.h tests that were accidentally added in d87eea35fac5a34a841c637db8908128409a184e --- libc/test/src/stdio/CMakeLists.txt | 10 ++++++++++ libc/test/src/stdio/fdopen_test.cpp | 14 ++++++-------- libc/test/src/stdio/fgetc_test.cpp | 5 ++--- libc/test/src/stdio/fgetc_unlocked_test.cpp | 5 ++--- libc/test/src/stdio/fgets_test.cpp | 6 +++--- libc/test/src/stdio/fileop_test.cpp | 20 +++++--------------- libc/test/src/stdio/fopencookie_test.cpp | 15 +++++++-------- libc/test/src/stdio/remove_test.cpp | 10 +++++----- libc/test/src/stdio/rename_test.cpp | 9 +++++---- libc/test/src/stdio/setvbuf_test.cpp | 8 ++++---- libc/test/src/stdio/unlocked_fileop_test.cpp | 7 +++---- libc/test/src/stdlib/StrtolTest.h | 1 - libc/test/src/stdlib/strtold_test.cpp | 1 - 13 files changed, 52 insertions(+), 59 deletions(-) diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 01904a30504e..3627006ec28f 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,6 +20,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite 
+ libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -68,6 +69,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -88,6 +90,7 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -109,6 +112,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -426,6 +430,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -440,6 +445,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -456,6 +462,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -476,6 +483,7 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -498,6 +506,7 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -515,6 +524,7 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index 104fc478b100..b53184c30be3 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,20 +9,21 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" 
#include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { +using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { ASSERT_ERRNO_SUCCESS(); } -TEST(LlvmLibcStdioFdopenTest, InvalidFd) { - libc_errno = 0; +TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) { ASSERT_TRUE(nullptr == fp); } -TEST(LlvmLibcStdioFdopenTest, InvalidMode) { - libc_errno = 0; +TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); - libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 56bde5f0099a..7c652f666a8f 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -14,12 +14,12 @@ #include 
"src/stdio/fopen.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -33,7 +33,6 @@ public: // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 90429ecf4e82..f4471dd82df1 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -17,12 +17,12 @@ #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc_unlocked.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -36,7 +36,6 @@ public: // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index abed3d405293..c00a9256af52 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -12,11 +12,12 @@ #include "src/stdio/fgets.h" #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { +TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -35,7 +36,6 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e624181c795b..e097785832d5 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -17,17 +17,18 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; -TEST(LlvmLibcFILETest, SimpleFileOperations) { +TEST_F(LlvmLibcFILETest, SimpleFileOperations) { 
constexpr char FILENAME[] = "testdata/simple_operations.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); - libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. 
- // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); - // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); - // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); } -TEST(LlvmLibcFILETest, FFlush) { +TEST_F(LlvmLibcFILETest, FFlush) { constexpr char FILENAME[] = "testdata/fflush.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); @@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) { ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); } -TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { +TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { using MyStruct = struct { char c; unsigned long long i; @@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; - libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index 03e1ac286b64..bcf5e674141a 100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -15,6 +15,7 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/MemoryMatcher.h" #include "test/UnitTest/Test.h" @@ -22,6 +23,7 @@ #include "hdr/types/size_t.h" #include "src/__support/libc_errno.h" +using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using MemoryView = LIBC_NAMESPACE::testing::MemoryView; struct StringStream { @@ -88,7 +90,7 @@ int close_ss(void *cookie) { constexpr cookie_io_functions_t STRING_STREAM_FUNCS = 
{&read_ss, &write_ss, &seek_ss, &close_ss}; -TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { constexpr char CONTENT[] = "Hello,readonly!"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(CONTENT))); @@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { size_t INIT_BUFSIZE = 32; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(INIT_BUFSIZE)); @@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { constexpr char INITIAL_CONTENT[] = "1234567890987654321"; constexpr char WRITE_DATA[] = "append"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); @@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { 
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { const char INITIAL_CONTENT[] = "1234567890987654321"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(INITIAL_CONTENT))); @@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) { constexpr char WRITE_DATA[] = "hello, file"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(WRITE_DATA))); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 84984e26398c..296bff1f5dc1 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -11,16 +11,17 @@ #include "src/sys/stat/mkdirat.h" #include "src/unistd/access.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" #include -TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { +using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to create a file and remove it, and also verify that // it was removed. - libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT)); } -TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { +TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. 
- libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index ac494a4ecaf8..135fb98c07fb 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,18 +8,19 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcRenameTest, CreateAndRenameFile) { +using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. 
- libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT)); } -TEST(LlvmLibcRenameTest, RenameNonExistent) { +TEST_F(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; constexpr const char *FILENAME1 = "rename.test.file1"; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 5872943c1bb4..4144bc1bef44 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -14,9 +14,10 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { +using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read @@ -52,7 +53,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr)); } -TEST(LlvmLibcSetvbufTest, SetLBFBuffer) { +TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a LBF buffer to the write handle. 
Since it is LBF, the data // written using the write handle should be available right after a '\n' is @@ -102,6 +103,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index 5d482b70064b..e99b382d1211 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -15,11 +15,12 @@ #include "src/stdio/fread_unlocked.h" #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite_unlocked.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { +TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); @@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 3eeccc5727e7..03f0a6539c78 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,7 +9,6 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include 
"src/__support/ctype_utils.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index c2f2b9c9a11c..eb4056dc7ba6 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" From 3c7af175e51c3ab08ac3c442146c2b822f38c01e Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 11 Jun 2025 16:52:21 -0700 Subject: [PATCH 0069/1322] [libc] Fix stdio tests after #143802 (#143810) In #143802 the stdio test cleanup missed a few places where errno was being set to a failing value, and one where the framework needed to included. --- libc/docs/configure.rst | 2 +- libc/test/src/stdio/fgetc_test.cpp | 1 + libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 + libc/test/src/stdio/fgets_test.cpp | 1 + libc/test/src/stdio/setvbuf_test.cpp | 1 + 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 8d53390ae19b..109412225634 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -29,7 +29,7 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack. - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience. 
* **"errno" options** - - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM. + - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 7c652f666a8f..1faa49112fb6 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -33,6 +33,7 @@ public: // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index f4471dd82df1..7b2efe642fb5 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -36,6 +36,7 @@ public: // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index c00a9256af52..2d7c68d49081 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -36,6 +36,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 4144bc1bef44..a0936ba79ef7 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -11,6 +11,7 @@ #include "src/stdio/fread.h" #include "src/stdio/fwrite.h" #include "src/stdio/setvbuf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" From 6c72084a578a7a1e4dc1013a1a4a30b72ad5c6ab Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 16:56:37 -0700 Subject: [PATCH 0070/1322] [bazel] port 1ecd108cb7ceda2b11281b5d173e2827feb60c55 --- utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel index 484d3e5e0a24..505b73fd7711 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel @@ -122,6 +122,7 @@ libc_test( "//libc:mkdirat", "//libc:open", "//libc:remove", + "//libc/test/UnitTest:errno_test_helpers", ], ) From bc7ea63e9c885fbe71dec29581a206bc0543d22a Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Jun 2025 20:04:27 -0400 Subject: [PATCH 0071/1322] [MemCpyOpt] handle memcpy from memset for non-constant sizes (#143727) Allows forwarding memset to memcpy for mismatching unknown sizes if overread has undef contents. In that case we can refine the undef bytes to the memset value. Refs #140954 which laid some of the groundwork for this. 
--- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 38 +++++++++---------- .../MemCpyOpt/variable-sized-memset-memcpy.ll | 6 +-- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 960001bf880c..1c4ec6aa08b4 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1440,7 +1440,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, int64_t MOffset = 0; const DataLayout &DL = MemCpy->getModule()->getDataLayout(); // We can only transforms memcpy's where the dest of one is the source of the - // other, or the memory transfer has a known offset from the memset. + // other, or they have a known offset. if (MemCpy->getSource() != MemSet->getDest()) { std::optional Offset = MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL); @@ -1451,28 +1451,28 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, if (MOffset != 0 || MemSetSize != CopySize) { // Make sure the memcpy doesn't read any more than what the memset wrote, - // other than undef. Don't worry about sizes larger than i64. A known memset - // size is required. + // other than undef. Don't worry about sizes larger than i64. auto *CMemSetSize = dyn_cast(MemSetSize); - if (!CMemSetSize) - return false; - - // A known memcpy size is also required. auto *CCopySize = dyn_cast(CopySize); - if (!CCopySize) - return false; - if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) { + if (!CMemSetSize || !CCopySize || + CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) { if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA)) return false; - // Clip the memcpy to the bounds of the memset - if (MOffset == 0) - CopySize = MemSetSize; - else - CopySize = - ConstantInt::get(CopySize->getType(), - CMemSetSize->getZExtValue() <= (uint64_t)MOffset - ? 
0 - : CMemSetSize->getZExtValue() - MOffset); + + if (CMemSetSize && CCopySize) { + // If both have constant sizes and offsets, clip the memcpy to the + // bounds of the memset if applicable. + assert(CCopySize->getZExtValue() + MOffset > + CMemSetSize->getZExtValue()); + if (MOffset == 0) + CopySize = MemSetSize; + else + CopySize = + ConstantInt::get(CopySize->getType(), + CMemSetSize->getZExtValue() <= (uint64_t)MOffset + ? 0 + : CMemSetSize->getZExtValue() - MOffset); + } } } diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll index d5b1ab9b2f29..4b44f8b44f74 100644 --- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll @@ -19,12 +19,12 @@ define void @test(ptr %src, i8 %c, i64 %size) { } ; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca -define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) { -; CHECK-LABEL: @negative_test( +define void @dynsize_test(ptr %src, i8 %c, i64 %size1, i64 %size2) { +; CHECK-LABEL: @dynsize_test( ; CHECK-NEXT: [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1 ; CHECK-NEXT: [[DST2:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST1]], i8 [[C:%.*]], i64 [[SIZE1]], i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[DST1]], i64 [[SIZE2]], i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST2]], i8 [[C]], i64 [[SIZE2]], i1 false) ; CHECK-NEXT: ret void ; %dst1 = alloca i8, i64 %size1 From d7c6cad744bc7ed28535dc6f75629902eda559ea Mon Sep 17 00:00:00 2001 From: Jake Egan Date: Wed, 11 Jun 2025 20:22:15 -0400 Subject: [PATCH 0072/1322] [sanitizer_common] Implement interception on AIX (#138606) Adjust AIX interceptor support in sanitizer_common. 
Issue: https://github.com/llvm/llvm-project/issues/138916 --- .../sanitizer_common_interceptors.inc | 43 ++++++++----- .../sanitizer_common_interceptors_ioctl.inc | 2 + ...izer_common_interceptors_memintrinsics.inc | 8 ++- .../sanitizer_platform_interceptors.h | 61 +++++++++++-------- .../sanitizer_redefine_builtins.h | 2 +- 5 files changed, 73 insertions(+), 43 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 9272e2ab6cbd..2d6cf7fc3282 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -481,7 +481,8 @@ INTERCEPTOR(char*, textdomain, const char *domainname) { #endif #if SANITIZER_INTERCEPT_STRCMP || SANITIZER_INTERCEPT_MEMCMP -static inline int CharCmpX(unsigned char c1, unsigned char c2) { +[[maybe_unused]] static inline int CharCmpX(unsigned char c1, + unsigned char c2) { return (c1 == c2) ? 0 : (c1 < c2) ? -1 : 1; } #endif @@ -1350,7 +1351,8 @@ INTERCEPTOR(unsigned long, time, unsigned long *t) { #if SANITIZER_INTERCEPT_LOCALTIME_AND_FRIENDS static void unpoison_tm(void *ctx, __sanitizer_tm *tm) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, tm, sizeof(*tm)); -#if !SANITIZER_SOLARIS +// AIX tm struct does not have tm_zone field. +# if !SANITIZER_SOLARIS && !SANITIZER_AIX if (tm->tm_zone) { // Can not use COMMON_INTERCEPTOR_WRITE_RANGE here, because tm->tm_zone // can point to shared memory and tsan would report a data race. 
@@ -1735,10 +1737,12 @@ INTERCEPTOR(int, __vsprintf_chk, char *str, int flag, SIZE_T size_to, VSPRINTF_INTERCEPTOR_IMPL(vsprintf, str, format, ap) #endif +# if SANITIZER_INTERCEPT_VASPRINTF INTERCEPTOR(int, vasprintf, char **strp, const char *format, va_list ap) VASPRINTF_INTERCEPTOR_IMPL(vasprintf, strp, format, ap) +# endif -#if SANITIZER_INTERCEPT_ISOC99_PRINTF +# if SANITIZER_INTERCEPT_ISOC99_PRINTF INTERCEPTOR(int, __isoc99_vprintf, const char *format, va_list ap) VPRINTF_INTERCEPTOR_IMPL(__isoc99_vprintf, format, ap) @@ -1787,10 +1791,12 @@ INTERCEPTOR(int, __snprintf_chk, char *str, SIZE_T size, int flag, FORMAT_INTERCEPTOR_IMPL(__snprintf_chk, vsnprintf, str, size, format) #endif +# if SANITIZER_INTERCEPT_ASPRINTF INTERCEPTOR(int, asprintf, char **strp, const char *format, ...) FORMAT_INTERCEPTOR_IMPL(asprintf, vasprintf, strp, format) +# endif -#if SANITIZER_INTERCEPT_ISOC99_PRINTF +# if SANITIZER_INTERCEPT_ISOC99_PRINTF INTERCEPTOR(int, __isoc99_printf, const char *format, ...) 
FORMAT_INTERCEPTOR_IMPL(__isoc99_printf, __isoc99_vprintf, format) @@ -1811,17 +1817,24 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_snprintf, __isoc99_vsnprintf, str, size, #endif // SANITIZER_INTERCEPT_PRINTF #if SANITIZER_INTERCEPT_PRINTF -#define INIT_PRINTF \ - COMMON_INTERCEPT_FUNCTION_LDBL(printf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(sprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(snprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(asprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(fprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf); +# define INIT_PRINTF_COMMON \ + COMMON_INTERCEPT_FUNCTION_LDBL(printf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(sprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(snprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(fprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf); +# if !SANITIZER_AIX +// AIX does not have [v]asprintf. 
+# define INIT_PRINTF_EXTRA \ + COMMON_INTERCEPT_FUNCTION_LDBL(asprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf); +# else +# define INIT_PRINTF_EXTRA +# endif +# define INIT_PRINTF INIT_PRINTF_COMMON INIT_PRINTF_EXTRA #else #define INIT_PRINTF #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc index bc8f02826c61..08c2be47f535 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc @@ -79,7 +79,9 @@ static void ioctl_table_fill() { _(TIOCMSET, READ, sizeof(int)); _(TIOCNXCL, NONE, 0); _(TIOCOUTQ, WRITE, sizeof(int)); +# if !SANITIZER_AIX _(TIOCSCTTY, NONE, 0); +# endif _(TIOCSPGRP, READ, pid_t_sz); _(TIOCSWINSZ, READ, struct_winsize_sz); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc index 1565a494140f..0b6731c89950 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc @@ -33,11 +33,13 @@ // Platform-specific options. 
#if SANITIZER_APPLE -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 #elif SANITIZER_WINDOWS64 -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +#elif SANITIZER_AIX +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 #else -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1 +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1 #endif // SANITIZER_APPLE #ifndef COMMON_INTERCEPTOR_MEMSET_IMPL diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 4bc55d7801db..ccc808b60ca7 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -141,6 +141,12 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SI_SOLARIS 0 #endif +#if SANITIZER_AIX +# define SI_NOT_AIX 0 +#else +# define SI_NOT_AIX 1 +#endif + #if SANITIZER_SOLARIS32 #define SI_SOLARIS32 1 #else @@ -161,20 +167,20 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_STRLEN SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRNLEN (SI_NOT_MAC && SI_NOT_FUCHSIA) -#define SANITIZER_INTERCEPT_STRCMP SI_NOT_FUCHSIA +#define SANITIZER_INTERCEPT_STRCMP (SI_NOT_FUCHSIA && SI_NOT_AIX) #define SANITIZER_INTERCEPT_STRSTR SI_NOT_FUCHSIA -#define SANITIZER_INTERCEPT_STRCASESTR SI_POSIX +#define SANITIZER_INTERCEPT_STRCASESTR (SI_POSIX && SI_NOT_AIX) #define SANITIZER_INTERCEPT_STRTOK SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRCHR SI_NOT_FUCHSIA -#define SANITIZER_INTERCEPT_STRCHRNUL SI_POSIX_NOT_MAC +#define SANITIZER_INTERCEPT_STRCHRNUL (SI_POSIX_NOT_MAC && SI_NOT_AIX) #define SANITIZER_INTERCEPT_STRRCHR SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRSPN SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRPBRK SI_NOT_FUCHSIA #define 
SANITIZER_INTERCEPT_TEXTDOMAIN SI_LINUX_NOT_ANDROID || SI_SOLARIS #define SANITIZER_INTERCEPT_STRCASECMP SI_POSIX #define SANITIZER_INTERCEPT_MEMSET 1 -#define SANITIZER_INTERCEPT_MEMMOVE 1 -#define SANITIZER_INTERCEPT_MEMCPY 1 +#define SANITIZER_INTERCEPT_MEMMOVE SI_NOT_AIX +#define SANITIZER_INTERCEPT_MEMCPY SI_NOT_AIX #define SANITIZER_INTERCEPT_MEMCMP SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_BCMP \ SANITIZER_INTERCEPT_MEMCMP && \ @@ -233,9 +239,11 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_ISOC99_SCANF SI_GLIBC #ifndef SANITIZER_INTERCEPT_PRINTF -#define SANITIZER_INTERCEPT_PRINTF SI_POSIX -#define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD) -#define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC +# define SANITIZER_INTERCEPT_ASPRINTF SI_NOT_AIX +# define SANITIZER_INTERCEPT_VASPRINTF SI_NOT_AIX +# define SANITIZER_INTERCEPT_PRINTF SI_POSIX +# define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD) +# define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC #endif #define SANITIZER_INTERCEPT_SETPROCTITLE (SI_FREEBSD || SI_NETBSD) @@ -243,8 +251,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT___PRINTF_CHK \ (SANITIZER_INTERCEPT_PRINTF && SI_GLIBC) -#define SANITIZER_INTERCEPT_FREXP SI_NOT_FUCHSIA -#define SANITIZER_INTERCEPT_FREXPF SI_POSIX +// AIX libc does not export FREXP and FREXPF. 
+#define SANITIZER_INTERCEPT_FREXP (SI_NOT_FUCHSIA && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_FREXPF (SI_POSIX && SI_NOT_AIX) #define SANITIZER_INTERCEPT_FREXPL SI_POSIX #define SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS SI_POSIX @@ -294,7 +303,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_ACCEPT4 \ (SI_LINUX_NOT_ANDROID || SI_NETBSD || SI_FREEBSD) #define SANITIZER_INTERCEPT_PACCEPT SI_NETBSD -#define SANITIZER_INTERCEPT_MODF SI_POSIX +#define SANITIZER_INTERCEPT_MODF (SI_POSIX && SI_NOT_AIX) #define SANITIZER_INTERCEPT_RECVMSG SI_POSIX #define SANITIZER_INTERCEPT_SENDMSG SI_POSIX #define SANITIZER_INTERCEPT_RECVMMSG SI_LINUX @@ -329,8 +338,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT___WCSXFRM_L SI_LINUX #define SANITIZER_INTERCEPT_WCSNRTOMBS \ (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) -#define SANITIZER_INTERCEPT_WCRTOMB \ - (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) +#define SANITIZER_INTERCEPT_WCRTOMB \ + (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS || \ + !SI_NOT_AIX) #define SANITIZER_INTERCEPT_WCTOMB \ (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) #define SANITIZER_INTERCEPT_TCGETATTR SI_LINUX_NOT_ANDROID || SI_SOLARIS @@ -370,7 +380,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_GETMNTENT_R SI_LINUX_NOT_ANDROID #define SANITIZER_INTERCEPT_STATFS \ (SI_FREEBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) -#define SANITIZER_INTERCEPT_STATFS64 SI_GLIBC && SANITIZER_HAS_STATFS64 +#define SANITIZER_INTERCEPT_STATFS64 \ + ((SI_GLIBC || !SI_NOT_AIX) && SANITIZER_HAS_STATFS64) #define SANITIZER_INTERCEPT_STATVFS \ (SI_FREEBSD || SI_NETBSD || SI_LINUX_NOT_ANDROID) #define SANITIZER_INTERCEPT_STATVFS64 SI_GLIBC @@ -419,10 +430,10 @@ SANITIZER_WEAK_IMPORT void 
*aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_TTYNAME_R SI_POSIX #define SANITIZER_INTERCEPT_TEMPNAM SI_POSIX #define SANITIZER_INTERCEPT_SINCOS SI_LINUX || SI_SOLARIS -#define SANITIZER_INTERCEPT_REMQUO SI_POSIX -#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD) -#define SANITIZER_INTERCEPT_LGAMMA SI_POSIX -#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD) +#define SANITIZER_INTERCEPT_REMQUO (SI_POSIX && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_LGAMMA (SI_POSIX && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX) #define SANITIZER_INTERCEPT_LGAMMA_R (SI_FREEBSD || SI_LINUX || SI_SOLARIS) #define SANITIZER_INTERCEPT_LGAMMAL_R SI_LINUX_NOT_ANDROID || SI_SOLARIS #define SANITIZER_INTERCEPT_DRAND48_R SI_GLIBC @@ -505,11 +516,13 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE (SI_LINUX || SI_FREEBSD) #define SI_STAT_LINUX (SI_LINUX && __GLIBC_PREREQ(2, 33)) -#define SANITIZER_INTERCEPT_STAT \ - (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \ - SI_STAT_LINUX) -#define SANITIZER_INTERCEPT_STAT64 SI_STAT_LINUX && SANITIZER_HAS_STAT64 -#define SANITIZER_INTERCEPT_LSTAT (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX) +#define SANITIZER_INTERCEPT_STAT \ + (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \ + SI_STAT_LINUX || !SI_NOT_AIX) +#define SANITIZER_INTERCEPT_STAT64 \ + ((SI_STAT_LINUX || !SI_NOT_AIX) && SANITIZER_HAS_STAT64) +#define SANITIZER_INTERCEPT_LSTAT \ + (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX || !SI_NOT_AIX) #define SANITIZER_INTERCEPT___XSTAT \ ((!SANITIZER_INTERCEPT_STAT && SI_POSIX) || SI_STAT_LINUX) #define SANITIZER_INTERCEPT___XSTAT64 SI_GLIBC @@ -578,7 +591,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_PROTOENT_R SI_GLIBC 
#define SANITIZER_INTERCEPT_NETENT (SI_LINUX || SI_NETBSD || SI_FREEBSD) #define SANITIZER_INTERCEPT_SETVBUF \ - (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC) + (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC || !SI_NOT_AIX) #define SANITIZER_INTERCEPT_GETMNTINFO (SI_NETBSD || SI_FREEBSD || SI_MAC) #define SANITIZER_INTERCEPT_MI_VECTOR_HASH SI_NETBSD #define SANITIZER_INTERCEPT_GETVFSSTAT SI_NETBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h index 41e0613d6fc1..bda0f0468769 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h @@ -15,7 +15,7 @@ # define SANITIZER_REDEFINE_BUILTINS_H // The asm hack only works with GCC and Clang. -# if !defined(_WIN32) +# if !defined(_WIN32) && !defined(_AIX) asm(R"( .set memcpy, __sanitizer_internal_memcpy From 7a3bcf9f7179e6904d405de36360714da07c31ba Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Wed, 11 Jun 2025 21:50:35 +0800 Subject: [PATCH 0073/1322] [RISCV] Add missing predicate for PseudoTHVdotVMAQA family instructions --- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index 2fccbcaf2cf3..89441444a994 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -660,10 +660,12 @@ def : Pat<(i32 (sub GPR:$rd, (mul (sexti16 (i32 GPR:$rs1)), (TH_MULSH GPR:$rd, GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasVendorXTHeadMac, IsRV32] +let Predicates = [HasVendorXTHeadVdot] in { defm PseudoTHVdotVMAQA : VPseudoVMAQA_VV_VX; defm PseudoTHVdotVMAQAU : VPseudoVMAQA_VV_VX; defm PseudoTHVdotVMAQASU : VPseudoVMAQA_VV_VX; defm PseudoTHVdotVMAQAUS : VPseudoVMAQA_VX; +} let Predicates = [HasVendorXTHeadVdot] in { defm : VPatTernaryVMAQA_VV_VX<"int_riscv_th_vmaqa", 
"PseudoTHVdotVMAQA", From 7034014d08249a1e159a668a71e96a0b78636a39 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 11 Jun 2025 18:07:00 -0700 Subject: [PATCH 0074/1322] [InstCombine] Combine or-disjoint (and->mul), (and->mul) to and->mul (#136013) The canonical pattern for bitmasked mul is currently ``` %val = and %x, %bitMask // where %bitMask is some constant %cmp = icmp eq %val, 0 %sel = select %cmp, 0, %C // where %C is some constant = C' * %bitMask ``` In certain cases, where we are combining multiple of these bitmasked muls with common factors, we are able to optimize into and->mul (see https://github.com/llvm/llvm-project/pull/135274 ) This optimization lends itself to further optimizations. This PR addresses one of such optimizations. In cases where we have `or-disjoint ( mul(and (X, C1), D) , mul (and (X, C2), D))` we can combine into `mul( and (X, (C1 + C2)), D) ` provided C1 and C2 are disjoint. Generalized proof: https://alive2.llvm.org/ce/z/MQYMui --- .../InstCombine/InstCombineAndOrXor.cpp | 124 ++++++++++++------ .../test/Transforms/InstCombine/or-bitmask.ll | 116 ++++++++++++++-- 2 files changed, 187 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index c6c231f81c4a..dce695a03600 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3592,6 +3592,73 @@ static Value *foldOrOfInversions(BinaryOperator &I, return nullptr; } +// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools +// track these properties for preservation. Note that we can decompose +// equivalent select form of this expression (e.g. (!(X & Mask) ? 
0 : Mask * +// Factor)) +struct DecomposedBitMaskMul { + Value *X; + APInt Factor; + APInt Mask; + bool NUW; + bool NSW; +}; + +static std::optional matchBitmaskMul(Value *V) { + Instruction *Op = dyn_cast(V); + if (!Op) + return std::nullopt; + + // Decompose (A & N) * C) into BitMaskMul + Value *Original = nullptr; + const APInt *Mask = nullptr; + const APInt *MulConst = nullptr; + if (match(Op, m_Mul(m_And(m_Value(Original), m_APInt(Mask)), + m_APInt(MulConst)))) { + if (MulConst->isZero() || Mask->isZero()) + return std::nullopt; + + return std::optional( + {Original, *MulConst, *Mask, + cast(Op)->hasNoUnsignedWrap(), + cast(Op)->hasNoSignedWrap()}); + } + + Value *Cond = nullptr; + const APInt *EqZero = nullptr, *NeZero = nullptr; + + // Decompose ((A & N) ? 0 : N * C) into BitMaskMul + if (match(Op, m_Select(m_Value(Cond), m_APInt(EqZero), m_APInt(NeZero)))) { + auto ICmpDecompose = + decomposeBitTest(Cond, /*LookThruTrunc=*/true, + /*AllowNonZeroC=*/false, /*DecomposeBitMask=*/true); + if (!ICmpDecompose.has_value()) + return std::nullopt; + + assert(ICmpInst::isEquality(ICmpDecompose->Pred) && + ICmpDecompose->C.isZero()); + + if (ICmpDecompose->Pred == ICmpInst::ICMP_NE) + std::swap(EqZero, NeZero); + + if (!EqZero->isZero() || NeZero->isZero()) + return std::nullopt; + + if (!ICmpDecompose->Mask.isPowerOf2() || ICmpDecompose->Mask.isZero() || + NeZero->getBitWidth() != ICmpDecompose->Mask.getBitWidth()) + return std::nullopt; + + if (!NeZero->urem(ICmpDecompose->Mask).isZero()) + return std::nullopt; + + return std::optional( + {ICmpDecompose->X, NeZero->udiv(ICmpDecompose->Mask), + ICmpDecompose->Mask, /*NUW=*/false, /*NSW=*/false}); + } + + return std::nullopt; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. 
@@ -3674,49 +3741,26 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { /*NSW=*/true, /*NUW=*/true)) return R; - Value *Cond0 = nullptr, *Cond1 = nullptr; - const APInt *Op0Eq = nullptr, *Op0Ne = nullptr; - const APInt *Op1Eq = nullptr, *Op1Ne = nullptr; + // (A & N) * C + (A & M) * C -> (A & (N + M)) * C + // This also accepts the equivalent select form of (A & N) * C + // expressions i.e. (!(A & N) ? 0 : N * C) + auto Decomp1 = matchBitmaskMul(I.getOperand(1)); + if (Decomp1) { + auto Decomp0 = matchBitmaskMul(I.getOperand(0)); + if (Decomp0 && Decomp0->X == Decomp1->X && + (Decomp0->Mask & Decomp1->Mask).isZero() && + Decomp0->Factor == Decomp1->Factor) { - // (!(A & N) ? 0 : N * C) + (!(A & M) ? 0 : M * C) -> A & (N + M) * C - if (match(I.getOperand(0), - m_Select(m_Value(Cond0), m_APInt(Op0Eq), m_APInt(Op0Ne))) && - match(I.getOperand(1), - m_Select(m_Value(Cond1), m_APInt(Op1Eq), m_APInt(Op1Ne)))) { + Value *NewAnd = Builder.CreateAnd( + Decomp0->X, ConstantInt::get(Decomp0->X->getType(), + (Decomp0->Mask + Decomp1->Mask))); - auto LHSDecompose = - decomposeBitTest(Cond0, /*LookThruTrunc=*/true, - /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true); - auto RHSDecompose = - decomposeBitTest(Cond1, /*LookThruTrunc=*/true, - /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true); + auto *Combined = BinaryOperator::CreateMul( + NewAnd, ConstantInt::get(NewAnd->getType(), Decomp1->Factor)); - if (LHSDecompose && RHSDecompose && LHSDecompose->X == RHSDecompose->X && - RHSDecompose->Mask.isPowerOf2() && LHSDecompose->Mask.isPowerOf2() && - LHSDecompose->Mask != RHSDecompose->Mask && - LHSDecompose->Mask.getBitWidth() == Op0Ne->getBitWidth() && - RHSDecompose->Mask.getBitWidth() == Op1Ne->getBitWidth()) { - assert(Op0Ne->getBitWidth() == Op1Ne->getBitWidth()); - assert(ICmpInst::isEquality(LHSDecompose->Pred)); - if (LHSDecompose->Pred == ICmpInst::ICMP_NE) - std::swap(Op0Eq, Op0Ne); - if (RHSDecompose->Pred == ICmpInst::ICMP_NE) - std::swap(Op1Eq, Op1Ne); - - if 
(!Op0Ne->isZero() && !Op1Ne->isZero() && Op0Eq->isZero() && - Op1Eq->isZero() && Op0Ne->urem(LHSDecompose->Mask).isZero() && - Op1Ne->urem(RHSDecompose->Mask).isZero() && - Op0Ne->udiv(LHSDecompose->Mask) == - Op1Ne->udiv(RHSDecompose->Mask)) { - auto NewAnd = Builder.CreateAnd( - LHSDecompose->X, - ConstantInt::get(LHSDecompose->X->getType(), - (LHSDecompose->Mask + RHSDecompose->Mask))); - - return BinaryOperator::CreateMul( - NewAnd, ConstantInt::get(NewAnd->getType(), - Op0Ne->udiv(LHSDecompose->Mask))); - } + Combined->setHasNoUnsignedWrap(Decomp0->NUW && Decomp1->NUW); + Combined->setHasNoSignedWrap(Decomp0->NSW && Decomp1->NSW); + return Combined; } } } diff --git a/llvm/test/Transforms/InstCombine/or-bitmask.ll b/llvm/test/Transforms/InstCombine/or-bitmask.ll index 3b482dc1794d..3c992dfea569 100644 --- a/llvm/test/Transforms/InstCombine/or-bitmask.ll +++ b/llvm/test/Transforms/InstCombine/or-bitmask.ll @@ -36,13 +36,9 @@ define i32 @add_select_cmp_and2(i32 %in) { define i32 @add_select_cmp_and3(i32 %in) { ; CHECK-LABEL: @add_select_cmp_and3( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 -; CHECK-NEXT: [[TEMP:%.*]] = mul nuw nsw i32 [[TMP1]], 72 -; CHECK-NEXT: [[BITOP2:%.*]] = and i32 [[IN]], 4 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[BITOP2]], 0 -; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 0, i32 288 -; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[TEMP]], [[SEL2]] -; CHECK-NEXT: ret i32 [[OUT]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 7 +; CHECK-NEXT: [[TEMP1:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[TEMP1]] ; %bitop0 = and i32 %in, 1 %cmp0 = icmp eq i32 %bitop0, 0 @@ -60,12 +56,9 @@ define i32 @add_select_cmp_and3(i32 %in) { define i32 @add_select_cmp_and4(i32 %in) { ; CHECK-LABEL: @add_select_cmp_and4( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 -; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IN]], 12 -; CHECK-NEXT: [[TEMP3:%.*]] = mul nuw nsw i32 
[[TMP2]], 72 -; CHECK-NEXT: [[OUT1:%.*]] = or disjoint i32 [[OUT]], [[TEMP3]] -; CHECK-NEXT: ret i32 [[OUT1]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IN:%.*]], 15 +; CHECK-NEXT: [[TEMP2:%.*]] = mul nuw nsw i32 [[TMP2]], 72 +; CHECK-NEXT: ret i32 [[TEMP2]] ; %bitop0 = and i32 %in, 1 %cmp0 = icmp eq i32 %bitop0, 0 @@ -361,6 +354,103 @@ define i64 @mask_select_types_1(i64 %in) { ret i64 %out } +define i32 @add_select_cmp_mixed1(i32 %in) { +; CHECK-LABEL: @add_select_cmp_mixed1( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[OUT]] +; + %mask = and i32 %in, 1 + %sel0 = mul i32 %mask, 72 + %bitop1 = and i32 %in, 2 + %cmp1 = icmp eq i32 %bitop1, 0 + %sel1 = select i1 %cmp1, i32 0, i32 144 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_mixed2(i32 %in) { +; CHECK-LABEL: @add_select_cmp_mixed2( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[OUT]] +; + %bitop0 = and i32 %in, 1 + %cmp0 = icmp eq i32 %bitop0, 0 + %mask = and i32 %in, 2 + %sel0 = select i1 %cmp0, i32 0, i32 72 + %sel1 = mul i32 %mask, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_and_mul(i32 %in) { +; CHECK-LABEL: @add_select_cmp_and_mul( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[OUT]] +; + %mask0 = and i32 %in, 1 + %sel0 = mul i32 %mask0, 72 + %mask1 = and i32 %in, 2 + %sel1 = mul i32 %mask1, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_mixed2_mismatch(i32 %in) { +; CHECK-LABEL: @add_select_cmp_mixed2_mismatch( +; CHECK-NEXT: [[BITOP0:%.*]] = and i32 [[IN:%.*]], 1 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[BITOP0]], 0 +; CHECK-NEXT: [[MASK:%.*]] = and i32 [[IN]], 2 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[CMP0]], i32 0, 
i32 73 +; CHECK-NEXT: [[SEL1:%.*]] = mul nuw nsw i32 [[MASK]], 72 +; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]] +; CHECK-NEXT: ret i32 [[OUT]] +; + %bitop0 = and i32 %in, 1 + %cmp0 = icmp eq i32 %bitop0, 0 + %mask = and i32 %in, 2 + %sel0 = select i1 %cmp0, i32 0, i32 73 + %sel1 = mul i32 %mask, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_and_mul_mismatch(i32 %in) { +; CHECK-LABEL: @add_select_cmp_and_mul_mismatch( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[IN:%.*]] to i1 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[TMP1]], i32 73, i32 0 +; CHECK-NEXT: [[MASK1:%.*]] = and i32 [[IN]], 2 +; CHECK-NEXT: [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72 +; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]] +; CHECK-NEXT: ret i32 [[OUT]] +; + %mask0 = and i32 %in, 1 + %sel0 = mul i32 %mask0, 73 + %mask1 = and i32 %in, 2 + %sel1 = mul i32 %mask1, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @and_mul_non_disjoint(i32 %in) { +; CHECK-LABEL: @and_mul_non_disjoint( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 2 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: [[MASK1:%.*]] = and i32 [[IN]], 4 +; CHECK-NEXT: [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72 +; CHECK-NEXT: [[OUT1:%.*]] = or i32 [[OUT]], [[SEL1]] +; CHECK-NEXT: ret i32 [[OUT1]] +; + %mask0 = and i32 %in, 2 + %sel0 = mul i32 %mask0, 72 + %mask1 = and i32 %in, 4 + %sel1 = mul i32 %mask1, 72 + %out = or i32 %sel0, %sel1 + ret i32 %out +} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CONSTSPLAT: {{.*}} ; CONSTVEC: {{.*}} From c4316180418ce8de4b4c9812c7fac791d55b6102 Mon Sep 17 00:00:00 2001 From: Shunsuke Watanabe Date: Thu, 12 Jun 2025 10:19:26 +0900 Subject: [PATCH 0075/1322] [Clang][Driver] Override complex number calculation method by -fno-fast-math (#132680) This patch fixes a bug where -fno-fast-math doesn't revert the complex number calculation method to the default. The priority of overriding options related to complex number calculations differs slightly from GCC, as discussed in: https://discourse.llvm.org/t/the-priority-of-fno-fast-math-regarding-complex-number-calculations/84679 --- clang/lib/Driver/ToolChains/Clang.cpp | 22 +++++- clang/test/Driver/range.c | 100 +++++++++++++++++++++++--- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a74fa81f3cf5..1d11be1d82be 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2831,8 +2831,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, StringRef Float16ExcessPrecision = ""; StringRef BFloat16ExcessPrecision = ""; LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None; - std::string ComplexRangeStr = ""; - std::string GccRangeComplexOption = ""; + std::string ComplexRangeStr; + std::string GccRangeComplexOption; + std::string LastComplexRangeOption; auto setComplexRange = [&](LangOptions::ComplexRangeKind NewRange) { // Warn if user expects to perform full implementation of complex @@ -2916,6 +2917,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-limited-range"); } GccRangeComplexOption = "-fcx-limited-range"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Basic; break; case options::OPT_fno_cx_limited_range: @@ -2929,6 +2931,7 @@ 
static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, "-fno-cx-limited-range"); } GccRangeComplexOption = "-fno-cx-limited-range"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Full; break; case options::OPT_fcx_fortran_rules: @@ -2938,6 +2941,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, else EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-fortran-rules"); GccRangeComplexOption = "-fcx-fortran-rules"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Improved; break; case options::OPT_fno_cx_fortran_rules: @@ -2950,6 +2954,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, "-fno-cx-fortran-rules"); } GccRangeComplexOption = "-fno-cx-fortran-rules"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Full; break; case options::OPT_fcomplex_arithmetic_EQ: { @@ -2984,6 +2989,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ComplexArithmeticStr(RangeVal)); } } + LastComplexRangeOption = + Args.MakeArgString(A->getSpelling() + A->getValue()); Range = RangeVal; break; } @@ -3037,6 +3044,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, } else D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Val; + LastComplexRangeOption = A->getSpelling(); break; } @@ -3222,6 +3230,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, [[fallthrough]]; case options::OPT_ffast_math: applyFastMath(true); + LastComplexRangeOption = A->getSpelling(); if (A->getOption().getID() == options::OPT_Ofast) LastFpContractOverrideOption = "-Ofast"; else @@ -3239,6 +3248,15 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ApproxFunc = false; SignedZeros = true; restoreFPContractState(); + // If the last specified option related to complex range is not 
+ // -ffast-math or -ffp-model=, emit warning. + if (LastComplexRangeOption != "-ffast-math" && + LastComplexRangeOption != "-ffp-model=" && + Range != LangOptions::ComplexRangeKind::CX_Full) + EmitComplexRangeDiag(D, LastComplexRangeOption, "-fno-fast-math"); + Range = LangOptions::ComplexRangeKind::CX_None; + LastComplexRangeOption = ""; + GccRangeComplexOption = ""; LastFpContractOverrideOption = ""; break; } // End switch (A->getOption().getID()) diff --git a/clang/test/Driver/range.c b/clang/test/Driver/range.c index da5748d7c723..30140f3c208e 100644 --- a/clang/test/Driver/range.c +++ b/clang/test/Driver/range.c @@ -177,14 +177,83 @@ // RUN: %clang -### -target x86_64 -ffast-math -fcomplex-arithmetic=basic -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=BASIC %s -// BASIC: -complex-range=basic -// FULL: -complex-range=full -// PRMTD: -complex-range=promoted -// BASIC-NOT: -complex-range=improved -// CHECK-NOT: -complex-range=basic -// IMPRVD: -complex-range=improved -// IMPRVD-NOT: -complex-range=basic -// CHECK-NOT: -complex-range=improved +// RUN: %clang -### --target=x86_64 -fcx-limited-range -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN21 %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-cx-limited-range -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### --target=x86_64 -fcx-fortran-rules -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN22 %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-cx-fortran-rules -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffast-math -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=basic -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN23 %s + +// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=promoted -fno-fast-math \ 
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN24 %s + +// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=improved -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN25 %s + +// RUN: %clang -### -Werror --target=x86_64 -fcomplex-arithmetic=full -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=aggressive -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=fast -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=precise -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=strict -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-limited-range \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-limited-range \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-fortran-rules \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-fortran-rules \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=basic \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=promoted \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math 
-fcomplex-arithmetic=improved \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=full \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=aggressive \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=fast \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=precise \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=strict \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s // WARN1: warning: overriding '-fcx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option] // WARN2: warning: overriding '-fno-cx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option] @@ -196,5 +265,20 @@ // WARN14: overriding '-complex-range=promoted' option with '-fcx-limited-range' [-Woverriding-option] // WARN17: warning: overriding '-fcomplex-arithmetic=full' option with '-fcomplex-arithmetic=basic' [-Woverriding-option] // WARN20: warning: overriding '-fcx-fortran-rules' option with '-fcx-limited-range' [-Woverriding-option] +// WARN21: warning: overriding '-fcx-limited-range' option with '-fno-fast-math' [-Woverriding-option] +// WARN22: warning: overriding '-fcx-fortran-rules' option with '-fno-fast-math' [-Woverriding-option] +// WARN23: warning: overriding '-fcomplex-arithmetic=basic' option with '-fno-fast-math' [-Woverriding-option] +// WARN24: warning: overriding '-fcomplex-arithmetic=promoted' option with '-fno-fast-math' [-Woverriding-option] +// WARN25: warning: overriding '-fcomplex-arithmetic=improved' option with '-fno-fast-math' [-Woverriding-option] + +// BASIC: -complex-range=basic +// FULL: -complex-range=full +// PRMTD: 
-complex-range=promoted +// BASIC-NOT: -complex-range=improved +// CHECK-NOT: -complex-range=basic +// IMPRVD: -complex-range=improved +// IMPRVD-NOT: -complex-range=basic +// CHECK-NOT: -complex-range=improved +// RANGE-NOT: -complex-range= // ERR: error: unsupported argument 'foo' to option '-fcomplex-arithmetic=' From 52360d195b85608c677d781272534dfa61e9a1c3 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Thu, 12 Jun 2025 09:27:27 +0800 Subject: [PATCH 0076/1322] [NFC] Use `llvm::includes` instead of `std::includes` (#143542) This PR follows up #143297. --- clang-tools-extra/clangd/refactor/Rename.cpp | 2 +- llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 6 ++---- llvm/tools/sancov/sancov.cpp | 3 +-- llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp | 4 ++-- llvm/unittests/ADT/DeltaAlgorithmTest.cpp | 4 ++-- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 3 +-- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 7 ++----- 7 files changed, 11 insertions(+), 18 deletions(-) diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index d9b73b83e902..c56375b1a98d 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -1308,7 +1308,7 @@ getMappedRanges(ArrayRef Indexed, ArrayRef Lexed) { return std::nullopt; } // Fast check for the special subset case. 
- if (std::includes(Indexed.begin(), Indexed.end(), Lexed.begin(), Lexed.end())) + if (llvm::includes(Indexed, Lexed)) return Lexed.vec(); std::vector Best; diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index d94a2fbb23d2..61fef1387d82 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1975,12 +1975,10 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, auto V1Elems = ShadowElements.find(V1); auto V2Elems = ShadowElements.find(V2); if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) { - if (std::includes(V1Elems->second.begin(), V1Elems->second.end(), - V2Elems->second.begin(), V2Elems->second.end())) { + if (llvm::includes(V1Elems->second, V2Elems->second)) { return collapseToPrimitiveShadow(V1, Pos); } - if (std::includes(V2Elems->second.begin(), V2Elems->second.end(), - V1Elems->second.begin(), V1Elems->second.end())) { + if (llvm::includes(V2Elems->second, V1Elems->second)) { return collapseToPrimitiveShadow(V2, Pos); } } else if (V1Elems != ShadowElements.end()) { diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp index 2cc84b47de6b..aebb5effd0be 100644 --- a/llvm/tools/sancov/sancov.cpp +++ b/llvm/tools/sancov/sancov.cpp @@ -889,8 +889,7 @@ symbolize(const RawCoverage &Data, const std::string ObjectFile) { } std::set AllAddrs = findCoveragePointAddrs(ObjectFile); - if (!std::includes(AllAddrs.begin(), AllAddrs.end(), Data.Addrs->begin(), - Data.Addrs->end())) { + if (!llvm::includes(AllAddrs, *Data.Addrs)) { fail("Coverage points in binary and .sancov file do not match."); } Coverage->Points = getCoveragePoints(ObjectFile, AllAddrs, *Data.Addrs); diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index 66a67d96d153..f54394789939 100644 --- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp 
+++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DAGDeltaAlgorithm.h" +#include "llvm/ADT/STLExtras.h" #include "gtest/gtest.h" #include #include @@ -23,8 +24,7 @@ class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm { protected: bool ExecuteOneTest(const changeset_ty &Changes) override { ++NumTests; - return std::includes(Changes.begin(), Changes.end(), - FailingSet.begin(), FailingSet.end()); + return llvm::includes(Changes, FailingSet); } public: diff --git a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp index 5e284129180a..24e18f42eb33 100644 --- a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DeltaAlgorithm.h" +#include "llvm/ADT/STLExtras.h" #include "gtest/gtest.h" #include #include @@ -38,8 +39,7 @@ class FixedDeltaAlgorithm final : public DeltaAlgorithm { protected: bool ExecuteOneTest(const changeset_ty &Changes) override { ++NumTests; - return std::includes(Changes.begin(), Changes.end(), - FailingSet.begin(), FailingSet.end()); + return llvm::includes(Changes, FailingSet); } public: diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 9792eb41ea5d..32098e96ce72 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -1330,8 +1330,7 @@ void AsmMatcherInfo::buildRegisterClasses( for (const RegisterSet &RS : RegisterSets) { ClassInfo *CI = RegisterSetClasses[RS]; for (const RegisterSet &RS2 : RegisterSets) - if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(), - RS.end(), LessRecordByID())) + if (RS != RS2 && llvm::includes(RS2, RS, LessRecordByID())) CI->SuperClasses.push_back(RegisterSetClasses[RS2]); } diff --git 
a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 4d24eb3de1ed..f52c21e97f9c 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -933,9 +933,7 @@ bool CodeGenRegisterClass::Key::operator<( static bool testSubClass(const CodeGenRegisterClass *A, const CodeGenRegisterClass *B) { return A->RSI.isSubClassOf(B->RSI) && - std::includes(A->getMembers().begin(), A->getMembers().end(), - B->getMembers().begin(), B->getMembers().end(), - deref>()); + llvm::includes(A->getMembers(), B->getMembers(), deref>()); } /// Sorting predicate for register classes. This provides a topological @@ -1990,8 +1988,7 @@ findRegUnitSet(const std::vector &UniqueSets, // Return true if the RUSubSet is a subset of RUSuperSet. static bool isRegUnitSubSet(const std::vector &RUSubSet, const std::vector &RUSuperSet) { - return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(), - RUSubSet.end()); + return llvm::includes(RUSuperSet, RUSubSet); } /// Iteratively prune unit sets. Prune subsets that are close to the superset, From 082251bba4effea7f60191c6cbddacb3705c07db Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Jun 2025 21:49:01 -0400 Subject: [PATCH 0077/1322] [AArch64] fix trampoline implementation: use X15 (#126743) AAPCS64 reserves any of X9-X15 for a compiler to choose to use for this purpose, and says not to use X16 or X18 like GCC (and the previous implementation) chose to use. The X18 register may need to get used by the kernel in some circumstances, as specified by the platform ABI, so it is generally an unwise choice. Simply choosing a different register fixes the problem of this being broken on any platform that actually follows the platform ABI (which is all of them except EABI, if I am reading this linux kernel bug correctly https://lkml2.uits.iu.edu/hypermail/linux/kernel/2001.2/01502.html). 
As a side benefit, also generate slightly better code and avoids needing the compiler-rt to be present. I did that by following the XCore implementation instead of PPC (although in hindsight, following the RISCV might have been slightly more readable). That X18 is wrong to use for this purpose has been known for many years (e.g. https://www.mail-archive.com/gcc@gcc.gnu.org/msg76934.html) and also known that fixing this to use one of the correct registers is not an ABI break, since this only appears inside of a translation unit. Some of the other temporary registers (e.g. X9) are already reserved inside llvm for internal use as a generic temporary register in the prologue before saving registers, while X15 was already used in rare cases as a scratch register in the prologue as well, so I felt that seemed the most logical choice to choose here. --- compiler-rt/lib/builtins/README.txt | 5 - compiler-rt/lib/builtins/trampoline_setup.c | 42 --- .../builtins/Unit/trampoline_setup_test.c | 2 +- .../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 8 +- flang/test/Fir/boxproc.fir | 4 +- .../AArch64/AArch64CallingConvention.td | 25 +- .../Target/AArch64/AArch64FrameLowering.cpp | 85 ++++-- .../Target/AArch64/AArch64ISelLowering.cpp | 95 ++++--- llvm/lib/TargetParser/Triple.cpp | 2 - llvm/test/CodeGen/AArch64/nest-register.ll | 16 +- .../AArch64/statepoint-call-lowering.ll | 2 +- llvm/test/CodeGen/AArch64/trampoline.ll | 257 +++++++++++++++++- llvm/test/CodeGen/AArch64/win64cc-x18.ll | 27 +- .../CodeGen/AArch64/zero-call-used-regs.ll | 16 +- 14 files changed, 420 insertions(+), 166 deletions(-) diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt index 19f26c92a0f9..2d213d95f333 100644 --- a/compiler-rt/lib/builtins/README.txt +++ b/compiler-rt/lib/builtins/README.txt @@ -272,11 +272,6 @@ switch32 switch8 switchu8 -// This function generates a custom trampoline function with the specific -// realFunc and localsPtr values. 
-void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated, - const void* realFunc, void* localsPtr); - // There is no C interface to the *_vfp_d8_d15_regs functions. There are // called in the prolog and epilog of Thumb1 functions. When the C++ ABI use // SJLJ for exceptions, each function with a catch clause or destructors needs diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c index 830e25e4c030..844eb2794414 100644 --- a/compiler-rt/lib/builtins/trampoline_setup.c +++ b/compiler-rt/lib/builtins/trampoline_setup.c @@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, __clear_cache(trampOnStack, &trampOnStack[10]); } #endif // __powerpc__ && !defined(__powerpc64__) - -// The AArch64 compiler generates calls to __trampoline_setup() when creating -// trampoline functions on the stack for use with nested functions. -// This function creates a custom 36-byte trampoline function on the stack -// which loads x18 with a pointer to the outer function's locals -// and then jumps to the target nested function. -// Note: x18 is a reserved platform register on Windows and macOS. - -#if defined(__aarch64__) && defined(__ELF__) -COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, - int trampSizeAllocated, - const void *realFunc, void *localsPtr) { - // This should never happen, but if compiler did not allocate - // enough space on stack for the trampoline, abort. - if (trampSizeAllocated < 36) - compilerrt_abort(); - - // create trampoline - // Load realFunc into x17. mov/movk 16 bits at a time. 
- trampOnStack[0] = - 0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11; - trampOnStack[1] = - 0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11; - trampOnStack[2] = - 0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11; - trampOnStack[3] = - 0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11; - // Load localsPtr into x18 - trampOnStack[4] = - 0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12; - trampOnStack[5] = - 0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12; - trampOnStack[6] = - 0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12; - trampOnStack[7] = - 0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12; - trampOnStack[8] = 0xd61f0220; // br x17 - - // Clear instruction cache. - __clear_cache(trampOnStack, &trampOnStack[9]); -} -#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64) diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c index d51d35acaa02..da115fe76427 100644 --- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c +++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c @@ -7,7 +7,7 @@ /* * Tests nested functions - * The ppc and aarch64 compilers generates a call to __trampoline_setup + * The ppc compiler generates a call to __trampoline_setup * The i386 and x86_64 compilers generate a call to ___enable_execute_stack */ diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 82b11ad7db32..69bdb48146a5 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -274,12 +274,12 @@ public: auto loc = embox.getLoc(); mlir::Type i8Ty = builder.getI8Type(); mlir::Type i8Ptr = builder.getRefType(i8Ty); - // For AArch64, PPC32 and PPC64, the thunk is populated by a call to + // For PPC32 and PPC64, the thunk is 
populated by a call to // __trampoline_setup, which is defined in // compiler-rt/lib/builtins/trampoline_setup.c and requires the - // thunk size greater than 32 bytes. For RISCV and x86_64, the - // thunk setup doesn't go through __trampoline_setup and fits in 32 - // bytes. + // thunk size greater than 32 bytes. For AArch64, RISCV and x86_64, + // the thunk setup doesn't go through __trampoline_setup and fits in + // 32 bytes. fir::SequenceType::Extent thunkSize = triple.getTrampolineSize(); mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty); auto buffer = builder.create(loc, buffTy); diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir index 5d82522055ad..97d9b38ed6f4 100644 --- a/flang/test/Fir/boxproc.fir +++ b/flang/test/Fir/boxproc.fir @@ -3,7 +3,7 @@ // RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %} // CHECK-LABEL: define void @_QPtest_proc_dummy() -// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1 +// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1 // CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8 @@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) { } // CHECK-LABEL: define void @_QPtest_proc_dummy_char() -// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1 +// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1 // CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8 diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 920cc6727314..1b5a713bffdc 100644 --- 
a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -28,6 +28,12 @@ class CCIfSubtarget //===----------------------------------------------------------------------===// defvar AArch64_Common = [ + // The 'nest' parameter, if any, is passed in X15. + // The previous register used here (X18) is also defined to be unavailable + // for this purpose, while all of X9-X15 were defined to be free for LLVM to + // use for this, so use X15 (which LLVM often already clobbers anyways). + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, @@ -117,13 +123,7 @@ defvar AArch64_Common = [ ]; let Entry = 1 in -def CC_AArch64_AAPCS : CallingConv>], - AArch64_Common -)>; +def CC_AArch64_AAPCS : CallingConv; let Entry = 1 in def RetCC_AArch64_AAPCS : CallingConv<[ @@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[ // a stack layout compatible with the x64 calling convention. let Entry = 1 in def CC_AArch64_Arm64EC_VarArg : CallingConv<[ + CCIfNest>, + // Convert small floating-point values to integer. CCIfType<[f16, bf16], CCBitConvertToType>, CCIfType<[f32], CCBitConvertToType>, @@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[ // + Stack slots are sized as needed rather than being at least 64-bit. let Entry = 1 in def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[ let Entry = 1 in def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // same as the normal Darwin VarArgs handling. 
let Entry = 1 in def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfNest>, + CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ let Entry = 1 in def CC_AArch64_GHC : CallingConv<[ + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, // Handle all vector types as either f64 or v2f64. @@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[ // We can pass arguments in all general registers, except: // - X8, used for sret + // - X15 (on Windows), used as a temporary register in the prologue when allocating call frames // - X16/X17, used by the linker as IP0/IP1 // - X18, the platform register // - X19, the base pointer diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 3335ee04bb0e..2650c621e19f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -331,7 +331,9 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(MachineFunction &MF); static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); -static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); +static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, + bool HasCall = false); +static bool requiresSaveVG(const MachineFunction &MF); /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If possible, a frame helper call is injected. 
@@ -1006,6 +1008,16 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, } } +static bool windowsRequiresStackProbe(const MachineFunction &MF, + uint64_t StackSizeInBytes) { + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + const AArch64FunctionInfo &MFI = *MF.getInfo(); + // TODO: When implementing stack protectors, take that into account + // for the probe threshold. + return Subtarget.isTargetWindows() && MFI.hasStackProbing() && + StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); +} + static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) { const MachineFunction *MF = MBB.getParent(); @@ -1027,7 +1039,8 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs, // but we would then have to make sure that we were in fact saving at least one // callee-save register in the prologue, which is additional complexity that // doesn't seem worth the benefit. -static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { +static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, + bool HasCall) { MachineFunction *MF = MBB->getParent(); // If MBB is an entry block, use X9 as the scratch register @@ -1041,6 +1054,11 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); LivePhysRegs LiveRegs(TRI); getLiveRegsForEntryMBB(LiveRegs, *MBB); + if (HasCall) { + LiveRegs.addReg(AArch64::X16); + LiveRegs.addReg(AArch64::X17); + LiveRegs.addReg(AArch64::X18); + } // Prefer X9 since it was historically used for the prologue scratch reg. const MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -1081,23 +1099,18 @@ bool AArch64FrameLowering::canUseAsPrologue( MBB.isLiveIn(AArch64::NZCV)) return false; - // Don't need a scratch register if we're not going to re-align the stack or - // emit stack probes. 
- if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF)) - return true; - // Otherwise, we can use any block as long as it has a scratch register - // available. - return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; -} + if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF)) + if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister) + return false; -static bool windowsRequiresStackProbe(MachineFunction &MF, - uint64_t StackSizeInBytes) { - const AArch64Subtarget &Subtarget = MF.getSubtarget(); - const AArch64FunctionInfo &MFI = *MF.getInfo(); - // TODO: When implementing stack protectors, take that into account - // for the probe threshold. - return Subtarget.isTargetWindows() && MFI.hasStackProbing() && - StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); + // May need a scratch register (for return value) if require making a special + // call + if (requiresSaveVG(*MF) || + windowsRequiresStackProbe(*MF, std::numeric_limits::max())) + if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister) + return false; + + return true; } static bool needsWinCFI(const MachineFunction &MF) { @@ -1378,8 +1391,8 @@ bool requiresGetVGCall(MachineFunction &MF) { !MF.getSubtarget().hasSVE(); } -static bool requiresSaveVG(MachineFunction &MF) { - AArch64FunctionInfo *AFI = MF.getInfo(); +static bool requiresSaveVG(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); // For Darwin platforms we don't save VG for non-SVE functions, even if SME // is enabled with streaming mode changes. if (!AFI->hasStreamingModeChanges()) @@ -2049,6 +2062,29 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (AFI->getSVECalleeSavedStackSize()) report_fatal_error( "SVE callee saves not yet supported with stack probing"); + + // Find an available register to spill the value of X15 to, if X15 is being + // used already for nest. 
+ unsigned X15Scratch = AArch64::NoRegister; + const AArch64Subtarget &STI = MF.getSubtarget(); + if (llvm::any_of(MBB.liveins(), + [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { + return STI.getRegisterInfo()->isSuperOrSubRegisterEq( + AArch64::X15, LiveIn.PhysReg); + })) { + X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true); + assert(X15Scratch != AArch64::NoRegister && + (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17)); +#ifndef NDEBUG + LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it +#endif + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch) + .addReg(AArch64::XZR) + .addReg(AArch64::X15, RegState::Undef) + .addReg(AArch64::X15, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + } + uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4; if (NeedsWinCFI) { HasWinCFI = true; @@ -2171,6 +2207,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // we've set a frame pointer and already finished the SEH prologue. assert(!NeedsWinCFI); } + if (X15Scratch != AArch64::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15) + .addReg(AArch64::XZR) + .addReg(X15Scratch, RegState::Undef) + .addReg(X15Scratch, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + } } StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; @@ -3355,7 +3398,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( unsigned X0Scratch = AArch64::NoRegister; if (Reg1 == AArch64::VG) { // Find an available register to store value of VG to. 
- Reg1 = findScratchNonCalleeSaveRegister(&MBB); + Reg1 = findScratchNonCalleeSaveRegister(&MBB, true); assert(Reg1 != AArch64::NoRegister); SMEAttrs Attrs = AFI->getSMEFnAttrs(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 766599d567ef..ad5b90984188 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7126,59 +7126,80 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - // Note: x18 cannot be used for the Nest parameter on Windows and macOS. - if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) - report_fatal_error( - "ADJUST_TRAMPOLINE operation is only supported on Linux."); - return Op.getOperand(0); } SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - - // Note: x18 cannot be used for the Nest parameter on Windows and macOS. 
- if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) - report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux."); - SDValue Chain = Op.getOperand(0); - SDValue Trmp = Op.getOperand(1); // trampoline + SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value + + const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); + + // ldr NestReg, .+16 + // ldr x17, .+20 + // br x17 + // .word 0 + // .nest: .qword nest + // .fptr: .qword fptr + SDValue OutChains[5]; + + const Function *Func = + cast(cast(Op.getOperand(5))->getValue()); + CallingConv::ID CC = Func->getCallingConv(); + unsigned NestReg; + + switch (CC) { + default: + NestReg = 0x0f; // X15 + break; + case CallingConv::ARM64EC_Thunk_Native: + case CallingConv::ARM64EC_Thunk_X64: + // Must be kept in sync with AArch64CallingConv.td + NestReg = 0x04; // X4 + break; + } + + const char FptrReg = 0x11; // X17 + + SDValue Addr = Trmp; + SDLoc dl(Op); + OutChains[0] = DAG.getStore( + Chain, dl, DAG.getConstant(0x58000080u | NestReg, dl, MVT::i32), Addr, + MachinePointerInfo(TrmpAddr)); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(4, dl, MVT::i64)); + OutChains[1] = DAG.getStore( + Chain, dl, DAG.getConstant(0x580000b0u | FptrReg, dl, MVT::i32), Addr, + MachinePointerInfo(TrmpAddr, 4)); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(8, dl, MVT::i64)); + OutChains[2] = + DAG.getStore(Chain, dl, DAG.getConstant(0xd61f0220u, dl, MVT::i32), Addr, + MachinePointerInfo(TrmpAddr, 8)); - Entry.Ty = IntPtrTy; - Entry.Node = Trmp; - Args.push_back(Entry); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(16, dl, MVT::i64)); + OutChains[3]
= + DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 16)); - if (auto *FI = dyn_cast(Trmp.getNode())) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - Entry.Node = - DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64); - } else - Entry.Node = DAG.getConstant(36, dl, MVT::i64); + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(24, dl, MVT::i64)); + OutChains[4] = + DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24)); - Args.push_back(Entry); - Entry.Node = FPtr; - Args.push_back(Entry); - Entry.Node = Nest; - Args.push_back(Entry); + SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); - // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( - CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); + SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, dl, MVT::i64)); - std::pair CallResult = LowerCallTo(CLI); - return CallResult.second; + // Call clear cache on the trampoline instructions. 
+ return DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken, Trmp, + EndOfTrmp); } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index bd291e191821..5718ae385bac 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1754,8 +1754,6 @@ unsigned Triple::getTrampolineSize() const { if (isOSLinux()) return 48; break; - case Triple::aarch64: - return 36; } return 32; } diff --git a/llvm/test/CodeGen/AArch64/nest-register.ll b/llvm/test/CodeGen/AArch64/nest-register.ll index 1e1c1b044bab..2e94dfba1fa5 100644 --- a/llvm/test/CodeGen/AArch64/nest-register.ll +++ b/llvm/test/CodeGen/AArch64/nest-register.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; Tests that the 'nest' parameter attribute causes the relevant parameter to be @@ -5,18 +6,21 @@ define ptr @nest_receiver(ptr nest %arg) nounwind { ; CHECK-LABEL: nest_receiver: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov x0, x18 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, x15 +; CHECK-NEXT: ret ret ptr %arg } define ptr @nest_caller(ptr %arg) nounwind { ; CHECK-LABEL: nest_caller: -; CHECK: mov x18, x0 -; CHECK-NEXT: bl nest_receiver -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: mov x15, x0 +; CHECK-NEXT: bl nest_receiver +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret %result = call ptr @nest_receiver(ptr nest %arg) ret ptr %result diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll index 9619895c450c..32c3eaeb9c87 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll @@ -207,7 +207,7 @@ define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" { ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: ldr x8, [sp, #64] ; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: mov x18, xzr +; CHECK-NEXT: mov x15, xzr ; CHECK-NEXT: mov w0, #42 // =0x2a ; CHECK-NEXT: mov w1, #17 // =0x11 ; CHECK-NEXT: str x8, [sp, #16] diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll index 30ac2aa283b3..d9016b02a0f8 100644 --- a/llvm/test/CodeGen/AArch64/trampoline.ll +++ b/llvm/test/CodeGen/AArch64/trampoline.ll @@ -1,32 +1,265 @@ -; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-LINUX +; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s --check-prefixes=CHECK-LINUX +; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-PC +; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck %s --check-prefixes=CHECK-APPLE @trampg = internal global [36 x i8] zeroinitializer, align 8 declare void @llvm.init.trampoline(ptr, ptr, ptr); declare ptr @llvm.adjust.trampoline(ptr); -define i64 @f(ptr nest %c, i64 %x, i64 %y) { - %sum = add i64 %x, %y - ret i64 %sum +define ptr @f(ptr nest %x, i64 %y) { +; CHECK-LINUX-LABEL: f: +; CHECK-LINUX: // %bb.0: +; CHECK-LINUX-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-LINUX-NEXT: sub sp, sp, #237, lsl #12 // =970752 +; CHECK-LINUX-NEXT: sub sp, sp, #3264 +; CHECK-LINUX-NEXT: .cfi_def_cfa_offset 974032 +; CHECK-LINUX-NEXT: .cfi_offset w29, -16 +; CHECK-LINUX-NEXT: add x0, x15, x0 +; CHECK-LINUX-NEXT: add sp, sp, #237, lsl #12 // =970752 +; CHECK-LINUX-NEXT: add sp, sp, #3264 +; CHECK-LINUX-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LINUX-NEXT: ret +; +; CHECK-PC-LABEL: f: +; CHECK-PC: .seh_proc f +; CHECK-PC-NEXT: // %bb.0: +; CHECK-PC-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-PC-NEXT: .seh_save_fplr_x 16 +; CHECK-PC-NEXT: mov x9, x15 +; CHECK-PC-NEXT: mov x15, #60876 // =0xedcc +; CHECK-PC-NEXT: .seh_nop +; CHECK-PC-NEXT: bl __chkstk +; CHECK-PC-NEXT: .seh_nop +; CHECK-PC-NEXT: sub sp, sp, x15, lsl #4 +; CHECK-PC-NEXT: .seh_stackalloc 974016 +; CHECK-PC-NEXT: mov x15, x9 +; CHECK-PC-NEXT: .seh_endprologue +; CHECK-PC-NEXT: add x0, x15, x0 +; CHECK-PC-NEXT: .seh_startepilogue +; CHECK-PC-NEXT: add sp, sp, #237, lsl #12 // =970752 +; CHECK-PC-NEXT: .seh_stackalloc 970752 +; CHECK-PC-NEXT: add sp, sp, #3264 +; CHECK-PC-NEXT: .seh_stackalloc 3264 +; CHECK-PC-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-PC-NEXT: .seh_save_fplr_x 16 +; CHECK-PC-NEXT: .seh_endepilogue +; CHECK-PC-NEXT: ret +; CHECK-PC-NEXT: .seh_endfunclet +; CHECK-PC-NEXT: .seh_endproc +; +; CHECK-APPLE-LABEL: f: +; CHECK-APPLE: ; %bb.0: +; CHECK-APPLE-NEXT: stp x28, x27, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-APPLE-NEXT: sub sp, sp, #237, lsl #12 ; =970752 +; CHECK-APPLE-NEXT: sub sp, sp, #3264 +; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 974032 +; CHECK-APPLE-NEXT: .cfi_offset w27, -8 +; CHECK-APPLE-NEXT: .cfi_offset w28, -16 +; CHECK-APPLE-NEXT: add x0, x15, x0 +; CHECK-APPLE-NEXT: add sp, sp, #237, lsl #12 ; =970752 +; CHECK-APPLE-NEXT: add sp, sp, #3264 +; CHECK-APPLE-NEXT: ldp x28, x27, [sp], #16 ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: ret + %chkstack = alloca [u0xedcba x i8] + %sum = getelementptr i8, ptr %x, i64 %y + ret ptr %sum } define i64 @func1() { +; CHECK-LINUX-LABEL: func1: +; CHECK-LINUX: // %bb.0: +; CHECK-LINUX-NEXT: sub sp, sp, #64 +; CHECK-LINUX-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-LINUX-NEXT: .cfi_def_cfa_offset 64 +; CHECK-LINUX-NEXT: .cfi_offset w30, -16 +; CHECK-LINUX-NEXT: adrp x8, :got:f +; CHECK-LINUX-NEXT: mov w9, #544 // =0x220 +; CHECK-LINUX-NEXT: add x0, sp, #8 +; CHECK-LINUX-NEXT: ldr x8, [x8, :got_lo12:f] +; CHECK-LINUX-NEXT: movk w9, #54815, lsl #16 +; CHECK-LINUX-NEXT: str w9, [sp, #16] +; CHECK-LINUX-NEXT: add x9, sp, #56 +; CHECK-LINUX-NEXT: stp x9, x8, [sp, #24] +; CHECK-LINUX-NEXT: mov x8, #132 // =0x84 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #16 +; CHECK-LINUX-NEXT: movk x8, #177, lsl #32 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #48 +; CHECK-LINUX-NEXT: str x8, [sp, #8] +; CHECK-LINUX-NEXT: add x8, sp, #8 +; CHECK-LINUX-NEXT: add x1, x8, #12 +; CHECK-LINUX-NEXT: bl __clear_cache +; CHECK-LINUX-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-LINUX-NEXT: mov x0, xzr +; CHECK-LINUX-NEXT: add sp, sp, #64 +; CHECK-LINUX-NEXT: ret +; +; CHECK-PC-LABEL: func1: +; CHECK-PC: .seh_proc func1 +; CHECK-PC-NEXT: // %bb.0: +; CHECK-PC-NEXT: sub sp, sp, #64 +; CHECK-PC-NEXT: .seh_stackalloc 64 +; CHECK-PC-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-PC-NEXT: .seh_save_reg x30, 48 +; CHECK-PC-NEXT: .seh_endprologue +; CHECK-PC-NEXT: adrp x8, f +; CHECK-PC-NEXT: add x8, 
x8, :lo12:f +; CHECK-PC-NEXT: add x9, sp, #56 +; CHECK-PC-NEXT: stp x9, x8, [sp, #24] +; CHECK-PC-NEXT: mov w8, #544 // =0x220 +; CHECK-PC-NEXT: add x0, sp, #8 +; CHECK-PC-NEXT: movk w8, #54815, lsl #16 +; CHECK-PC-NEXT: str w8, [sp, #16] +; CHECK-PC-NEXT: mov x8, #132 // =0x84 +; CHECK-PC-NEXT: movk x8, #22528, lsl #16 +; CHECK-PC-NEXT: movk x8, #177, lsl #32 +; CHECK-PC-NEXT: movk x8, #22528, lsl #48 +; CHECK-PC-NEXT: str x8, [sp, #8] +; CHECK-PC-NEXT: add x8, sp, #8 +; CHECK-PC-NEXT: add x1, x8, #12 +; CHECK-PC-NEXT: bl __clear_cache +; CHECK-PC-NEXT: mov x0, xzr +; CHECK-PC-NEXT: .seh_startepilogue +; CHECK-PC-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-PC-NEXT: .seh_save_reg x30, 48 +; CHECK-PC-NEXT: add sp, sp, #64 +; CHECK-PC-NEXT: .seh_stackalloc 64 +; CHECK-PC-NEXT: .seh_endepilogue +; CHECK-PC-NEXT: ret +; CHECK-PC-NEXT: .seh_endfunclet +; CHECK-PC-NEXT: .seh_endproc +; +; CHECK-APPLE-LABEL: func1: +; CHECK-APPLE: ; %bb.0: +; CHECK-APPLE-NEXT: sub sp, sp, #64 +; CHECK-APPLE-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 64 +; CHECK-APPLE-NEXT: .cfi_offset w30, -8 +; CHECK-APPLE-NEXT: .cfi_offset w29, -16 +; CHECK-APPLE-NEXT: Lloh0: +; CHECK-APPLE-NEXT: adrp x8, _f@PAGE +; CHECK-APPLE-NEXT: Lloh1: +; CHECK-APPLE-NEXT: add x8, x8, _f@PAGEOFF +; CHECK-APPLE-NEXT: add x9, sp, #40 +; CHECK-APPLE-NEXT: stp x9, x8, [sp, #16] +; CHECK-APPLE-NEXT: mov w8, #544 ; =0x220 +; CHECK-APPLE-NEXT: mov x0, sp +; CHECK-APPLE-NEXT: movk w8, #54815, lsl #16 +; CHECK-APPLE-NEXT: str w8, [sp, #8] +; CHECK-APPLE-NEXT: mov x8, #132 ; =0x84 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #16 +; CHECK-APPLE-NEXT: movk x8, #177, lsl #32 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #48 +; CHECK-APPLE-NEXT: str x8, [sp] +; CHECK-APPLE-NEXT: mov x8, sp +; CHECK-APPLE-NEXT: add x1, x8, #12 +; CHECK-APPLE-NEXT: bl ___clear_cache +; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: mov x0, xzr +; 
CHECK-APPLE-NEXT: add sp, sp, #64 +; CHECK-APPLE-NEXT: ret +; CHECK-APPLE-NEXT: .loh AdrpAdd Lloh0, Lloh1 %val = alloca i64 - %nval = bitcast ptr %val to ptr %tramp = alloca [36 x i8], align 8 - ; CHECK: mov w1, #36 - ; CHECK: bl __trampoline_setup - call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval) + call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %val) %fp = call ptr @llvm.adjust.trampoline(ptr %tramp) ret i64 0 } define i64 @func2() { +; CHECK-LINUX-LABEL: func2: +; CHECK-LINUX: // %bb.0: +; CHECK-LINUX-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LINUX-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LINUX-NEXT: .cfi_offset w30, -16 +; CHECK-LINUX-NEXT: adrp x8, :got:f +; CHECK-LINUX-NEXT: mov w9, #544 // =0x220 +; CHECK-LINUX-NEXT: adrp x0, trampg +; CHECK-LINUX-NEXT: add x0, x0, :lo12:trampg +; CHECK-LINUX-NEXT: ldr x8, [x8, :got_lo12:f] +; CHECK-LINUX-NEXT: movk w9, #54815, lsl #16 +; CHECK-LINUX-NEXT: str w9, [x0, #8] +; CHECK-LINUX-NEXT: add x9, sp, #8 +; CHECK-LINUX-NEXT: add x1, x0, #12 +; CHECK-LINUX-NEXT: stp x9, x8, [x0, #16] +; CHECK-LINUX-NEXT: mov x8, #132 // =0x84 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #16 +; CHECK-LINUX-NEXT: movk x8, #177, lsl #32 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #48 +; CHECK-LINUX-NEXT: str x8, [x0] +; CHECK-LINUX-NEXT: bl __clear_cache +; CHECK-LINUX-NEXT: mov x0, xzr +; CHECK-LINUX-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-LINUX-NEXT: ret +; +; CHECK-PC-LABEL: func2: +; CHECK-PC: .seh_proc func2 +; CHECK-PC-NEXT: // %bb.0: +; CHECK-PC-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-PC-NEXT: .seh_save_reg_x x30, 16 +; CHECK-PC-NEXT: .seh_endprologue +; CHECK-PC-NEXT: adrp x0, trampg +; CHECK-PC-NEXT: add x0, x0, :lo12:trampg +; CHECK-PC-NEXT: adrp x8, f +; CHECK-PC-NEXT: add x8, x8, :lo12:f +; CHECK-PC-NEXT: add x9, sp, #8 +; CHECK-PC-NEXT: add x1, x0, #12 +; CHECK-PC-NEXT: stp x9, x8, [x0, #16] +; CHECK-PC-NEXT: mov w8, #544 // =0x220 +; CHECK-PC-NEXT: movk w8, #54815, lsl #16 +; CHECK-PC-NEXT: str w8, [x0, #8] +; CHECK-PC-NEXT: mov x8, #132 // =0x84 +; CHECK-PC-NEXT: movk x8, #22528, lsl #16 +; CHECK-PC-NEXT: movk x8, #177, lsl #32 +; CHECK-PC-NEXT: movk x8, #22528, lsl #48 +; CHECK-PC-NEXT: str x8, [x0] +; CHECK-PC-NEXT: bl __clear_cache +; CHECK-PC-NEXT: mov x0, xzr +; CHECK-PC-NEXT: .seh_startepilogue +; CHECK-PC-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-PC-NEXT: .seh_save_reg_x x30, 16 +; CHECK-PC-NEXT: .seh_endepilogue +; CHECK-PC-NEXT: ret +; CHECK-PC-NEXT: .seh_endfunclet +; CHECK-PC-NEXT: .seh_endproc +; +; CHECK-APPLE-LABEL: func2: +; CHECK-APPLE: ; %bb.0: +; CHECK-APPLE-NEXT: sub sp, sp, #32 +; CHECK-APPLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 32 +; CHECK-APPLE-NEXT: .cfi_offset w30, -8 +; CHECK-APPLE-NEXT: .cfi_offset w29, -16 +; CHECK-APPLE-NEXT: Lloh2: +; CHECK-APPLE-NEXT: adrp x0, _trampg@PAGE +; CHECK-APPLE-NEXT: Lloh3: +; CHECK-APPLE-NEXT: add x0, x0, _trampg@PAGEOFF +; CHECK-APPLE-NEXT: Lloh4: +; CHECK-APPLE-NEXT: adrp x8, _f@PAGE +; CHECK-APPLE-NEXT: Lloh5: +; CHECK-APPLE-NEXT: add x8, x8, _f@PAGEOFF +; CHECK-APPLE-NEXT: add x9, sp, #8 +; CHECK-APPLE-NEXT: add x1, x0, #12 +; CHECK-APPLE-NEXT: stp x9, x8, [x0, #16] +; CHECK-APPLE-NEXT: mov w8, #544 ; =0x220 +; CHECK-APPLE-NEXT: movk w8, #54815, lsl #16 +; CHECK-APPLE-NEXT: str w8, [x0, #8] +; CHECK-APPLE-NEXT: mov x8, #132 ; =0x84 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #16 +; CHECK-APPLE-NEXT: movk x8, #177, lsl #32 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #48 +; 
CHECK-APPLE-NEXT: str x8, [x0] +; CHECK-APPLE-NEXT: bl ___clear_cache +; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: mov x0, xzr +; CHECK-APPLE-NEXT: add sp, sp, #32 +; CHECK-APPLE-NEXT: ret +; CHECK-APPLE-NEXT: .loh AdrpAdd Lloh4, Lloh5 +; CHECK-APPLE-NEXT: .loh AdrpAdd Lloh2, Lloh3 %val = alloca i64 - %nval = bitcast ptr %val to ptr - ; CHECK: mov w1, #36 - ; CHECK: bl __trampoline_setup - call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %nval) + call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %val) %fp = call ptr @llvm.adjust.trampoline(ptr @trampg) ret i64 0 } diff --git a/llvm/test/CodeGen/AArch64/win64cc-x18.ll b/llvm/test/CodeGen/AArch64/win64cc-x18.ll index b3e78cc9bbb8..4b45c300e9c1 100644 --- a/llvm/test/CodeGen/AArch64/win64cc-x18.ll +++ b/llvm/test/CodeGen/AArch64/win64cc-x18.ll @@ -1,35 +1,26 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;; Testing that nest uses x15 on all calling conventions (except Arm64EC) -;; Testing that x18 is not clobbered when passing pointers with the nest -;; attribute on windows - -; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=CHECK,CHECK-NO-X18 -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-X18 +; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-darwin- | FileCheck %s define dso_local i64 @other(ptr nest %p) #0 { ; CHECK-LABEL: other: -; CHECK-X18: ldr x0, [x18] -; CHECK-NO-X18: ldr x0, [x0] +; CHECK: ldr x0, [x15] +; CHECK: ret %r = load i64, ptr %p -; CHECK: ret ret i64 %r } define dso_local void @func() #0 { ; CHECK-LABEL: func: - - +; CHECK: add x15, sp, #8 +; CHECK: bl {{_?other}} +; CHECK: ret entry: %p = alloca i64 -; CHECK: mov w8, #1 -; CHECK: stp x30, x8, [sp, #-16] -; CHECK-X18: add x18, sp, #8 store i64 1, ptr %p -; CHECK-NO-X18: 
add x0, sp, #8 -; CHECK: bl other call void @other(ptr nest %p) -; CHECK: ldr x30, [sp], #16 -; CHECK: ret ret void } diff --git a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll index 4799ea3bcd19..986666e015e9 100644 --- a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll +++ b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll @@ -93,7 +93,7 @@ define dso_local i32 @all_gpr_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c ; CHECK-NEXT: mov x5, #0 // =0x0 ; CHECK-NEXT: mov x6, #0 // =0x0 ; CHECK-NEXT: mov x7, #0 // =0x0 -; CHECK-NEXT: mov x18, #0 // =0x0 +; CHECK-NEXT: mov x15, #0 // =0x0 ; CHECK-NEXT: orr w0, w8, w2 ; CHECK-NEXT: mov x2, #0 // =0x0 ; CHECK-NEXT: mov x8, #0 // =0x0 @@ -146,7 +146,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo ; DEFAULT-NEXT: mov x5, #0 // =0x0 ; DEFAULT-NEXT: mov x6, #0 // =0x0 ; DEFAULT-NEXT: mov x7, #0 // =0x0 -; DEFAULT-NEXT: mov x18, #0 // =0x0 +; DEFAULT-NEXT: mov x15, #0 // =0x0 ; DEFAULT-NEXT: movi v0.2d, #0000000000000000 ; DEFAULT-NEXT: orr w0, w8, w2 ; DEFAULT-NEXT: mov x2, #0 // =0x0 @@ -169,7 +169,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo ; SVE-OR-SME-NEXT: mov x5, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x6, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x7, #0 // =0x0 -; SVE-OR-SME-NEXT: mov x18, #0 // =0x0 +; SVE-OR-SME-NEXT: mov x15, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z0.d, #0 // =0x0 ; SVE-OR-SME-NEXT: orr w0, w8, w2 ; SVE-OR-SME-NEXT: mov x2, #0 // =0x0 @@ -196,7 +196,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo ; STREAMING-COMPAT-NEXT: mov x5, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x6, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x7, #0 // =0x0 -; STREAMING-COMPAT-NEXT: mov x18, #0 // =0x0 +; STREAMING-COMPAT-NEXT: mov x15, #0 // =0x0 ; STREAMING-COMPAT-NEXT: fmov d0, xzr ; STREAMING-COMPAT-NEXT: orr w0, w8, w2 ; STREAMING-COMPAT-NEXT: mov x2, #0 // =0x0 @@ -492,7 
+492,7 @@ define dso_local double @all_gpr_arg_float(double noundef %a, float noundef %b) ; CHECK-NEXT: mov x6, #0 // =0x0 ; CHECK-NEXT: mov x7, #0 // =0x0 ; CHECK-NEXT: mov x8, #0 // =0x0 -; CHECK-NEXT: mov x18, #0 // =0x0 +; CHECK-NEXT: mov x15, #0 // =0x0 ; CHECK-NEXT: ret entry: @@ -547,7 +547,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca ; DEFAULT-NEXT: mov x6, #0 // =0x0 ; DEFAULT-NEXT: mov x7, #0 // =0x0 ; DEFAULT-NEXT: mov x8, #0 // =0x0 -; DEFAULT-NEXT: mov x18, #0 // =0x0 +; DEFAULT-NEXT: mov x15, #0 // =0x0 ; DEFAULT-NEXT: movi v1.2d, #0000000000000000 ; DEFAULT-NEXT: movi v2.2d, #0000000000000000 ; DEFAULT-NEXT: movi v3.2d, #0000000000000000 @@ -570,7 +570,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca ; SVE-OR-SME-NEXT: mov x6, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x7, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x8, #0 // =0x0 -; SVE-OR-SME-NEXT: mov x18, #0 // =0x0 +; SVE-OR-SME-NEXT: mov x15, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z1.d, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z2.d, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z3.d, #0 // =0x0 @@ -597,7 +597,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca ; STREAMING-COMPAT-NEXT: mov x6, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x7, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x8, #0 // =0x0 -; STREAMING-COMPAT-NEXT: mov x18, #0 // =0x0 +; STREAMING-COMPAT-NEXT: mov x15, #0 // =0x0 ; STREAMING-COMPAT-NEXT: fmov d1, xzr ; STREAMING-COMPAT-NEXT: fmov d2, xzr ; STREAMING-COMPAT-NEXT: fmov d3, xzr From bb3b8306dc226c4dc4dfde36444b43476eea66ee Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 10:48:32 +0800 Subject: [PATCH 0078/1322] [NFC] [C++20] [Modules] Add a test module local declaration lookup From https://github.com/llvm/llvm-project/issues/143734, but it looks good on trunk. Add it as tests are always good. 
--- .../Modules/module-local-declarations.cppm | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 clang/test/Modules/module-local-declarations.cppm diff --git a/clang/test/Modules/module-local-declarations.cppm b/clang/test/Modules/module-local-declarations.cppm new file mode 100644 index 000000000000..4fbcf09e4d79 --- /dev/null +++ b/clang/test/Modules/module-local-declarations.cppm @@ -0,0 +1,30 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/Base.cppm -emit-module-interface -o %t/Base.pcm +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm -fprebuilt-module-path=%t +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fsyntax-only -verify -fprebuilt-module-path=%t + +//--- Base.cppm +export module Base; +export template +class Base {}; + +//--- A.cppm +export module A; +import Base; +struct S {}; + +export Base a; + +//--- B.cppm +// expected-no-diagnostics +export module B; + +import A; +import Base; + +struct S {}; + +export Base b; From de51b2dd3c6fc995e7db56fc50b4c8dceddc0aab Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 11 Jun 2025 19:51:05 -0700 Subject: [PATCH 0079/1322] [lldb] Move Transport class into lldb_private (NFC) (#143806) Move lldb-dap's Transport class into lldb_private so the code can be shared between the "JSON with header" protocol used by DAP and the JSON RPC protocol used by MCP (see [1]). 
[1]: https://discourse.llvm.org/t/rfc-adding-mcp-support-to-lldb/86798 --- lldb/include/lldb/Host/JSONTransport.h | 126 +++++++++++++++++++ lldb/source/Host/CMakeLists.txt | 3 +- lldb/source/Host/common/JSONTransport.cpp | 147 ++++++++++++++++++++++ lldb/tools/lldb-dap/DAP.cpp | 7 +- lldb/tools/lldb-dap/Transport.cpp | 145 +-------------------- lldb/tools/lldb-dap/Transport.h | 65 ++-------- lldb/unittests/DAP/DAPTest.cpp | 7 +- lldb/unittests/DAP/TestBase.cpp | 3 +- lldb/unittests/DAP/TransportTest.cpp | 16 ++- 9 files changed, 308 insertions(+), 211 deletions(-) create mode 100644 lldb/include/lldb/Host/JSONTransport.h create mode 100644 lldb/source/Host/common/JSONTransport.cpp diff --git a/lldb/include/lldb/Host/JSONTransport.h b/lldb/include/lldb/Host/JSONTransport.h new file mode 100644 index 000000000000..4db5e417ea85 --- /dev/null +++ b/lldb/include/lldb/Host/JSONTransport.h @@ -0,0 +1,126 @@ +//===-- JSONTransport.h ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Transport layer for encoding and decoding JSON protocol messages. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_HOST_JSONTRANSPORT_H +#define LLDB_HOST_JSONTRANSPORT_H + +#include "lldb/lldb-forward.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/JSON.h" +#include +#include + +namespace lldb_private { + +class TransportEOFError : public llvm::ErrorInfo { +public: + static char ID; + + TransportEOFError() = default; + + void log(llvm::raw_ostream &OS) const override { + OS << "transport end of file reached"; + } + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +class TransportTimeoutError : public llvm::ErrorInfo { +public: + static char ID; + + TransportTimeoutError() = default; + + void log(llvm::raw_ostream &OS) const override { + OS << "transport operation timed out"; + } + std::error_code convertToErrorCode() const override { + return std::make_error_code(std::errc::timed_out); + } +}; + +class TransportClosedError : public llvm::ErrorInfo { +public: + static char ID; + + TransportClosedError() = default; + + void log(llvm::raw_ostream &OS) const override { + OS << "transport is closed"; + } + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +/// A transport class that uses JSON for communication. +class JSONTransport { +public: + JSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output); + virtual ~JSONTransport() = default; + + /// Transport is not copyable. + /// @{ + JSONTransport(const JSONTransport &rhs) = delete; + void operator=(const JSONTransport &rhs) = delete; + /// @} + + /// Writes a message to the output stream. + template llvm::Error Write(const T &t) { + const std::string message = llvm::formatv("{0}", toJSON(t)).str(); + return WriteImpl(message); + } + + /// Reads the next message from the input stream. 
+ template + llvm::Expected Read(const std::chrono::microseconds &timeout) { + llvm::Expected message = ReadImpl(timeout); + if (!message) + return message.takeError(); + return llvm::json::parse(/*JSON=*/*message); + } + +protected: + virtual void Log(llvm::StringRef message); + + virtual llvm::Error WriteImpl(const std::string &message) = 0; + virtual llvm::Expected + ReadImpl(const std::chrono::microseconds &timeout) = 0; + + lldb::IOObjectSP m_input; + lldb::IOObjectSP m_output; +}; + +/// A transport class for JSON with a HTTP header. +class HTTPDelimitedJSONTransport : public JSONTransport { +public: + HTTPDelimitedJSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output) + : JSONTransport(input, output) {} + virtual ~HTTPDelimitedJSONTransport() = default; + +protected: + virtual llvm::Error WriteImpl(const std::string &message) override; + virtual llvm::Expected + ReadImpl(const std::chrono::microseconds &timeout) override; + + // FIXME: Support any header. + static constexpr llvm::StringLiteral kHeaderContentLength = + "Content-Length: "; + static constexpr llvm::StringLiteral kHeaderSeparator = "\r\n\r\n"; +}; + +} // namespace lldb_private + +#endif diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt index 5b713133afea..b15d72e61b6e 100644 --- a/lldb/source/Host/CMakeLists.txt +++ b/lldb/source/Host/CMakeLists.txt @@ -27,8 +27,9 @@ add_host_subdirectory(common common/HostNativeThreadBase.cpp common/HostProcess.cpp common/HostThread.cpp - common/LockFileBase.cpp + common/JSONTransport.cpp common/LZMA.cpp + common/LockFileBase.cpp common/MainLoopBase.cpp common/MemoryMonitor.cpp common/MonitoringProcessLauncher.cpp diff --git a/lldb/source/Host/common/JSONTransport.cpp b/lldb/source/Host/common/JSONTransport.cpp new file mode 100644 index 000000000000..103c76d25daf --- /dev/null +++ b/lldb/source/Host/common/JSONTransport.cpp @@ -0,0 +1,147 @@ +//===-- JSONTransport.cpp -------------------------------------------------===// 
+// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/JSONTransport.h" +#include "lldb/Utility/IOObject.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/SelectHelper.h" +#include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace llvm; +using namespace lldb; +using namespace lldb_private; + +/// ReadFull attempts to read the specified number of bytes. If EOF is +/// encountered (the read yields zero bytes), a transport EOF error is +/// returned rather than an empty string. +static Expected +ReadFull(IOObject &descriptor, size_t length, + std::optional timeout = std::nullopt) { + if (!descriptor.IsValid()) + return llvm::make_error(); + + bool timeout_supported = true; + // FIXME: SelectHelper does not work with NativeFile on Win32. +#if _WIN32 + timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket; +#endif + + if (timeout && timeout_supported) { + SelectHelper sh; + sh.SetTimeout(*timeout); + sh.FDSetRead(descriptor.GetWaitableHandle()); + Status status = sh.Select(); + if (status.Fail()) { + // Convert timeouts into a specific error. + if (status.GetType() == lldb::eErrorTypePOSIX && + status.GetError() == ETIMEDOUT) + return make_error(); + return status.takeError(); + } + } + + std::string data; + data.resize(length); + Status status = descriptor.Read(data.data(), length); + if (status.Fail()) + return status.takeError(); + + // Read returns '' on EOF. + if (length == 0) + return make_error(); + + // Return the actual number of bytes read.
+ return data.substr(0, length); +} + +static Expected +ReadUntil(IOObject &descriptor, StringRef delimiter, + std::optional timeout = std::nullopt) { + std::string buffer; + buffer.reserve(delimiter.size() + 1); + while (!llvm::StringRef(buffer).ends_with(delimiter)) { + Expected next = + ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout); + if (auto Err = next.takeError()) + return std::move(Err); + buffer += *next; + } + return buffer.substr(0, buffer.size() - delimiter.size()); +} + +JSONTransport::JSONTransport(IOObjectSP input, IOObjectSP output) + : m_input(std::move(input)), m_output(std::move(output)) {} + +void JSONTransport::Log(llvm::StringRef message) { + LLDB_LOG(GetLog(LLDBLog::Host), "{0}", message); +} + +Expected +HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) { + if (!m_input || !m_input->IsValid()) + return createStringError("transport input is closed"); + + IOObject *input = m_input.get(); + Expected message_header = + ReadFull(*input, kHeaderContentLength.size(), timeout); + if (!message_header) + return message_header.takeError(); + if (*message_header != kHeaderContentLength) + return createStringError(formatv("expected '{0}' and got '{1}'", + kHeaderContentLength, *message_header) + .str()); + + Expected raw_length = ReadUntil(*input, kHeaderSeparator); + if (!raw_length) + return handleErrors(raw_length.takeError(), + [&](const TransportEOFError &E) -> llvm::Error { + return createStringError( + "unexpected EOF while reading header separator"); + }); + + size_t length; + if (!to_integer(*raw_length, length)) + return createStringError( + formatv("invalid content length {0}", *raw_length).str()); + + Expected raw_json = ReadFull(*input, length); + if (!raw_json) + return handleErrors( + raw_json.takeError(), [&](const TransportEOFError &E) -> llvm::Error { + return createStringError("unexpected EOF while reading JSON"); + }); + + Log(llvm::formatv("--> {0}", *raw_json).str()); + + return
raw_json; +} + +Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) { + if (!m_output || !m_output->IsValid()) + return llvm::make_error(); + + Log(llvm::formatv("<-- {0}", message).str()); + + std::string Output; + raw_string_ostream OS(Output); + OS << kHeaderContentLength << message.length() << kHeaderSeparator << message; + size_t num_bytes = Output.size(); + return m_output->Write(Output.data(), num_bytes).takeError(); +} + +char TransportEOFError::ID; +char TransportTimeoutError::ID; +char TransportClosedError::ID; diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index b034c967594b..9fe8227cd2d6 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -70,6 +70,7 @@ using namespace lldb_dap; using namespace lldb_dap::protocol; +using namespace lldb_private; namespace { #ifdef _WIN32 @@ -893,14 +894,14 @@ llvm::Error DAP::Loop() { while (!disconnecting) { llvm::Expected next = - transport.Read(std::chrono::seconds(1)); - if (next.errorIsA()) { + transport.Read(std::chrono::seconds(1)); + if (next.errorIsA()) { consumeError(next.takeError()); break; } // If the read timed out, continue to check if we should disconnect. 
- if (next.errorIsA()) { + if (next.errorIsA()) { consumeError(next.takeError()); continue; } diff --git a/lldb/tools/lldb-dap/Transport.cpp b/lldb/tools/lldb-dap/Transport.cpp index 4e322e9ff135..d602920da34e 100644 --- a/lldb/tools/lldb-dap/Transport.cpp +++ b/lldb/tools/lldb-dap/Transport.cpp @@ -8,152 +8,19 @@ #include "Transport.h" #include "DAPLog.h" -#include "Protocol/ProtocolBase.h" -#include "lldb/Utility/IOObject.h" -#include "lldb/Utility/SelectHelper.h" -#include "lldb/Utility/Status.h" #include "lldb/lldb-forward.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include using namespace llvm; using namespace lldb; using namespace lldb_private; using namespace lldb_dap; -using namespace lldb_dap::protocol; -/// ReadFull attempts to read the specified number of bytes. If EOF is -/// encountered, an empty string is returned. -static Expected -ReadFull(IOObject &descriptor, size_t length, - std::optional timeout = std::nullopt) { - if (!descriptor.IsValid()) - return createStringError("transport output is closed"); +Transport::Transport(llvm::StringRef client_name, lldb_dap::Log *log, + lldb::IOObjectSP input, lldb::IOObjectSP output) + : HTTPDelimitedJSONTransport(input, output), m_client_name(client_name), + m_log(log) {} - bool timeout_supported = true; - // FIXME: SelectHelper does not work with NativeFile on Win32. -#if _WIN32 - timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket; -#endif - - if (timeout && timeout_supported) { - SelectHelper sh; - sh.SetTimeout(*timeout); - sh.FDSetRead(descriptor.GetWaitableHandle()); - Status status = sh.Select(); - if (status.Fail()) { - // Convert timeouts into a specific error. 
- if (status.GetType() == lldb::eErrorTypePOSIX && - status.GetError() == ETIMEDOUT) - return make_error(); - return status.takeError(); - } - } - - std::string data; - data.resize(length); - Status status = descriptor.Read(data.data(), length); - if (status.Fail()) - return status.takeError(); - - // Read returns '' on EOF. - if (length == 0) - return make_error(); - - // Return the actual number of bytes read. - return data.substr(0, length); +void Transport::Log(llvm::StringRef message) { + DAP_LOG(m_log, "({0}) {1}", m_client_name, message); } - -static Expected -ReadUntil(IOObject &descriptor, StringRef delimiter, - std::optional timeout = std::nullopt) { - std::string buffer; - buffer.reserve(delimiter.size() + 1); - while (!llvm::StringRef(buffer).ends_with(delimiter)) { - Expected next = - ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout); - if (auto Err = next.takeError()) - return std::move(Err); - buffer += *next; - } - return buffer.substr(0, buffer.size() - delimiter.size()); -} - -/// DAP message format -/// ``` -/// Content-Length: (?\d+)\r\n\r\n(?.{\k}) -/// ``` -static constexpr StringLiteral kHeaderContentLength = "Content-Length: "; -static constexpr StringLiteral kHeaderSeparator = "\r\n\r\n"; - -namespace lldb_dap { - -char EndOfFileError::ID; -char TimeoutError::ID; - -Transport::Transport(StringRef client_name, Log *log, IOObjectSP input, - IOObjectSP output) - : m_client_name(client_name), m_log(log), m_input(std::move(input)), - m_output(std::move(output)) {} - -Expected Transport::Read(const std::chrono::microseconds &timeout) { - if (!m_input || !m_input->IsValid()) - return createStringError("transport output is closed"); - - IOObject *input = m_input.get(); - Expected message_header = - ReadFull(*input, kHeaderContentLength.size(), timeout); - if (!message_header) - return message_header.takeError(); - if (*message_header != kHeaderContentLength) - return createStringError(formatv("expected '{0}' and got '{1}'", - 
kHeaderContentLength, *message_header) - .str()); - - Expected raw_length = ReadUntil(*input, kHeaderSeparator); - if (!raw_length) - return handleErrors(raw_length.takeError(), - [&](const EndOfFileError &E) -> llvm::Error { - return createStringError( - "unexpected EOF while reading header separator"); - }); - - size_t length; - if (!to_integer(*raw_length, length)) - return createStringError( - formatv("invalid content length {0}", *raw_length).str()); - - Expected raw_json = ReadFull(*input, length); - if (!raw_json) - return handleErrors( - raw_json.takeError(), [&](const EndOfFileError &E) -> llvm::Error { - return createStringError("unexpected EOF while reading JSON"); - }); - - DAP_LOG(m_log, "--> ({0}) {1}", m_client_name, *raw_json); - - return json::parse(/*JSON=*/*raw_json, - /*RootName=*/"protocol_message"); -} - -Error Transport::Write(const Message &message) { - if (!m_output || !m_output->IsValid()) - return createStringError("transport output is closed"); - - std::string json = formatv("{0}", toJSON(message)).str(); - - DAP_LOG(m_log, "<-- ({0}) {1}", m_client_name, json); - - std::string Output; - raw_string_ostream OS(Output); - OS << kHeaderContentLength << json.length() << kHeaderSeparator << json; - size_t num_bytes = Output.size(); - return m_output->Write(Output.data(), num_bytes).takeError(); -} - -} // end namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Transport.h b/lldb/tools/lldb-dap/Transport.h index 4e347eaa5131..51f62e718a0d 100644 --- a/lldb/tools/lldb-dap/Transport.h +++ b/lldb/tools/lldb-dap/Transport.h @@ -15,70 +15,21 @@ #define LLDB_TOOLS_LLDB_DAP_TRANSPORT_H #include "DAPForward.h" -#include "Protocol/ProtocolBase.h" +#include "lldb/Host/JSONTransport.h" #include "lldb/lldb-forward.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Error.h" -#include -#include namespace lldb_dap { -class EndOfFileError : public llvm::ErrorInfo { -public: - static char ID; - - EndOfFileError() = default; - - void 
log(llvm::raw_ostream &OS) const override { - OS << "end of file reached"; - } - std::error_code convertToErrorCode() const override { - return llvm::inconvertibleErrorCode(); - } -}; - -class TimeoutError : public llvm::ErrorInfo { -public: - static char ID; - - TimeoutError() = default; - - void log(llvm::raw_ostream &OS) const override { - OS << "operation timed out"; - } - std::error_code convertToErrorCode() const override { - return std::make_error_code(std::errc::timed_out); - } -}; - /// A transport class that performs the Debug Adapter Protocol communication /// with the client. -class Transport { +class Transport : public lldb_private::HTTPDelimitedJSONTransport { public: - Transport(llvm::StringRef client_name, Log *log, lldb::IOObjectSP input, - lldb::IOObjectSP output); - ~Transport() = default; + Transport(llvm::StringRef client_name, lldb_dap::Log *log, + lldb::IOObjectSP input, lldb::IOObjectSP output); + virtual ~Transport() = default; - /// Transport is not copyable. - /// @{ - Transport(const Transport &rhs) = delete; - void operator=(const Transport &rhs) = delete; - /// @} - - /// Writes a Debug Adater Protocol message to the output stream. - llvm::Error Write(const protocol::Message &M); - - /// Reads the next Debug Adater Protocol message from the input stream. - /// - /// \param timeout[in] - /// A timeout to wait for reading the initial header. Once a message - /// header is recieved, this will block until the full message is - /// read. - /// - /// \returns Returns the next protocol message. - llvm::Expected - Read(const std::chrono::microseconds &timeout); + virtual void Log(llvm::StringRef message) override; /// Returns the name of this transport client, for example `stdin/stdout` or /// `client_1`. 
@@ -86,9 +37,7 @@ public: private: llvm::StringRef m_client_name; - Log *m_log; - lldb::IOObjectSP m_input; - lldb::IOObjectSP m_output; + lldb_dap::Log *m_log; }; } // namespace lldb_dap diff --git a/lldb/unittests/DAP/DAPTest.cpp b/lldb/unittests/DAP/DAPTest.cpp index 5fb6bf7e564a..40ffaf87c9c4 100644 --- a/lldb/unittests/DAP/DAPTest.cpp +++ b/lldb/unittests/DAP/DAPTest.cpp @@ -32,7 +32,8 @@ TEST_F(DAPTest, SendProtocolMessages) { /*transport=*/*to_dap, }; dap.Send(Event{/*event=*/"my-event", /*body=*/std::nullopt}); - ASSERT_THAT_EXPECTED(from_dap->Read(std::chrono::milliseconds(1)), - HasValue(testing::VariantWith(testing::FieldsAre( - /*event=*/"my-event", /*body=*/std::nullopt)))); + ASSERT_THAT_EXPECTED( + from_dap->Read(std::chrono::milliseconds(1)), + HasValue(testing::VariantWith(testing::FieldsAre( + /*event=*/"my-event", /*body=*/std::nullopt)))); } diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp index 388d1b901507..4063b3425031 100644 --- a/lldb/unittests/DAP/TestBase.cpp +++ b/lldb/unittests/DAP/TestBase.cpp @@ -122,7 +122,8 @@ std::vector DAPTestBase::DrainOutput() { std::vector msgs; output.CloseWriteFileDescriptor(); while (true) { - Expected next = from_dap->Read(std::chrono::milliseconds(1)); + Expected next = + from_dap->Read(std::chrono::milliseconds(1)); if (!next) { consumeError(next.takeError()); break; diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp index e6dab42e3094..aaf257993af2 100644 --- a/lldb/unittests/DAP/TransportTest.cpp +++ b/lldb/unittests/DAP/TransportTest.cpp @@ -26,6 +26,8 @@ using namespace lldb_dap::protocol; using lldb_private::File; using lldb_private::NativeFile; using lldb_private::Pipe; +using lldb_private::TransportEOFError; +using lldb_private::TransportTimeoutError; class TransportTest : public PipeBase { protected: @@ -50,7 +52,7 @@ TEST_F(TransportTest, MalformedRequests) { input.Write(malformed_header.data(), malformed_header.size()), 
Succeeded()); ASSERT_THAT_EXPECTED( - transport->Read(std::chrono::milliseconds(1)), + transport->Read(std::chrono::milliseconds(1)), FailedWithMessage( "expected 'Content-Length: ' and got 'COnTent-LenGth: '")); } @@ -63,20 +65,22 @@ TEST_F(TransportTest, Read) { ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()), Succeeded()); ASSERT_THAT_EXPECTED( - transport->Read(std::chrono::milliseconds(1)), + transport->Read(std::chrono::milliseconds(1)), HasValue(testing::VariantWith(testing::FieldsAre( /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt)))); } TEST_F(TransportTest, ReadWithTimeout) { - ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)), - Failed()); + ASSERT_THAT_EXPECTED( + transport->Read(std::chrono::milliseconds(1)), + Failed()); } TEST_F(TransportTest, ReadWithEOF) { input.CloseWriteFileDescriptor(); - ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)), - Failed()); + ASSERT_THAT_EXPECTED( + transport->Read(std::chrono::milliseconds(1)), + Failed()); } TEST_F(TransportTest, Write) { From faa49d6662b4c14438cc8e63a3751c22f28d2481 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 12 Jun 2025 02:53:03 +0000 Subject: [PATCH 0080/1322] [gn build] Port de51b2dd3c6f --- llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn index ca1acf9ba8aa..b00442d8e1eb 100644 --- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn @@ -27,6 +27,7 @@ static_library("Host") { "common/HostNativeThreadBase.cpp", "common/HostProcess.cpp", "common/HostThread.cpp", + "common/JSONTransport.cpp", "common/LZMA.cpp", "common/LockFileBase.cpp", "common/MainLoopBase.cpp", From d8118ed6db28a3caaf3fa4a4f8d0d51d33b09c30 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 20:00:45 -0700 Subject: [PATCH 0081/1322] 
[ELF,test] Improve weak-undef-rw.s --- lld/test/ELF/weak-undef-rw.s | 54 +++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s index bbc37ba49304..902cad87aba9 100644 --- a/lld/test/ELF/weak-undef-rw.s +++ b/lld/test/ELF/weak-undef-rw.s @@ -3,12 +3,17 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o # RUN: llvm-mc -filetype=obj -triple=x86_64 c.s -o c.o -# RUN: ld.lld a.o -o nopie --export-dynamic -# RUN: llvm-readelf -r --hex-dump=.data nopie | FileCheck %s --check-prefix=STATIC -# RUN: ld.lld a.o -o out.pie -pie -# RUN: llvm-readelf -r --hex-dump=.data out.pie | FileCheck %s --check-prefix=STATIC -# RUN: ld.lld a.o -o out.so -shared -# RUN: llvm-readobj -r out.so | FileCheck %s --check-prefix=PIC +# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o +# RUN: ld.lld -shared s.o -o s.so + +# RUN: ld.lld a.o -o a --export-dynamic +# RUN: llvm-readelf -r --hex-dump=.data a | FileCheck %s --check-prefix=STATIC +# RUN: ld.lld a.o s.so -o as +# RUN: llvm-readelf -r --hex-dump=.data as | FileCheck %s --check-prefix=STATIC +# RUN: ld.lld a.o -o a.pie -pie +# RUN: llvm-readelf -r --hex-dump=.data a.pie | FileCheck %s --check-prefix=STATIC +# RUN: ld.lld a.o -o a.so -shared +# RUN: llvm-readelf -r a.so | FileCheck %s --check-prefix=DYN ## gABI leaves the behavior of weak undefined references implementation defined. ## We choose to resolve them statically for static linking and produce dynamic relocations @@ -19,35 +24,44 @@ # STATIC: no relocations # STATIC: Hex dump of section '.data': -# STATIC-NEXT: {{.*}} 00000000 00000000 . +# STATIC-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . 
# STATIC-EMPTY: -# PIC: .rela.dyn { -# PIC-NEXT: R_X86_64_64 foobar 0x0 -# PIC-NEXT: } +# DYN: Relocation section '.rela.dyn' {{.*}} contains 2 +# DYN: R_X86_64_64 0000000000000000 foobar + 0{{$}} -# RUN: ld.lld a.o b.o -o out1 -z undefs -# RUN: llvm-readelf -r -x .data out1 | FileCheck %s --check-prefix=STATIC1 -# RUN: ld.lld a.o b.o -o out1.pie -pie -z undefs -# RUN: llvm-readelf -r -x .data out1.pie | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o -o ab -z undefs +# RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o s.so -o abs -z undefs +# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1 +# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs +# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1 # STATIC1: no relocations # STATIC1: Hex dump of section '.data': -# STATIC1-NEXT: {{.*}} 00000000 00000000 00000000 00000000 . +# STATIC1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . +# STATIC1-NEXT: {{.*}} 05000000 00000000 . # STATIC1-EMPTY: +# DYN1: Relocation section '.rela.dyn' {{.*}} contains 1 +# DYN1: Hex dump of section '.data': +# DYN1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . +# DYN1-NEXT: {{.*}} 00000000 00000000 . 
+# DYN1-EMPTY: + # RUN: ld.lld a.o b.o c.o -pie -z undefs 2>&1 | count 0 #--- a.s - .global _start +.global _start _start: - .data - .weak foobar - .quad foobar +.data +.weak foobar +.quad foobar +.quad foobar+3 #--- b.s .data -.quad undef +.quad undef+5 #--- c.s call undef From b46f34452e9dec50eee6ddbe07875f05e421a81c Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Wed, 11 Jun 2025 20:22:08 -0700 Subject: [PATCH 0082/1322] libunwind: Do not use __attribute__((target("gcs"))) with non-clang compilers (#138077) This attribute is unsupported in GCC, so far it worked because before GCC15 did not define this macros in _CHKFEAT_GCS in arm_acle.h [1] With gcc15 compiler libunwind's check for this macros is succeeding and it ends up enabling 'gcs' by using function attribute, this works with clang but not with gcc. We can see this in rust compiler bootstrap for aarch64/musl when system uses gcc15, it ends up with these errors Building libunwind.a for aarch64-poky-linux-musl ``` cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:191:1: error: arch extension 'gcs' should be prefixed by '+' cargo:warning= 191 | unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { cargo:warning= | ^~~~~~~~~~~~~ cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:337:22: error: arch extension 'gcs' should be prefixed by '+' cargo:warning= 337 | _Unwind_Stop_Fn stop, void *stop_parameter) { cargo:warning= | ^~~~~~~~~~~~~~~ ``` [1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5a6af707f0af Signed-off-by: Khem Raj --- libunwind/src/UnwindLevel1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c index a258a832a9c3..f3b451ad9b73 100644 --- 
a/libunwind/src/UnwindLevel1.c +++ b/libunwind/src/UnwindLevel1.c @@ -188,10 +188,11 @@ extern int __unw_step_stage2(unw_cursor_t *); #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. -__attribute__((target("gcs"))) +__attribute__((target("+gcs"))) #endif static _Unwind_Reason_Code -unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { +unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, + _Unwind_Exception *exception_object) { __unw_init_local(cursor, uc); _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_obj=%p)", @@ -332,12 +333,12 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. -__attribute__((target("gcs"))) +__attribute__((target("+gcs"))) #endif static _Unwind_Reason_Code unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, - _Unwind_Exception *exception_object, - _Unwind_Stop_Fn stop, void *stop_parameter) { + _Unwind_Exception *exception_object, _Unwind_Stop_Fn stop, + void *stop_parameter) { __unw_init_local(cursor, uc); // uc is initialized by __unw_getcontext in the parent frame. The first stack @@ -443,7 +444,6 @@ unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, return _URC_FATAL_PHASE2_ERROR; } - /// Called by __cxa_throw. Only returns if there is a fatal error. _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *exception_object) { From a71210e5abdbae80363cb5956a24a2004f625ca6 Mon Sep 17 00:00:00 2001 From: Kewen12 Date: Wed, 11 Jun 2025 20:24:56 -0700 Subject: [PATCH 0083/1322] Revert "[libc] Fix stdio tests after #143802" (#143824) Reverts llvm/llvm-project#143810 This PR breaks our buildbot: https://lab.llvm.org/buildbot/#/builders/10/builds/7159 revert to unblock downstream merge. 
--- libc/docs/configure.rst | 2 +- libc/test/src/stdio/fgetc_test.cpp | 1 - libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 - libc/test/src/stdio/fgets_test.cpp | 1 - libc/test/src/stdio/setvbuf_test.cpp | 1 - 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 109412225634..8d53390ae19b 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -29,7 +29,7 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack. - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience. * **"errno" options** - - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE. + - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 1faa49112fb6..7c652f666a8f 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -33,7 +33,6 @@ public: // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 7b2efe642fb5..f4471dd82df1 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -36,7 +36,6 @@ public: // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index 2d7c68d49081..c00a9256af52 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -36,7 +36,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index a0936ba79ef7..4144bc1bef44 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -11,7 +11,6 @@ #include "src/stdio/fread.h" #include "src/stdio/fwrite.h" #include "src/stdio/setvbuf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" From 968d8eaa44c500259fe8d56ad77ec1c71cad35e2 Mon Sep 17 00:00:00 2001 From: Yang Zaizhou <91008302+Mxfg-incense@users.noreply.github.com> Date: Thu, 12 Jun 2025 11:28:57 +0800 Subject: [PATCH 0084/1322] [OpenMP][Flang]Fix omp_get_cancellation return type from integer to logical (#142990) --- openmp/runtime/src/include/omp_lib.F90.var | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/runtime/src/include/omp_lib.F90.var 
b/openmp/runtime/src/include/omp_lib.F90.var index 3463b698291e..20639f60b5d9 100644 --- a/openmp/runtime/src/include/omp_lib.F90.var +++ b/openmp/runtime/src/include/omp_lib.F90.var @@ -399,7 +399,7 @@ function omp_get_cancellation() bind(c) use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_cancellation + logical (kind=omp_logical_kind) omp_get_cancellation end function omp_get_cancellation function omp_is_initial_device() bind(c) From 2fcaa00d1e2317a90c9071b735eb0e758b5dd58b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 20:37:15 -0700 Subject: [PATCH 0085/1322] [ELF] -z undefs: handle relocations referencing undefined non-weak like undefined weak * Merge the special case into isStaticLinkTimeConstant * Generalize isUndefWeak to isUndefined. undefined non-weak is an error case. We choose to be general, which also brings us in line with GNU ld. --- lld/ELF/Relocations.cpp | 25 ++++++++++--------------- lld/test/ELF/weak-undef-rw.s | 12 +++++++----- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 1af01e7247dc..6c4209a2b81e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -990,10 +990,17 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, // only the low bits are used. if (e == R_GOT || e == R_PLT) return ctx.target->usesOnlyLowPageBits(type) || !ctx.arg.isPic; - // R_AARCH64_AUTH_ABS64 requires a dynamic relocation. - if (sym.isPreemptible || e == RE_AARCH64_AUTH) + if (e == RE_AARCH64_AUTH) return false; + + // The behavior of an undefined weak reference is implementation defined. + // (We treat undefined non-weak the same as undefined weak.) For static + // -no-pie linking, dynamic relocations are generally avoided (except + // IRELATIVE). Emitting dynamic relocations for -shared aligns with its -z + // undefs default. Dynamic -no-pie linking and -pie allow flexibility. 
+ if (sym.isPreemptible) + return sym.isUndefined() && !ctx.arg.isPic; if (!ctx.arg.isPic) return true; @@ -1113,19 +1120,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, // If the relocation is known to be a link-time constant, we know no dynamic // relocation will be created, pass the control to relocateAlloc() or // relocateNonAlloc() to resolve it. - // - // The behavior of an undefined weak reference is implementation defined. For - // non-link-time constants, we resolve relocations statically (let - // relocate{,Non}Alloc() resolve them) for -no-pie and try producing dynamic - // relocations for -pie and -shared. - // - // The general expectation of -no-pie static linking is that there is no - // dynamic relocation (except IRELATIVE). Emitting dynamic relocations for - // -shared matches the spirit of its -z undefs default. -pie has freedom on - // choices, and we choose dynamic relocations to be consistent with the - // handling of GOT-generating relocations. 
- if (isStaticLinkTimeConstant(expr, type, sym, offset) || - (!ctx.arg.isPic && sym.isUndefWeak())) { + if (isStaticLinkTimeConstant(expr, type, sym, offset)) { sec->addReloc({expr, type, offset, addend, &sym}); return; } diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s index 902cad87aba9..497228a3cf90 100644 --- a/lld/test/ELF/weak-undef-rw.s +++ b/lld/test/ELF/weak-undef-rw.s @@ -33,9 +33,11 @@ # RUN: ld.lld a.o b.o -o ab -z undefs # RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1 # RUN: ld.lld a.o b.o s.so -o abs -z undefs -# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1 -# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs -# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1 +# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o -o ab.pie -pie -z undefs +# RUN: llvm-readelf -r -x .data ab.pie | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o s.so -o abs.pie -pie -z undefs +# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=DYN1 # STATIC1: no relocations # STATIC1: Hex dump of section '.data': @@ -43,9 +45,9 @@ # STATIC1-NEXT: {{.*}} 05000000 00000000 . # STATIC1-EMPTY: -# DYN1: Relocation section '.rela.dyn' {{.*}} contains 1 +# DYN1: Relocation section '.rela.dyn' {{.*}} contains 3 # DYN1: Hex dump of section '.data': -# DYN1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . +# DYN1-NEXT: {{.*}} 00000000 00000000 00000000 00000000 . # DYN1-NEXT: {{.*}} 00000000 00000000 . # DYN1-EMPTY: From 5f231db76482bbdd3e658d8e9797cbd46837d4e1 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 12 Jun 2025 11:41:52 +0800 Subject: [PATCH 0086/1322] [RISCV] Use StringRef for RequiredExtensions in RVVIntrinsicDef (#143503) This prevents many duplicated copies of required extensions string. 
--- clang/lib/Sema/SemaRISCV.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index 9f70be746eb3..9eab0c2a0df6 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -47,7 +47,7 @@ struct RVVIntrinsicDef { std::string BuiltinName; /// Mapping to RequiredFeatures in riscv_vector.td - std::string RequiredExtensions; + StringRef RequiredExtensions; /// Function signature, first element is return type. RVVTypes Signature; From f09050fdc85074869f0b34f0d9e061a74ef549ee Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 11:35:44 +0800 Subject: [PATCH 0087/1322] [C++20] [Modules] Fix module local lookup ambiguousity Close https://github.com/llvm/llvm-project/issues/61360 Close https://github.com/llvm/llvm-project/issues/129525 Close https://github.com/llvm/llvm-project/issues/143734 We shouldn't identify different module local decls in different modules as the same entity. --- clang/include/clang/AST/ASTContext.h | 6 ++-- clang/include/clang/AST/DeclBase.h | 4 +++ clang/lib/AST/ASTContext.cpp | 8 ++++- clang/lib/AST/DeclBase.cpp | 6 ++++ .../Modules/module-local-declarations-02.cppm | 31 +++++++++++++++++++ clang/test/Modules/pr61360.cppm | 25 +++++++++++++++ 6 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 clang/test/Modules/module-local-declarations-02.cppm create mode 100644 clang/test/Modules/pr61360.cppm diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 8d24d393eab0..3abb49312255 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -488,8 +488,8 @@ class ASTContext : public RefCountedBase { /// if possible. /// /// Not serialized intentionally. 
- llvm::StringMap PrimaryModuleNameMap; - llvm::DenseMap SameModuleLookupSet; + mutable llvm::StringMap PrimaryModuleNameMap; + mutable llvm::DenseMap SameModuleLookupSet; static constexpr unsigned ConstantArrayTypesLog2InitSize = 8; static constexpr unsigned GeneralTypesLog2InitSize = 9; @@ -1151,7 +1151,7 @@ public: /// /// FIXME: The signature may be confusing since `clang::Module` means to /// a module fragment or a module unit but not a C++20 module. - bool isInSameModule(const Module *M1, const Module *M2); + bool isInSameModule(const Module *M1, const Module *M2) const; TranslationUnitDecl *getTranslationUnitDecl() const { return TUDecl->getMostRecentDecl(); diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 375e9e259250..dd67ebc9873f 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -646,6 +646,10 @@ public: return getModuleOwnershipKind() == ModuleOwnershipKind::ModulePrivate; } + /// Whether this declaration was a local declaration to a C++20 + /// named module. + bool isModuleLocal() const; + /// Whether this declaration was exported in a lexical context. /// e.g.: /// diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index b51f7622288d..4d44f23c0f50 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1175,7 +1175,7 @@ void ASTContext::setCurrentNamedModule(Module *M) { CurrentCXXNamedModule = M; } -bool ASTContext::isInSameModule(const Module *M1, const Module *M2) { +bool ASTContext::isInSameModule(const Module *M1, const Module *M2) const { if (!M1 != !M2) return false; @@ -7429,6 +7429,12 @@ bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const { cast(Y->getDeclContext()->getRedeclContext()))) return false; + // If either X or Y are local to the owning module, they are only possible to + // be the same entity if they are in the same module. 
+ if (X->isModuleLocal() || Y->isModuleLocal()) + if (!isInSameModule(X->getOwningModule(), Y->getOwningModule())) + return false; + // Two typedefs refer to the same entity if they have the same underlying // type. if (const auto *TypedefX = dyn_cast(X)) diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index a1bb62bcb68f..48c60aa4e449 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1132,6 +1132,12 @@ bool Decl::isInExportDeclContext() const { return isa_and_nonnull(DC); } +bool Decl::isModuleLocal() const { + auto *M = getOwningModule(); + return M && M->isNamedModule() && + getModuleOwnershipKind() == ModuleOwnershipKind::ReachableWhenImported; +} + bool Decl::isInAnotherModuleUnit() const { auto *M = getOwningModule(); diff --git a/clang/test/Modules/module-local-declarations-02.cppm b/clang/test/Modules/module-local-declarations-02.cppm new file mode 100644 index 000000000000..0670c4295abc --- /dev/null +++ b/clang/test/Modules/module-local-declarations-02.cppm @@ -0,0 +1,31 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll + +//--- A.cppm +export module A; + +export template +struct holder { +}; + +struct foo {}; + +export struct a { + holder m; +}; + +//--- B.cppm +// expected-no-diagnostics +export module B; + +import A; + +struct foo {}; + +struct b { + holder m; +}; \ No newline at end of file diff --git a/clang/test/Modules/pr61360.cppm b/clang/test/Modules/pr61360.cppm new file mode 100644 index 000000000000..a16f65d4be2f --- /dev/null +++ b/clang/test/Modules/pr61360.cppm @@ -0,0 +1,25 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll + 
+//--- A.cppm +export module A; +export template +struct holder { +}; + +struct a { + holder m; +}; + +//--- B.cppm +// expected-no-diagnostics +export module B; +import A; + +struct b { + holder m; +}; From 282e471018d234f78b0990100834532389877519 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Thu, 12 Jun 2025 05:58:55 +0200 Subject: [PATCH 0088/1322] [flang] Erase `fir.local` ops before lowering `fir` to `llvm` (#143687) `fir.local` ops are not supposed to have any uses at this point (i.e. during lowering to LLVM). In case of serialization, the `fir.do_concurrent` users are expected to have been lowered to `fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent` users are expected to have been lowered to the target parallel model (e.g. OpenMP). This hopefully resolved a build issue introduced by https://github.com/llvm/llvm-project/pull/142567 (see for example: https://lab.llvm.org/buildbot/#/builders/199/builds/4009). --- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 42 +++++++++++++++++++------ flang/test/Fir/local.fir | 10 ++++++ 2 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 flang/test/Fir/local.fir diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 82d960a6fc61..a3de3ae9d116 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3294,6 +3294,30 @@ struct LoadOpConversion : public fir::FIROpConversion { } }; +struct LocalitySpecifierOpConversion + : public fir::FIROpConversion { + using FIROpConversion::FIROpConversion; + llvm::LogicalResult + matchAndRewrite(fir::LocalitySpecifierOp localizer, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { +#ifdef EXPENSIVE_CHECKS + auto uses = mlir::SymbolTable::getSymbolUses( + localizer, localizer->getParentOfType()); + + // `fir.local` ops are not supposed to have any uses at this point (i.e. + // during lowering to LLVM). 
In case of serialization, the + // `fir.do_concurrent` users are expected to have been lowered to + // `fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent` + // users are expected to have been lowered to the target parallel model + // (e.g. OpenMP). + assert(uses && uses->empty()); +#endif + + rewriter.eraseOp(localizer); + return mlir::success(); + } +}; + /// Lower `fir.no_reassoc` to LLVM IR dialect. /// TODO: how do we want to enforce this in LLVM-IR? Can we manipulate the fast /// math flags? @@ -4249,15 +4273,15 @@ void fir::populateFIRToLLVMConversionPatterns( FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, GlobalOpConversion, InsertOnRangeOpConversion, IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion, - MulcOpConversion, NegcOpConversion, NoReassocOpConversion, - SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion, - SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion, - ShiftOpConversion, SliceOpConversion, StoreOpConversion, - StringLitOpConversion, SubcOpConversion, TypeDescOpConversion, - TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion, - UndefOpConversion, UnreachableOpConversion, XArrayCoorOpConversion, - XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter, - options); + LocalitySpecifierOpConversion, MulcOpConversion, NegcOpConversion, + NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion, + SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion, + ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion, + StoreOpConversion, StringLitOpConversion, SubcOpConversion, + TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion, + UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, + XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion, + ZeroOpConversion>(converter, options); // Patterns that are populated without a type converter do not trigger // 
target materializations for the operands of the root op. diff --git a/flang/test/Fir/local.fir b/flang/test/Fir/local.fir new file mode 100644 index 000000000000..006f5ca94467 --- /dev/null +++ b/flang/test/Fir/local.fir @@ -0,0 +1,10 @@ +// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s + +// Tests that `fir.local` ops are dropped from the module before LLVM lowering. + +fir.local {type = local} @local_privatizer : i32 +func.func @foo() { + return +} + +// CHECK-NOT: fir.local From c3be4524a56ba01bc1f868fc37e329f24ec5041c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 21:23:06 -0700 Subject: [PATCH 0089/1322] [ELF,test] Improve weak-undef-got-plt.s --- lld/test/ELF/weak-undef-got-pie.s | 22 -------------------- lld/test/ELF/weak-undef-got-plt.s | 34 +++++++++++++++++++++++++++++++ lld/test/ELF/weak-undef.s | 31 ---------------------------- 3 files changed, 34 insertions(+), 53 deletions(-) delete mode 100644 lld/test/ELF/weak-undef-got-pie.s create mode 100644 lld/test/ELF/weak-undef-got-plt.s delete mode 100644 lld/test/ELF/weak-undef.s diff --git a/lld/test/ELF/weak-undef-got-pie.s b/lld/test/ELF/weak-undef-got-pie.s deleted file mode 100644 index 2301400f4e0b..000000000000 --- a/lld/test/ELF/weak-undef-got-pie.s +++ /dev/null @@ -1,22 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o -# RUN: ld.lld %t1.o -shared -o %t1.so -# RUN: llvm-mc -filetype=obj -x86-relax-relocations=false -triple=x86_64 %s -o %t.o - -# RUN: ld.lld -pie %t.o %t1.so -o %t -# RUN: llvm-readobj -r %t | FileCheck --check-prefix=RELOCS %s -# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=DISASM %s - -# RELOCS: Relocations [ -# RELOCS-NEXT: Section ({{.*}}) .rela.dyn { -# RELOCS-NEXT: R_X86_64_GLOB_DAT foo 0x0 -# RELOCS-NEXT: } -# RELOCS-NEXT: ] - -.weak foo - -.globl _start -_start: -# DISASM: <_start>: -# DISASM-NEXT: movq {{.*}}(%rip), %rax -mov foo@gotpcrel(%rip), %rax diff --git 
a/lld/test/ELF/weak-undef-got-plt.s b/lld/test/ELF/weak-undef-got-plt.s new file mode 100644 index 000000000000..0ee3da2cd3b4 --- /dev/null +++ b/lld/test/ELF/weak-undef-got-plt.s @@ -0,0 +1,34 @@ +# REQUIRES: x86 +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 -x86-relax-relocations=false a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o +# RUN: ld.lld -shared s.o -o s.so + +# RUN: ld.lld a.o -o a +# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC +# RUN: ld.lld a.o s.so -o as +# RUN: llvm-objdump -dR as | FileCheck %s + +# RUN: ld.lld -pie a.o s.so -o as.pie +# RUN: llvm-objdump -dR as.pie | FileCheck %s + +# RUN: ld.lld -shared a.o -o a.so +# RUN: llvm-objdump -dR a.so | FileCheck %s + +# NORELOC: no relocation + +# CHECK: TYPE VALUE +# CHECK-NEXT: R_X86_64_GLOB_DAT foo{{$}} +# CHECK-NEXT: R_X86_64_JUMP_SLOT foo{{$}} +# CHECK-EMPTY: +# CHECK: <_start>: +# CHECK-NEXT: movq {{.*}}(%rip), %rax +# CHECK-NEXT: callq {{.*}} + +#--- a.s +.weak foo + +.globl _start +_start: +mov foo@gotpcrel(%rip), %rax +call foo diff --git a/lld/test/ELF/weak-undef.s b/lld/test/ELF/weak-undef.s deleted file mode 100644 index 21488023a79e..000000000000 --- a/lld/test/ELF/weak-undef.s +++ /dev/null @@ -1,31 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o -# RUN: ld.lld %t.o -o %t --export-dynamic -# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \ -# RUN: FileCheck %s --check-prefixes=NORELOC,COMMON - -# NORELOC: There are no relocations in this file. 
- -# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o -# RUN: ld.lld %t1.o -shared -o %t1.so -# RUN: ld.lld %t.o -o %t %t1.so -pie -# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \ -# RUN: FileCheck %s --check-prefixes=RELOC,COMMON - -# RELOC: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: -# RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend -# RELOC-NEXT: {{.*}} 0000000100000001 R_X86_64_64 0000000000000000 foo + 0 - -# NORELOC-NOT: Symbol table '.dynsym' -# RELOC: Symbol table '.dynsym' contains 2 entries: -# RELOC-NEXT: Num: Value Size Type Bind Vis Ndx Name -# RELOC-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND -# RELOC-NEXT: 1: 0000000000000000 0 NOTYPE WEAK DEFAULT UND foo -# COMMON: Hex dump of section '.data': -# COMMON-NEXT: {{.*}} 00000000 00000000 -# COMMON-EMPTY: - -.weak foo - -.data - .dc.a foo From a93e55e57ed00a55f822c64e3520c7c732b58480 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 11 Jun 2025 21:33:46 -0700 Subject: [PATCH 0090/1322] Revert "[libc] Migrate stdio tests to ErrnoCheckingTest." (#143829) Reverts llvm/llvm-project#143802. Follow-up fix 3c7af175e51c3ab08ac3c442146c2b822f38c01e wasn't robust enough and itself got reverted. 
--- libc/test/src/stdio/CMakeLists.txt | 10 ---------- libc/test/src/stdio/fdopen_test.cpp | 14 ++++++++------ libc/test/src/stdio/fgetc_test.cpp | 5 +++-- libc/test/src/stdio/fgetc_unlocked_test.cpp | 5 +++-- libc/test/src/stdio/fgets_test.cpp | 6 +++--- libc/test/src/stdio/fileop_test.cpp | 20 +++++++++++++++----- libc/test/src/stdio/fopencookie_test.cpp | 15 ++++++++------- libc/test/src/stdio/remove_test.cpp | 10 +++++----- libc/test/src/stdio/rename_test.cpp | 9 ++++----- libc/test/src/stdio/setvbuf_test.cpp | 8 ++++---- libc/test/src/stdio/unlocked_fileop_test.cpp | 7 ++++--- libc/test/src/stdlib/StrtolTest.h | 1 + libc/test/src/stdlib/strtold_test.cpp | 1 + 13 files changed, 59 insertions(+), 52 deletions(-) diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 3627006ec28f..01904a30504e 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,7 +20,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -69,7 +68,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -90,7 +88,6 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -112,7 +109,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -430,7 +426,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -445,7 +440,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -462,7 +456,6 
@@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -483,7 +476,6 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -506,7 +498,6 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -524,7 +515,6 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index b53184c30be3..104fc478b100 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,21 +9,20 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { +TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { ASSERT_ERRNO_SUCCESS(); } -TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { +TEST(LlvmLibcStdioFdopenTest, InvalidFd) { + libc_errno = 0; constexpr const char 
*TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { ASSERT_TRUE(nullptr == fp); } -TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { +TEST(LlvmLibcStdioFdopenTest, InvalidMode) { + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); + libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 7c652f666a8f..56bde5f0099a 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -14,12 +14,12 @@ #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -33,6 +33,7 @@ public: // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index f4471dd82df1..90429ecf4e82 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -17,12 +17,12 @@ #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc_unlocked.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -36,6 +36,7 @@ public: // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index c00a9256af52..abed3d405293 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -12,12 +12,11 @@ #include "src/stdio/fgets.h" #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +#include "src/__support/libc_errno.h" -TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { +TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -36,6 +35,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e097785832d5..e624181c795b 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -17,18 +17,17 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; -TEST_F(LlvmLibcFILETest, SimpleFileOperations) { +TEST(LlvmLibcFILETest, SimpleFileOperations) { constexpr char FILENAME[] = "testdata/simple_operations.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); 
ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); + // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); } -TEST_F(LlvmLibcFILETest, FFlush) { +TEST(LlvmLibcFILETest, FFlush) { constexpr char FILENAME[] = "testdata/fflush.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); @@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) { ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); } -TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { +TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { using MyStruct = struct { char c; unsigned long long i; @@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; + libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index bcf5e674141a..03e1ac286b64 
100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -15,7 +15,6 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/MemoryMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +22,6 @@ #include "hdr/types/size_t.h" #include "src/__support/libc_errno.h" -using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using MemoryView = LIBC_NAMESPACE::testing::MemoryView; struct StringStream { @@ -90,7 +88,7 @@ int close_ss(void *cookie) { constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss, &seek_ss, &close_ss}; -TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { constexpr char CONTENT[] = "Hello,readonly!"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(CONTENT))); @@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { size_t INIT_BUFSIZE = 32; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(INIT_BUFSIZE)); @@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { free(ss); } 
-TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { constexpr char INITIAL_CONTENT[] = "1234567890987654321"; constexpr char WRITE_DATA[] = "append"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); @@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { +TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { const char INITIAL_CONTENT[] = "1234567890987654321"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(INITIAL_CONTENT))); @@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) { +TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) { constexpr char WRITE_DATA[] = "hello, file"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(WRITE_DATA))); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 296bff1f5dc1..84984e26398c 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -11,17 +11,16 @@ #include "src/sys/stat/mkdirat.h" #include "src/unistd/access.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +#include "src/__support/libc_errno.h" #include -using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { +TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to 
create a file and remove it, and also verify that // it was removed. + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT)); } -TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { +TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index 135fb98c07fb..ac494a4ecaf8 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,19 +8,18 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { +TEST(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. 
+ libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT)); } -TEST_F(LlvmLibcRenameTest, RenameNonExistent) { +TEST(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; constexpr const char *FILENAME1 = "rename.test.file1"; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 4144bc1bef44..5872943c1bb4 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -14,10 +14,9 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { +TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read @@ -53,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr)); } -TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { +TEST(LlvmLibcSetvbufTest, SetLBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a LBF buffer to the write handle. 
Since it is LBF, the data // written using the write handle should be available right after a '\n' is @@ -103,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index e99b382d1211..5d482b70064b 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -15,12 +15,11 @@ #include "src/stdio/fread_unlocked.h" #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite_unlocked.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +#include "src/__support/libc_errno.h" -TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { +TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); @@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c78..3eeccc5727e7 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,6 +9,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include 
"src/__support/ctype_utils.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index eb4056dc7ba6..c2f2b9c9a11c 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" From 99638537cd19b84252685a3dd56535a4d54d690e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 21:56:48 -0700 Subject: [PATCH 0091/1322] [AArch64] Fix a warning This patch fixes: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:7157:3: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ad5b90984188..af5dfd6c9b8f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7154,6 +7154,7 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, switch (CC) { default: NestReg = 0x0f; // X15 + LLVM_FALLTHROUGH; case CallingConv::ARM64EC_Thunk_Native: case CallingConv::ARM64EC_Thunk_X64: // Must be kept in sync with AArch64CallingConv.td From 02550da932913bd7c3987c68abc9060c9e5bde2c Mon Sep 17 00:00:00 2001 From: Fazlay Rabbi <106703039+mdfazlay@users.noreply.github.com> Date: Wed, 11 Jun 2025 22:06:11 -0700 Subject: [PATCH 0092/1322] [OpenMP 60] Initial parsing/sema for `need_device_addr` modifier on `adjust_args` clause (#143442) Adds initial parsing and semantic analysis for 
`need_device_addr` modifier on `adjust_args` clause. --- clang/include/clang/Basic/Attr.td | 1 + .../clang/Basic/DiagnosticParseKinds.td | 6 ++-- clang/include/clang/Basic/OpenMPKinds.def | 1 + clang/include/clang/Sema/SemaOpenMP.h | 1 + clang/lib/AST/AttrImpl.cpp | 6 ++++ clang/lib/Parse/ParseOpenMP.cpp | 28 +++++++++++++------ clang/lib/Sema/SemaOpenMP.cpp | 5 ++++ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 11 ++++++-- .../declare_variant_clauses_ast_print.cpp | 26 ++++++++++------- .../declare_variant_clauses_messages.cpp | 24 +++++++++++----- 10 files changed, 80 insertions(+), 29 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 9e84462eaa66..f113cd2ba2fb 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4630,6 +4630,7 @@ def OMPDeclareVariant : InheritableAttr { OMPTraitInfoArgument<"TraitInfos">, VariadicExprArgument<"AdjustArgsNothing">, VariadicExprArgument<"AdjustArgsNeedDevicePtr">, + VariadicExprArgument<"AdjustArgsNeedDeviceAddr">, VariadicOMPInteropInfoArgument<"AppendArgs">, ]; let AdditionalMembers = [{ diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 3aa36ad59d0b..6c30da376daf 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1581,8 +1581,10 @@ def err_omp_unexpected_append_op : Error< "unexpected operation specified in 'append_args' clause, expected 'interop'">; def err_omp_unexpected_execution_modifier : Error< "unexpected 'execution' modifier in non-executable context">; -def err_omp_unknown_adjust_args_op : Error< - "incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'">; +def err_omp_unknown_adjust_args_op + : Error< + "incorrect 'adjust_args' type, expected 'need_device_ptr'%select{|, " + "'need_device_addr',}0 or 'nothing'">; def err_omp_declare_variant_wrong_clause : Error< "expected 
%select{'match'|'match', 'adjust_args', or 'append_args'}0 clause " "on 'omp declare variant' directive">; diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index b0de65df7e39..2b1dc1e0121b 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -214,6 +214,7 @@ OPENMP_ORIGINAL_SHARING_MODIFIER(default) // Adjust-op kinds for the 'adjust_args' clause. OPENMP_ADJUST_ARGS_KIND(nothing) OPENMP_ADJUST_ARGS_KIND(need_device_ptr) +OPENMP_ADJUST_ARGS_KIND(need_device_addr) // Binding kinds for the 'bind' clause. OPENMP_BIND_KIND(teams) diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 6498390fe96f..be6bec206878 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -849,6 +849,7 @@ public: FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, ArrayRef AdjustArgsNothing, ArrayRef AdjustArgsNeedDevicePtr, + ArrayRef AdjustArgsNeedDeviceAddr, ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, SourceLocation AppendArgsLoc, SourceRange SR); diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp index fefb8f55a9ee..5875a925d3fb 100644 --- a/clang/lib/AST/AttrImpl.cpp +++ b/clang/lib/AST/AttrImpl.cpp @@ -224,6 +224,12 @@ void OMPDeclareVariantAttr::printPrettyPragma( PrintExprs(adjustArgsNeedDevicePtr_begin(), adjustArgsNeedDevicePtr_end()); OS << ")"; } + if (adjustArgsNeedDeviceAddr_size()) { + OS << " adjust_args(need_device_addr:"; + PrintExprs(adjustArgsNeedDeviceAddr_begin(), + adjustArgsNeedDeviceAddr_end()); + OS << ")"; + } auto PrintInteropInfo = [&OS](OMPInteropInfo *Begin, OMPInteropInfo *End) { for (OMPInteropInfo *I = Begin; I != End; ++I) { diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index e41e5ba8596b..b69c3abe0b32 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -1483,6 +1483,7 @@ void 
Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo(); SmallVector AdjustNothing; SmallVector AdjustNeedDevicePtr; + SmallVector AdjustNeedDeviceAddr; SmallVector AppendArgs; SourceLocation AdjustArgsLoc, AppendArgsLoc; @@ -1515,11 +1516,21 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, SmallVector Vars; IsError = ParseOpenMPVarList(OMPD_declare_variant, OMPC_adjust_args, Vars, Data); - if (!IsError) - llvm::append_range(Data.ExtraModifier == OMPC_ADJUST_ARGS_nothing - ? AdjustNothing - : AdjustNeedDevicePtr, - Vars); + if (!IsError) { + switch (Data.ExtraModifier) { + case OMPC_ADJUST_ARGS_nothing: + llvm::append_range(AdjustNothing, Vars); + break; + case OMPC_ADJUST_ARGS_need_device_ptr: + llvm::append_range(AdjustNeedDevicePtr, Vars); + break; + case OMPC_ADJUST_ARGS_need_device_addr: + llvm::append_range(AdjustNeedDeviceAddr, Vars); + break; + default: + llvm_unreachable("Unexpected 'adjust_args' clause modifier."); + } + } break; } case OMPC_append_args: @@ -1559,8 +1570,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, if (DeclVarData && !TI.Sets.empty()) Actions.OpenMP().ActOnOpenMPDeclareVariantDirective( DeclVarData->first, DeclVarData->second, TI, AdjustNothing, - AdjustNeedDevicePtr, AppendArgs, AdjustArgsLoc, AppendArgsLoc, - SourceRange(Loc, Tok.getLocation())); + AdjustNeedDevicePtr, AdjustNeedDeviceAddr, AppendArgs, AdjustArgsLoc, + AppendArgsLoc, SourceRange(Loc, Tok.getLocation())); // Skip the last annot_pragma_openmp_end. (void)ConsumeAnnotationToken(); @@ -4818,7 +4829,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, getLangOpts()); Data.ExtraModifierLoc = Tok.getLocation(); if (Data.ExtraModifier == OMPC_ADJUST_ARGS_unknown) { - Diag(Tok, diag::err_omp_unknown_adjust_args_op); + Diag(Tok, diag::err_omp_unknown_adjust_args_op) + << (getLangOpts().OpenMP >= 60 ? 
1 : 0); SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); } else { ConsumeToken(); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 2cbe79c5c07c..d928b7ae2b4c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -7122,6 +7122,7 @@ void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( getASTContext(), VariantFuncRef, DVScope.TI, /*NothingArgs=*/nullptr, /*NothingArgsSize=*/0, /*NeedDevicePtrArgs=*/nullptr, /*NeedDevicePtrArgsSize=*/0, + /*NeedDeviceAddrArgs=*/nullptr, /*NeedDeviceAddrArgsSize=*/0, /*AppendArgs=*/nullptr, /*AppendArgsSize=*/0); for (FunctionDecl *BaseFD : Bases) BaseFD->addAttr(OMPDeclareVariantA); @@ -7553,6 +7554,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, ArrayRef AdjustArgsNothing, ArrayRef AdjustArgsNeedDevicePtr, + ArrayRef AdjustArgsNeedDeviceAddr, ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, SourceLocation AppendArgsLoc, SourceRange SR) { @@ -7564,6 +7566,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( SmallVector AllAdjustArgs; llvm::append_range(AllAdjustArgs, AdjustArgsNothing); llvm::append_range(AllAdjustArgs, AdjustArgsNeedDevicePtr); + llvm::append_range(AllAdjustArgs, AdjustArgsNeedDeviceAddr); if (!AllAdjustArgs.empty() || !AppendArgs.empty()) { VariantMatchInfo VMI; @@ -7614,6 +7617,8 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( const_cast(AdjustArgsNothing.data()), AdjustArgsNothing.size(), const_cast(AdjustArgsNeedDevicePtr.data()), AdjustArgsNeedDevicePtr.size(), + const_cast(AdjustArgsNeedDeviceAddr.data()), + AdjustArgsNeedDeviceAddr.size(), const_cast(AppendArgs.data()), AppendArgs.size(), SR); FD->addAttr(NewAttr); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 57271415f838..a25bfd1c48de 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ 
b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -527,6 +527,7 @@ static void instantiateOMPDeclareVariantAttr( SmallVector NothingExprs; SmallVector NeedDevicePtrExprs; + SmallVector NeedDeviceAddrExprs; SmallVector AppendArgs; for (Expr *E : Attr.adjustArgsNothing()) { @@ -541,14 +542,20 @@ static void instantiateOMPDeclareVariantAttr( continue; NeedDevicePtrExprs.push_back(ER.get()); } + for (Expr *E : Attr.adjustArgsNeedDeviceAddr()) { + ExprResult ER = Subst(E); + if (ER.isInvalid()) + continue; + NeedDeviceAddrExprs.push_back(ER.get()); + } for (OMPInteropInfo &II : Attr.appendArgs()) { // When prefer_type is implemented for append_args handle them here too. AppendArgs.emplace_back(II.IsTarget, II.IsTargetSync); } S.OpenMP().ActOnOpenMPDeclareVariantDirective( - FD, E, TI, NothingExprs, NeedDevicePtrExprs, AppendArgs, SourceLocation(), - SourceLocation(), Attr.getRange()); + FD, E, TI, NothingExprs, NeedDevicePtrExprs, NeedDeviceAddrExprs, + AppendArgs, SourceLocation(), SourceLocation(), Attr.getRange()); } static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr( diff --git a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp index 172dd1670421..c14e19cc8b7e 100644 --- a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp +++ b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp @@ -54,9 +54,9 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;} //DUMP: DeclRefExpr{{.*}}Function{{.*}}foo_v1 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA' //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB' -//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB) +//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB) adjust_args(need_device_addr:AAA) -//PRINT: #pragma omp declare variant(foo_v2) 
match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA) +//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:BBB) //PRINT: omp declare variant(foo_v1) match(construct={dispatch}, device={arch(arm)}) adjust_args(need_device_ptr:AAA,BBB) @@ -66,42 +66,48 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;} #pragma omp declare variant(foo_v2) \ match(construct={dispatch}, device={arch(ppc)}), \ - adjust_args(need_device_ptr:AAA) + adjust_args(need_device_ptr:AAA) \ + adjust_args(need_device_addr:BBB) #pragma omp declare variant(foo_v3) \ adjust_args(need_device_ptr:BBB) adjust_args(nothing:I) \ + adjust_args(need_device_addr:AAA) \ match(construct={dispatch}, device={arch(x86,x86_64)}) void foo(float *AAA, float *BBB, int *I) {return;} -void Foo_Var(float *AAA, float *BBB) {return;} +void Foo_Var(float *AAA, float *BBB, float *CCC) {return;} #pragma omp declare variant(Foo_Var) \ match(construct={dispatch}, device={arch(x86_64)}) \ - adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB) + adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB) \ + adjust_args(need_device_addr:CCC) template -void Foo(T *AAA, T *BBB) {return;} +void Foo(T *AAA, T *BBB, T *CCC) {return;} -//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA) -//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *)' +//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:CCC) +//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *, T *)' //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)} //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB' //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA' +//DUMP: 
DeclRefExpr{{.*}}ParmVar{{.*}}'CCC' // -//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *)' +//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *, float *)' //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)} //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB' //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA' +//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC' void func() { float *A; float *B; + float *C; //#pragma omp dispatch - Foo(A, B); + Foo(A, B, C); } typedef void *omp_interop_t; diff --git a/clang/test/OpenMP/declare_variant_clauses_messages.cpp b/clang/test/OpenMP/declare_variant_clauses_messages.cpp index 284e49bbd21b..aadded7699ea 100644 --- a/clang/test/OpenMP/declare_variant_clauses_messages.cpp +++ b/clang/test/OpenMP/declare_variant_clauses_messages.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 \ +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 \ // RUN: -DNO_INTEROP_T_DEF -o - %s -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -Wno-strict-prototypes -DC -x c -o - %s +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -x c -o - %s // RUN: %clang_cc1 -verify -triple x86_64-pc-windows-msvc -fms-compatibility \ -// RUN: -fopenmp -Wno-strict-prototypes -DC -DWIN -x c -o - %s +// RUN: -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -DWIN -x c -o - %s #ifdef NO_INTEROP_T_DEF void foo_v1(float *, void *); @@ -114,6 +114,16 @@ void vararg_bar2(const char *fmt) { 
return; } match(construct={dispatch}, device={arch(ppc)}), \ adjust_args(need_device_ptr:AAA) adjust_args(nothing:AAA) +// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}} +#pragma omp declare variant(foo_v1) \ + match(construct={dispatch}, device={arch(arm)}) \ + adjust_args(need_device_ptr:AAA,BBB) adjust_args(need_device_addr:AAA) + +// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}} +#pragma omp declare variant(foo_v1) \ + match(construct={dispatch}, device={arch(ppc)}), \ + adjust_args(need_device_addr:AAA) adjust_args(nothing:AAA) + // expected-error@+2 {{use of undeclared identifier 'J'}} #pragma omp declare variant(foo_v1) \ adjust_args(nothing:J) \ @@ -186,12 +196,12 @@ void vararg_bar2(const char *fmt) { return; } // expected-error@+1 {{variant in '#pragma omp declare variant' with type 'void (float *, float *, int *, omp_interop_t)' (aka 'void (float *, float *, int *, void *)') is incompatible with type 'void (float *, float *, int *)'}} #pragma omp declare variant(foo_v4) match(construct={dispatch}) -// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}} +// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}} #pragma omp declare variant(foo_v1) \ match(construct={dispatch}, device={arch(arm)}) \ adjust_args(badaaop:AAA,BBB) -// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}} +// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}} #pragma omp declare variant(foo_v1) \ match(construct={dispatch}, device={arch(arm)}) \ adjust_args(badaaop AAA,BBB) From 28bda778437fea17a25b561f1b3b84545612b565 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 22:19:31 -0700 Subject: [PATCH 0093/1322] Introduce MCAsmInfo::UsesSetToEquateSymbol and prefer = to .set Introduce MCAsmInfo::UsesSetToEquateSymbol 
to control the preferred syntax for symbol equating. We now favor the more readable and common `symbol = expression` syntax over `.set`. This aligns with pre- https://reviews.llvm.org/D44256 behavior. On Apple platforms, this resolves a clang -S vs -c behavior difference (resolves #104623). For targets whose = support is unconfirmed, UsesSetToEquateSymbol is set to true so that they keep emitting `.set`. This also minimizes test updates. Pull Request: https://github.com/llvm/llvm-project/pull/142289 --- clang/test/CodeGen/alias.c | 6 +-- llvm/include/llvm/MC/MCAsmInfo.h | 4 ++ llvm/lib/MC/MCAsmStreamer.cpp | 6 ++- .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 1 + .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 1 + .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 2 + .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 1 + llvm/test/CodeGen/AArch64/arm64ec-alias.ll | 14 +++--- .../AArch64/arm64ec-hybrid-patchable.ll | 18 +++---- llvm/test/CodeGen/AArch64/arm64ec-symbols.ll | 6 +-- llvm/test/CodeGen/AArch64/arm64ec-varargs.ll | 16 +++--- llvm/test/CodeGen/AArch64/ehcontguard.ll | 2 +- llvm/test/CodeGen/AArch64/global-merge-1.ll | 8 +-- llvm/test/CodeGen/AArch64/global-merge-2.ll | 12 ++--- llvm/test/CodeGen/AArch64/global-merge-3.ll | 10 ++-- .../AArch64/global-merge-hidden-minsize.ll | 4 +- llvm/test/CodeGen/AArch64/ifunc-asm.ll | 2 +- llvm/test/CodeGen/AArch64/seh-finally.ll | 8 +-- .../CodeGen/AArch64/stackguard-internal.ll | 2 +- llvm/test/CodeGen/ARM/alias_store.ll | 2 +- llvm/test/CodeGen/ARM/aliases.ll | 14 +++--- .../CodeGen/ARM/global-merge-dllexport.ll | 4 +- .../CodeGen/ARM/global-merge-external-2.ll | 12 ++--- .../test/CodeGen/ARM/global-merge-external.ll | 12 ++--- llvm/test/CodeGen/AVR/global-aliases.ll | 28 +++++------ llvm/test/CodeGen/Mips/hf16call32_body.ll | 24 ++++----- llvm/test/CodeGen/Mips/mips16ex.ll | 2 +- .../PowerPC/asm-printer-topological-order.ll | 6 +-- llvm/test/CodeGen/PowerPC/data-align.ll | 10 ++-- llvm/test/CodeGen/WebAssembly/aliases.ll | 22 ++++----
llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll | 2 +- llvm/test/CodeGen/WinCFGuard/cfguard.ll | 2 +- .../CodeGen/X86/2007-09-06-ExtWeakAliasee.ll | 2 +- llvm/test/CodeGen/X86/2009-08-12-badswitch.ll | 50 +++++++++---------- .../CodeGen/X86/2010-05-26-DotDebugLoc.ll | 8 +-- llvm/test/CodeGen/X86/alias-gep.ll | 8 +-- llvm/test/CodeGen/X86/aliases.ll | 8 +-- .../CodeGen/X86/catchret-empty-fallthrough.ll | 2 +- llvm/test/CodeGen/X86/coff-alias-type.ll | 2 +- llvm/test/CodeGen/X86/coff-comdat.ll | 2 +- llvm/test/CodeGen/X86/coff-feat00.ll | 2 +- llvm/test/CodeGen/X86/dllexport-x86_64.ll | 10 ++-- llvm/test/CodeGen/X86/dllexport.ll | 8 +-- llvm/test/CodeGen/X86/ehcontguard.ll | 2 +- .../CodeGen/X86/fastcall-correct-mangling.ll | 4 +- llvm/test/CodeGen/X86/ifunc-asm.ll | 2 +- .../test/CodeGen/X86/lea-opt-memop-check-1.ll | 6 +-- llvm/test/CodeGen/X86/linux-preemption.ll | 16 +++--- llvm/test/CodeGen/X86/localescape.ll | 16 +++--- llvm/test/CodeGen/X86/pr22019.ll | 8 +-- llvm/test/CodeGen/X86/seh-catch-all-win32.ll | 4 +- llvm/test/CodeGen/X86/seh-catchpad.ll | 2 +- llvm/test/CodeGen/X86/seh-finally.ll | 2 +- llvm/test/CodeGen/X86/seh-no-invokes.ll | 2 +- llvm/test/CodeGen/X86/seh-stack-realign.ll | 4 +- llvm/test/CodeGen/X86/tailcall-cgp-dup.ll | 12 ++--- .../X86/windows-seh-EHa-TryInFinally.ll | 2 +- llvm/test/CodeGen/XCore/globals.ll | 2 +- llvm/test/CodeGen/XCore/linkage.ll | 4 +- llvm/test/DebugInfo/X86/dbg-value-range.ll | 4 +- .../X86/stmt-list-multiple-compile-units.ll | 4 +- llvm/test/MC/AArch64/basic-a64-instructions.s | 2 +- llvm/test/MC/AsmParser/assignment.s | 12 ++--- llvm/test/MC/AsmParser/directive_include.s | 2 +- llvm/test/MC/AsmParser/directive_set.s | 6 +-- llvm/test/MC/AsmParser/include.ll | 4 +- llvm/test/MC/AsmParser/labels.s | 6 +-- llvm/test/MC/AsmParser/macro-arg-darwin.s | 4 +- llvm/test/MC/AsmParser/motorola_integers.s | 16 +++--- llvm/test/MC/Mips/cpsetup.s | 2 +- 70 files changed, 263 insertions(+), 252 deletions(-) diff --git 
a/clang/test/CodeGen/alias.c b/clang/test/CodeGen/alias.c index bc4167adf53f..9403c55beae0 100644 --- a/clang/test/CodeGen/alias.c +++ b/clang/test/CodeGen/alias.c @@ -29,20 +29,20 @@ const int wacom_usb_ids[] = {1, 1, 2, 3, 5, 8, 13, 0}; extern const int __mod_usb_device_table __attribute__ ((alias("wacom_usb_ids"))); // CHECKBASIC-DAG: @__mod_usb_device_table ={{.*}} alias i32, ptr @wacom_usb_ids // CHECKASM-DAG: .globl __mod_usb_device_table -// CHECKASM-DAG: .set __mod_usb_device_table, wacom_usb_ids +// CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids // CHECKASM-NOT: .size __mod_usb_device_table extern int g1; extern int g1 __attribute((alias("g0"))); // CHECKBASIC-DAG: @g1 ={{.*}} alias i32, ptr @g0 // CHECKASM-DAG: .globl g1 -// CHECKASM-DAG: .set g1, g0 +// CHECKASM-DAG: g1 = g0 // CHECKASM-NOT: .size g1 extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS"))); // CHECKBASIC-DAG: @__libc_errno ={{.*}} thread_local alias i32, ptr @TL_WITH_ALIAS // CHECKASM-DAG: .globl __libc_errno -// CHECKASM-DAG: .set __libc_errno, TL_WITH_ALIAS +// CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS // CHECKASM-NOT: .size __libc_errno void f0(void) { } diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 4eb50344d638..e98cd17a9df5 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -141,6 +141,9 @@ protected: /// This is appended to emitted labels. Defaults to ":" const char *LabelSuffix; + /// Use .set instead of = to equate a symbol to an expression. + bool UsesSetToEquateSymbol = false; + // Print the EH begin symbol with an assignment. Defaults to false. 
bool UseAssignmentForEHBegin = false; @@ -525,6 +528,7 @@ public: bool shouldAllowAdditionalComments() const { return AllowAdditionalComments; } const char *getLabelSuffix() const { return LabelSuffix; } + bool usesSetToEquateSymbol() const { return UsesSetToEquateSymbol; } bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; } bool needsLocalForSize() const { return NeedsLocalForSize; } StringRef getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; } diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index da0d99e70d9e..4380f74318e7 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -695,9 +695,11 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { if (E->inlineAssignedExpr()) EmitSet = false; if (EmitSet) { - OS << ".set "; + bool UseSet = MAI->usesSetToEquateSymbol(); + if (UseSet) + OS << ".set "; Symbol->print(OS, MAI); - OS << ", "; + OS << (UseSet ? ", " : " = "); Value->print(OS, MAI); EmitEOL(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 6f1d89e500ed..fcf134aa8658 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -42,6 +42,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, CommentString = ";"; InlineAsmStart = ";#ASMSTART"; InlineAsmEnd = ";#ASMEND"; + UsesSetToEquateSymbol = true; //===--- Data Emission Directives -------------------------------------===// UsesELFSectionDirectiveForBSS = true; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp index 7675b05f106a..ba8faaeb74a0 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp @@ -38,6 +38,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) { 
LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment; InlineAsmStart = "# InlineAsm Start"; InlineAsmEnd = "# InlineAsm End"; + UsesSetToEquateSymbol = true; ZeroDirective = "\t.space\t"; AscizDirective = "\t.string\t"; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 160ee07fad5c..b5be23c5a96a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -155,5 +155,7 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { // Support $ as PC in inline asm DollarIsPC = true; + UsesSetToEquateSymbol = true; + initializeVariantKinds(variantKindDescs); } diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp index 27272cdbbd23..e9d387399bf3 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp @@ -49,6 +49,7 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) { CalleeSaveStackSlotSize = 8; CodePointerSize = 8; CommentString = "*"; + UsesSetToEquateSymbol = true; ExceptionsType = ExceptionHandling::ZOS; IsHLASM = true; IsLittleEndian = false; diff --git a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll index 03cc87313694..18023a95a5d2 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll @@ -13,30 +13,30 @@ define dso_local void @patchable_func() hybrid_patchable { @patchable_alias = alias void (), ptr @patchable_func ; CHECK: .weak_anti_dep func_alias -; CHECK-NEXT: .set func_alias, "#func_alias" +; CHECK-NEXT: func_alias = "#func_alias" ; CHECK-NEXT: .weak_anti_dep func_alias2 -; CHECK-NEXT: .set func_alias2, "#func_alias2" +; CHECK-NEXT: func_alias2 = "#func_alias2" ; CHECK-NEXT: .weak_anti_dep func -; CHECK-NEXT: .set func, "#func" +; CHECK-NEXT: 
func = "#func" ; CHECK: .weak_anti_dep patchable_alias -; CHECK-NEXT: .set patchable_alias, "#patchable_alias" +; CHECK-NEXT: patchable_alias = "#patchable_alias" ; CHECK: .globl "#func_alias" ; CHECK-NEXT: .def "#func_alias"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#func_alias", "#func" +; CHECK-NEXT: "#func_alias" = "#func" ; CHECK-NEXT: .globl "#func_alias2" ; CHECK-NEXT: .def "#func_alias2"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#func_alias2", "#func_alias" +; CHECK-NEXT: "#func_alias2" = "#func_alias" ; CHECK: .globl "#patchable_alias" ; CHECK-NEXT: .def "#patchable_alias"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#patchable_alias", "#patchable_func" +; CHECK-NEXT: "#patchable_alias" = "#patchable_func" diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll index f964484c0c2d..7c77832a9d9a 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll @@ -76,7 +76,7 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: "#caller": // @"#caller" ; CHECK-NEXT: .weak_anti_dep caller -; CHECK-NEXT: .set caller, "#caller"{{$}} +; CHECK-NEXT: caller = "#caller"{{$}} ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: bl "#func" @@ -253,13 +253,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak func -; CHECK-NEXT: .set func, "EXP+#func"{{$}} +; CHECK-NEXT: func = "EXP+#func"{{$}} ; CHECK-NEXT: .weak "#func" ; CHECK-NEXT: .def "#func"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#func", "#func$hybpatch_thunk"{{$}} +; CHECK-NEXT: "#func" = "#func$hybpatch_thunk"{{$}} ; CHECK-NEXT: .def "EXP+#has_varargs"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; @@ -269,13 +269,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak has_varargs -; CHECK-NEXT: .set has_varargs, "EXP+#has_varargs" +; CHECK-NEXT: has_varargs = "EXP+#has_varargs" ; CHECK-NEXT: .weak "#has_varargs" ; CHECK-NEXT: .def "#has_varargs"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#has_varargs", "#has_varargs$hybpatch_thunk" +; CHECK-NEXT: "#has_varargs" = "#has_varargs$hybpatch_thunk" ; CHECK-NEXT: .def "EXP+#has_sret"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; @@ -285,13 +285,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak has_sret -; CHECK-NEXT: .set has_sret, "EXP+#has_sret" +; CHECK-NEXT: has_sret = "EXP+#has_sret" ; CHECK-NEXT: .weak "#has_sret" ; CHECK-NEXT: .def "#has_sret"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#has_sret", "#has_sret$hybpatch_thunk" +; CHECK-NEXT: "#has_sret" = "#has_sret$hybpatch_thunk" ; CHECK-NEXT: .def "EXP+#exp"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; @@ -301,13 +301,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak exp -; CHECK-NEXT: .set exp, "EXP+#exp" +; CHECK-NEXT: exp = "EXP+#exp" ; CHECK-NEXT: .weak "#exp" ; CHECK-NEXT: .def "#exp"; ; CHECK-NEXT: .scl 
2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#exp", "#exp$hybpatch_thunk" +; CHECK-NEXT: "#exp" = "#exp$hybpatch_thunk" ; SYM: [53](sec 15)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #func$hybpatch_thunk ; SYM: [58](sec 16)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #has_varargs$hybpatch_thunk diff --git a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll index b79dd7d61dd6..b44f39ad7b73 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll @@ -10,12 +10,12 @@ define void @caller() nounwind { } ; CHECK: .weak_anti_dep caller -; CHECK-NEXT: .set caller, "#caller"{{$}} +; CHECK-NEXT: caller = "#caller"{{$}} ; CHECK: .weak_anti_dep func -; CHECK-NEXT: .set func, "#func"{{$}} +; CHECK-NEXT: func = "#func"{{$}} ; CHECK-NEXT: .weak_anti_dep "#func" -; CHECK-NEXT: .set "#func", "#func$exit_thunk"{{$}} +; CHECK-NEXT: "#func" = "#func$exit_thunk"{{$}} ; SYM: [ 8](sec 4)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #caller ; SYM: [21](sec 7)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #func$exit_thunk diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll index 5fab5738078d..389969bebaea 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll @@ -45,9 +45,9 @@ define void @varargs_caller() nounwind { ; CHECK-NEXT: stp x9, x8, [sp] ; CHECK-NEXT: str xzr, [sp, #16] ; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT: .set varargs_callee, "#varargs_callee" +; CHECK-NEXT: varargs_callee = "#varargs_callee" ; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT: .set "#varargs_callee", varargs_callee +; CHECK-NEXT: "#varargs_callee" = varargs_callee ; CHECK-NEXT: bl "#varargs_callee" ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 @@ -86,9 +86,9 @@ define void @varargs_many_argscalleer() nounwind { ; CHECK-NEXT: stp x9, 
x8, [sp] ; CHECK-NEXT: stp q0, q0, [sp, #16] ; CHECK-NEXT: .weak_anti_dep varargs_many_argscallee -; CHECK-NEXT: .set varargs_many_argscallee, "#varargs_many_argscallee" +; CHECK-NEXT: varargs_many_argscallee = "#varargs_many_argscallee" ; CHECK-NEXT: .weak_anti_dep "#varargs_many_argscallee" -; CHECK-NEXT: .set "#varargs_many_argscallee", varargs_many_argscallee +; CHECK-NEXT: "#varargs_many_argscallee" = varargs_many_argscallee ; CHECK-NEXT: bl "#varargs_many_argscallee" ; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #64 @@ -116,9 +116,9 @@ define void @varargs_caller_tail() nounwind { ; CHECK-NEXT: stp x9, x8, [sp] ; CHECK-NEXT: str xzr, [sp, #16] ; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT: .set varargs_callee, "#varargs_callee" +; CHECK-NEXT: varargs_callee = "#varargs_callee" ; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT: .set "#varargs_callee", varargs_callee +; CHECK-NEXT: "#varargs_callee" = varargs_callee ; CHECK-NEXT: bl "#varargs_callee" ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: add x4, sp, #48 @@ -129,9 +129,9 @@ define void @varargs_caller_tail() nounwind { ; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT: .set varargs_callee, "#varargs_callee" +; CHECK-NEXT: varargs_callee = "#varargs_callee" ; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT: .set "#varargs_callee", varargs_callee +; CHECK-NEXT: "#varargs_callee" = varargs_callee ; CHECK-NEXT: b "#varargs_callee" call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> ) tail call void (double, ...) 
@varargs_callee(double 1.0, i32 4, i32 3, i32 2) diff --git a/llvm/test/CodeGen/AArch64/ehcontguard.ll b/llvm/test/CodeGen/AArch64/ehcontguard.ll index eecff391d0f8..cb603a482d22 100644 --- a/llvm/test/CodeGen/AArch64/ehcontguard.ll +++ b/llvm/test/CodeGen/AArch64/ehcontguard.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=aarch64-windows | FileCheck %s ; EHCont Guard is currently only available on Windows -; CHECK: .set "@feat.00", 16384 +; CHECK: "@feat.00" = 16384 ; CHECK: .section .gehcont$y diff --git a/llvm/test/CodeGen/AArch64/global-merge-1.ll b/llvm/test/CodeGen/AArch64/global-merge-1.ll index cc17e344c211..626310fc4ec2 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-1.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-1.ll @@ -23,9 +23,9 @@ define void @f1(i32 %a1, i32 %a2) { ;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals ;CHECK: .local .L_MergedGlobals ;CHECK: .comm .L_MergedGlobals,8,4 -;CHECK: .set m, .L_MergedGlobals -;CHECK: .set n, .L_MergedGlobals+4 +;CHECK: m = .L_MergedGlobals +;CHECK: n = .L_MergedGlobals+4 ;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,2 ; @_MergedGlobals -;CHECK-APPLE-IOS-NOT: .set _m, l__MergedGlobals -;CHECK-APPLE-IOS-NOT: .set _n, l__MergedGlobals+4 +;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals +;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4 diff --git a/llvm/test/CodeGen/AArch64/global-merge-2.ll b/llvm/test/CodeGen/AArch64/global-merge-2.ll index 85d814c3177b..1b5333b907d2 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-2.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-2.ll @@ -32,21 +32,21 @@ define dso_local void @g1(i32 %a1, i32 %a2) { ;CHECK: .comm .L_MergedGlobals,12,4 ;CHECK: .globl x -;CHECK: .set x, .L_MergedGlobals +;CHECK: x = .L_MergedGlobals ;CHECK: .size x, 4 ;CHECK: .globl y -;CHECK: .set y, .L_MergedGlobals+4 +;CHECK: y = .L_MergedGlobals+4 ;CHECK: .size y, 4 ;CHECK: .globl z -;CHECK: .set z, .L_MergedGlobals+8 +;CHECK: z = .L_MergedGlobals+8 ;CHECK: .size z, 4 ;CHECK-APPLE-IOS: 
.zerofill __DATA,__common,__MergedGlobals_x,12,2 ;CHECK-APPLE-IOS: .globl _x -;CHECK-APPLE-IOS: .set {{.*}}, __MergedGlobals_x +;CHECK-APPLE-IOS: {{.*}} = __MergedGlobals_x ;CHECK-APPLE-IOS: .globl _y -;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+4 +;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4 ;CHECK-APPLE-IOS: .globl _z -;CHECK-APPLE-IOS: .set _z, __MergedGlobals_x+8 +;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8 ;CHECK-APPLE-IOS: .subsections_via_symbols diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll index b3f58887139f..2a0ae1227455 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-3.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll @@ -40,14 +40,14 @@ define dso_local void @f1(i32 %a1, i32 %a2, i32 %a3) { ;CHECK-APPLE-IOS: .globl __MergedGlobals_x ;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,800,2 -;CHECK-APPLE-IOS: .set _x, __MergedGlobals_x -;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+400 +;CHECK-APPLE-IOS: _x = __MergedGlobals_x +;CHECK-APPLE-IOS: _y = __MergedGlobals_x+400 ;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals ;CHECK: .local .L_MergedGlobals ;CHECK: .comm .L_MergedGlobals,800,4 ;CHECK: globl x -;CHECK: .set x, .L_MergedGlobals +;CHECK: x = .L_MergedGlobals ;CHECK: globl y -;CHECK: .set y, .L_MergedGlobals+400 -;CHECK-NOT: .set z, .L_MergedGlobals +;CHECK: y = .L_MergedGlobals+400 +;CHECK-NOT: z = .L_MergedGlobals diff --git a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll index 9c694fc4d289..5292aa91fc38 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll @@ -16,10 +16,10 @@ attributes #0 = { minsize optsize } ; CHECK: .globl x ; CHECK: .hidden x -; CHECK: .set x, .L_MergedGlobals +; CHECK: x = .L_MergedGlobals ; CHECK: .size x, 4 ; CHECK: .globl y ; CHECK: .hidden y -; CHECK: .set y, .L_MergedGlobals+4 +; CHECK: y = 
.L_MergedGlobals+4 ; CHECK: .size y, 4 diff --git a/llvm/test/CodeGen/AArch64/ifunc-asm.ll b/llvm/test/CodeGen/AArch64/ifunc-asm.ll index 57fc2f0c9d7f..7aad6cce09cf 100644 --- a/llvm/test/CodeGen/AArch64/ifunc-asm.ll +++ b/llvm/test/CodeGen/AArch64/ifunc-asm.ll @@ -16,7 +16,7 @@ entry: @global_ifunc = ifunc i32 (i32), ptr @the_resolver ; ELF: .globl global_ifunc ; ELF-NEXT: .type global_ifunc,@gnu_indirect_function -; ELF-NEXT: .set global_ifunc, the_resolver +; ELF-NEXT: global_ifunc = the_resolver ; MACHO: .section __DATA,__data ; MACHO-NEXT: .p2align 3, 0x0 diff --git a/llvm/test/CodeGen/AArch64/seh-finally.ll b/llvm/test/CodeGen/AArch64/seh-finally.ll index 04a30800d929..fd6b3fd0bc1f 100644 --- a/llvm/test/CodeGen/AArch64/seh-finally.ll +++ b/llvm/test/CodeGen/AArch64/seh-finally.ll @@ -38,7 +38,7 @@ entry: ; CHECK: add x29, sp, #16 ; CHECK: mov x0, #-2 ; CHECK: stur x0, [x29, #16] -; CHECK: .set .Lsimple_seh$frame_escape_0, -8 +; CHECK: .Lsimple_seh$frame_escape_0 = -8 ; CHECK: ldur w0, [x29, #-8] ; CHECK: bl foo @@ -89,7 +89,7 @@ entry: ; CHECK: mov x19, sp ; CHECK: mov x0, #-2 ; CHECK: stur x0, [x29, #24] -; CHECK: .set .Lstack_realign$frame_escape_0, 0 +; CHECK: .Lstack_realign$frame_escape_0 = 0 ; CHECK: ldr w0, [x19] ; CHECK: bl foo @@ -137,7 +137,7 @@ entry: ; CHECK: add x29, sp, #32 ; CHECK: mov x1, #-2 ; CHECK: stur x1, [x29, #16] -; CHECK: .set .Lvla_present$frame_escape_0, -4 +; CHECK: .Lvla_present$frame_escape_0 = -4 ; CHECK: stur w0, [x29, #-4] ; CHECK: ldur w8, [x29, #-4] ; CHECK: mov x9, sp @@ -204,7 +204,7 @@ entry: ; CHECK: mov x19, sp ; CHECK: mov x1, #-2 ; CHECK: stur x1, [x29, #24] -; CHECK: .set .Lvla_and_realign$frame_escape_0, 32 +; CHECK: .Lvla_and_realign$frame_escape_0 = 32 ; CHECK: str w0, [x29, #36] ; CHECK: ldr w8, [x29, #36] ; CHECK: mov x9, sp diff --git a/llvm/test/CodeGen/AArch64/stackguard-internal.ll b/llvm/test/CodeGen/AArch64/stackguard-internal.ll index a70c8874edba..7b32e8c0caab 100644 --- 
a/llvm/test/CodeGen/AArch64/stackguard-internal.ll +++ b/llvm/test/CodeGen/AArch64/stackguard-internal.ll @@ -6,7 +6,7 @@ target triple = "aarch64-linux-gnu" ; is an alias. (The alias is created by GlobalMerge.) ; CHECK: adrp {{.*}}, __stack_chk_guard ; CHECK: ldr {{.*}}, [{{.*}}, :lo12:__stack_chk_guard] -; CHECK: .set __stack_chk_guard, .L_MergedGlobals+4 +; CHECK: __stack_chk_guard = .L_MergedGlobals+4 @__stack_chk_guard = internal global [8 x i32] zeroinitializer, align 4 @x = internal global i32 0, align 4 diff --git a/llvm/test/CodeGen/ARM/alias_store.ll b/llvm/test/CodeGen/ARM/alias_store.ll index c6612334eaf1..60aa58d37499 100644 --- a/llvm/test/CodeGen/ARM/alias_store.ll +++ b/llvm/test/CodeGen/ARM/alias_store.ll @@ -13,4 +13,4 @@ entry: ; CHECK: ldr r{{.*}}, [[L:.*]] ; CHECK: [[L]]: ; CHECK-NEXT: .long XA -; CHECK: .set XA, X+1 +; CHECK: XA = X+1 diff --git a/llvm/test/CodeGen/ARM/aliases.ll b/llvm/test/CodeGen/ARM/aliases.ll index 6075ad813e99..8d9f938155d1 100644 --- a/llvm/test/CodeGen/ARM/aliases.ll +++ b/llvm/test/CodeGen/ARM/aliases.ll @@ -6,30 +6,30 @@ ; CHECK: .size .Lstructvar, 8 ; CHECK: .globl foo1 -; CHECK: .set foo1, bar +; CHECK: foo1 = bar ; CHECK-NOT: .size foo1 ; CHECK: .globl foo2 -; CHECK: .set foo2, bar +; CHECK: foo2 = bar ; CHECK-NOT: .size foo2 ; CHECK: .weak bar_f -; CHECK: .set bar_f, foo_f +; CHECK: bar_f = foo_f ; CHECK-NOT: .size bar_f -; CHECK: .set bar_i, bar +; CHECK: bar_i = bar ; CHECK-NOT: .size bar_i ; CHECK: .globl A -; CHECK: .set A, bar +; CHECK: A = bar ; CHECK-NOT: .size A ; CHECK: .globl elem0 -; CHECK: .set elem0, .Lstructvar +; CHECK: elem0 = .Lstructvar ; CHECK: .size elem0, 4 ; CHECK: .globl elem1 -; CHECK: .set elem1, .Lstructvar+4 +; CHECK: elem1 = .Lstructvar+4 ; CHECK: .size elem1, 4 @bar = global i32 42 diff --git a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll index 89e8a859b939..f5961d7f79e3 100644 --- a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll 
+++ b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll @@ -16,6 +16,6 @@ define void @f1(i32 %a1, i32 %a2) { ; CHECK: .section .drectve,"yni" ; CHECK: .ascii " /EXPORT:y,DATA" ; CHECK: .globl x -; CHECK: .set x, .L_MergedGlobals +; CHECK: x = .L_MergedGlobals ; CHECK: .globl y -; CHECK: .set y, .L_MergedGlobals+4 +; CHECK: y = .L_MergedGlobals+4 diff --git a/llvm/test/CodeGen/ARM/global-merge-external-2.ll b/llvm/test/CodeGen/ARM/global-merge-external-2.ll index 602533e045e0..c9e92d98e484 100644 --- a/llvm/test/CodeGen/ARM/global-merge-external-2.ll +++ b/llvm/test/CodeGen/ARM/global-merge-external-2.ll @@ -50,16 +50,16 @@ define dso_local void @g1(i32 %a1, i32 %a2) { ;CHECK-WIN32: .lcomm .L_MergedGlobals,8,4 ;CHECK-MERGE: .globl x -;CHECK-MERGE: .set x, .L_MergedGlobals +;CHECK-MERGE: x = .L_MergedGlobals ;CHECK-MERGE: .size x, 4 ;CHECK-MERGE: .globl y -;CHECK-MERGE: .set y, .L_MergedGlobals+4 +;CHECK-MERGE: y = .L_MergedGlobals+4 ;CHECK-MERGE: .size y, 4 -;CHECK-MERGE-NOT: .set z, .L_MergedGlobals+8 +;CHECK-MERGE-NOT: z = .L_MergedGlobals+8 ;CHECK-WIN32: .globl x -;CHECK-WIN32: .set x, .L_MergedGlobals +;CHECK-WIN32: x = .L_MergedGlobals ;CHECK-WIN32: .globl y -;CHECK-WIN32: .set y, .L_MergedGlobals+4 -;CHECK-WIN32-NOT: .set z, .L_MergedGlobals+8 +;CHECK-WIN32: y = .L_MergedGlobals+4 +;CHECK-WIN32-NOT: z = .L_MergedGlobals+8 diff --git a/llvm/test/CodeGen/ARM/global-merge-external.ll b/llvm/test/CodeGen/ARM/global-merge-external.ll index 364659b36bb9..4fe1914aae35 100644 --- a/llvm/test/CodeGen/ARM/global-merge-external.ll +++ b/llvm/test/CodeGen/ARM/global-merge-external.ll @@ -45,18 +45,18 @@ define dso_local void @g1(i32 %a1, i32 %a2) { ;CHECK-WIN32: .lcomm .L_MergedGlobals,12,4 ;CHECK-MERGE: .globl x -;CHECK-MERGE: .set x, .L_MergedGlobals +;CHECK-MERGE: x = .L_MergedGlobals ;CHECK-MERGE: .size x, 4 ;CHECK-MERGE: .globl y -;CHECK-MERGE: .set y, .L_MergedGlobals+4 +;CHECK-MERGE: y = .L_MergedGlobals+4 ;CHECK-MERGE: .size y, 4 ;CHECK-MERGE: .globl z 
-;CHECK-MERGE: .set z, .L_MergedGlobals+8 +;CHECK-MERGE: z = .L_MergedGlobals+8 ;CHECK-MERGE: .size z, 4 ;CHECK-WIN32: .globl x -;CHECK-WIN32: .set x, .L_MergedGlobals +;CHECK-WIN32: x = .L_MergedGlobals ;CHECK-WIN32: .globl y -;CHECK-WIN32: .set y, .L_MergedGlobals+4 +;CHECK-WIN32: y = .L_MergedGlobals+4 ;CHECK-WIN32: .globl z -;CHECK-WIN32: .set z, .L_MergedGlobals+8 +;CHECK-WIN32: z = .L_MergedGlobals+8 diff --git a/llvm/test/CodeGen/AVR/global-aliases.ll b/llvm/test/CodeGen/AVR/global-aliases.ll index 91bcedc7e0db..b948003e8b88 100644 --- a/llvm/test/CodeGen/AVR/global-aliases.ll +++ b/llvm/test/CodeGen/AVR/global-aliases.ll @@ -1,18 +1,18 @@ ; RUN: llc < %s -mtriple=avr -mcpu=atxmega384c3 | FileCheck %s --check-prefixes=MEGA ; RUN: llc < %s -mtriple=avr -mcpu=attiny40 | FileCheck %s --check-prefixes=TINY -; MEGA: .set __tmp_reg__, 0 -; MEGA: .set __zero_reg__, 1 -; MEGA: .set __SREG__, 63 -; MEGA: .set __SP_H__, 62 -; MEGA: .set __SP_L__, 61 -; MEGA: .set __EIND__, 60 -; MEGA: .set __RAMPZ__, 59 +; MEGA: __tmp_reg__ = 0 +; MEGA: __zero_reg__ = 1 +; MEGA: __SREG__ = 63 +; MEGA: __SP_H__ = 62 +; MEGA: __SP_L__ = 61 +; MEGA: __EIND__ = 60 +; MEGA: __RAMPZ__ = 59 -; TINY: .set __tmp_reg__, 16 -; TINY: .set __zero_reg__, 17 -; TINY: .set __SREG__, 63 -; TINY-NOT: .set __SP_H__, 62 -; TINY: .set __SP_L__, 61 -; TINY-NOT: .set __EIND__, 60 -; TINY-NOT: .set __RAMPZ__, 59 +; TINY: __tmp_reg__ = 16 +; TINY: __zero_reg__ = 17 +; TINY: __SREG__ = 63 +; TINY-NOT: __SP_H__ = 62 +; TINY: __SP_L__ = 61 +; TINY-NOT: __EIND__ = 60 +; TINY-NOT: __RAMPZ__ = 59 diff --git a/llvm/test/CodeGen/Mips/hf16call32_body.ll b/llvm/test/CodeGen/Mips/hf16call32_body.ll index ea83f776bd40..3bcb6f6bc015 100644 --- a/llvm/test/CodeGen/Mips/hf16call32_body.ll +++ b/llvm/test/CodeGen/Mips/hf16call32_body.ll @@ -24,7 +24,7 @@ entry: ; stel: addiu $25, $25, %lo(v_sf) ; stel: mfc1 $4, $f12 ; stel: jr $25 -; stel: .set $__fn_local_v_sf, v_sf +; stel: $__fn_local_v_sf = v_sf ; stel: .end 
__fn_stub_v_sf declare i32 @printf(ptr, ...) #1 @@ -46,7 +46,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f13 ; stel: jr $25 -; stel: .set $__fn_local_v_df, v_df +; stel: $__fn_local_v_df = v_df ; stel: .end __fn_stub_v_df ; Function Attrs: nounwind @@ -70,7 +70,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f14 ; stel: jr $25 -; stel: .set $__fn_local_v_sf_sf, v_sf_sf +; stel: $__fn_local_v_sf_sf = v_sf_sf ; stel: .end __fn_stub_v_sf_sf ; Function Attrs: nounwind @@ -95,7 +95,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_v_sf_df, v_sf_df +; stel: $__fn_local_v_sf_df = v_sf_df ; stel: .end __fn_stub_v_sf_df ; Function Attrs: nounwind @@ -120,7 +120,7 @@ entry: ; stel: mfc1 $5, $f13 ; stel: mfc1 $6, $f14 ; stel: jr $25 -; stel: .set $__fn_local_v_df_sf, v_df_sf +; stel: $__fn_local_v_df_sf = v_df_sf ; stel: .end __fn_stub_v_df_sf ; Function Attrs: nounwind @@ -146,7 +146,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_v_df_df, v_df_df +; stel: $__fn_local_v_df_df = v_df_df ; stel: .end __fn_stub_v_df_df ; Function Attrs: nounwind @@ -174,7 +174,7 @@ entry: ; stel: addiu $25, $25, %lo(sf_sf) ; stel: mfc1 $4, $f12 ; stel: jr $25 -; stel: .set $__fn_local_sf_sf, sf_sf +; stel: $__fn_local_sf_sf = sf_sf ; stel: .end __fn_stub_sf_sf @@ -196,7 +196,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f13 ; stel: jr $25 -; stel: .set $__fn_local_sf_df, sf_df +; stel: $__fn_local_sf_df = sf_df ; stel: .end __fn_stub_sf_df ; Function Attrs: nounwind @@ -221,7 +221,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f14 ; stel: jr $25 -; stel: .set $__fn_local_sf_sf_sf, sf_sf_sf +; stel: $__fn_local_sf_sf_sf = sf_sf_sf ; stel: .end __fn_stub_sf_sf_sf ; Function Attrs: nounwind @@ -247,7 +247,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_sf_sf_df, sf_sf_df +; stel: $__fn_local_sf_sf_df = sf_sf_df ; stel: .end 
__fn_stub_sf_sf_df ; Function Attrs: nounwind @@ -273,7 +273,7 @@ entry: ; stel: mfc1 $5, $f13 ; stel: mfc1 $6, $f14 ; stel: jr $25 -; stel: .set $__fn_local_sf_df_sf, sf_df_sf +; stel: $__fn_local_sf_df_sf = sf_df_sf ; stel: .end __fn_stub_sf_df_sf ; Function Attrs: nounwind @@ -300,7 +300,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_sf_df_df, sf_df_df +; stel: $__fn_local_sf_df_df = sf_df_df ; stel: .end __fn_stub_sf_df_df attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Mips/mips16ex.ll b/llvm/test/CodeGen/Mips/mips16ex.ll index fb9a44e76751..f4d1125718a9 100644 --- a/llvm/test/CodeGen/Mips/mips16ex.ll +++ b/llvm/test/CodeGen/Mips/mips16ex.ll @@ -2,7 +2,7 @@ ;16: main: ;16-NEXT: [[TMP:.*]]: -;16-NEXT: .set $func_begin0, [[TMP]] +;16-NEXT: $func_begin0 = [[TMP]] ;16-NEXT: .cfi_startproc ;16-NEXT: .cfi_personality @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1 diff --git a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll index 6299b4e393d9..3218c77f08c8 100644 --- a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll +++ b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll @@ -10,6 +10,6 @@ entry: } ; CHECK-LABEL: TestD: -; CHECK: .set TestC, TestD -; CHECK-DAG: .set TestB, TestC -; CHECK-DAG: .set TestA, TestC +; CHECK: TestC = TestD +; CHECK-DAG: TestB = TestC +; CHECK-DAG: TestA = TestC diff --git a/llvm/test/CodeGen/PowerPC/data-align.ll b/llvm/test/CodeGen/PowerPC/data-align.ll index bfedec139369..42dee13d152a 100644 --- a/llvm/test/CodeGen/PowerPC/data-align.ll +++ b/llvm/test/CodeGen/PowerPC/data-align.ll @@ -2,23 +2,23 @@ ; RUN: llc < %s -mtriple=powerpc64-unknown-linux | FileCheck %s ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux | 
FileCheck %s -; CHECK: .set .Li8, +; CHECK: .Li8 = ; CHECK-NEXT: .size .Li8, 1 @i8 = private constant i8 42 -; CHECK: .set .Li16, +; CHECK: .Li16 = ; CHECK-NEXT: .size .Li16, 2 @i16 = private constant i16 42 -; CHECK: .set .Li32, +; CHECK: .Li32 = ; CHECK-NEXT: .size .Li32, 4 @i32 = private constant i32 42 -; CHECK: .set .Li64, +; CHECK: .Li64 = ; CHECK-NEXT: .size .Li64, 8 @i64 = private constant i64 42 -; CHECK: .set .Li128, +; CHECK: .Li128 = ; CHECK-NEXT: .size .Li128, 16 @i128 = private constant i128 42 diff --git a/llvm/test/CodeGen/WebAssembly/aliases.ll b/llvm/test/CodeGen/WebAssembly/aliases.ll index 91b57b90df1d..87b292f53c62 100644 --- a/llvm/test/CodeGen/WebAssembly/aliases.ll +++ b/llvm/test/CodeGen/WebAssembly/aliases.ll @@ -4,11 +4,11 @@ @bar = global i32 42 ; CHECK-DAG: .globl foo1 -; CHECK-DAG: .set foo1, bar +; CHECK-DAG: foo1 = bar @foo1 = alias i32, ptr @bar ; CHECK-DAG: .globl foo2 -; CHECK-DAG: .set foo2, bar +; CHECK-DAG: foo2 = bar @foo2 = alias i32, ptr @bar %FunTy = type i32() @@ -19,14 +19,14 @@ define i32 @foo_f() { ; CHECK-DAG: .weak bar_f ; CHECK-DAG: .type bar_f,@function -; CHECK-DAG: .set bar_f, foo_f +; CHECK-DAG: bar_f = foo_f @bar_f = weak alias %FunTy, ptr @foo_f ; CHECK-DAG: .weak bar_l -; CHECK-DAG: .set bar_l, bar +; CHECK-DAG: bar_l = bar @bar_l = linkonce_odr alias i32, ptr @bar -; CHECK-DAG: .set bar_i, bar +; CHECK-DAG: bar_i = bar @bar_i = internal alias i32, ptr @bar ; CHECK-DAG: .globl A @@ -34,24 +34,24 @@ define i32 @foo_f() { ; CHECK-DAG: .globl bar_h ; CHECK-DAG: .hidden bar_h -; CHECK-DAG: .set bar_h, bar +; CHECK-DAG: bar_h = bar @bar_h = hidden alias i32, ptr @bar ; CHECK-DAG: .globl bar_p ; CHECK-DAG: .protected bar_p -; CHECK-DAG: .set bar_p, bar +; CHECK-DAG: bar_p = bar @bar_p = protected alias i32, ptr @bar -; CHECK-DAG: .set test2, bar+4 +; CHECK-DAG: test2 = bar+4 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1) -; CHECK-DAG: .set test3, 42 +; CHECK-DAG: test3 = 42 @test3 = alias i32, inttoptr(i32 
42 to ptr) -; CHECK-DAG: .set test4, bar +; CHECK-DAG: test4 = bar @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr) -; CHECK-DAG: .set test5, test2-bar +; CHECK-DAG: test5 = test2-bar @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32), i32 ptrtoint (ptr @bar to i32)) to ptr) diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll index 7a5baa09f95e..10985de88bf2 100644 --- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll +++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll @@ -35,7 +35,7 @@ ; } ;------------------------------------------------------------------------------- -; CHECK: .set @feat.00, 2048 +; CHECK: @feat.00 = 2048 ; CHECK: .section .gfids$y ; CHECK: .symidx _ZNK7Derived4calcEv diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard.ll b/llvm/test/CodeGen/WinCFGuard/cfguard.ll index 2ec2e573f716..a77d5490ef87 100644 --- a/llvm/test/CodeGen/WinCFGuard/cfguard.ll +++ b/llvm/test/CodeGen/WinCFGuard/cfguard.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s ; Control Flow Guard is currently only available on Windows -; CHECK: .set @feat.00, 2048 +; CHECK: @feat.00 = 2048 ; CHECK: .section .gfids$y ; CHECK: .symidx "?address_taken@@YAXXZ" diff --git a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll index d59953fb4e37..cc80f87fda31 100644 --- a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll +++ b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll @@ -10,4 +10,4 @@ define weak i32 @pthread_once(ptr, ptr) { ; CHECK: pthread_once: ; CHECK: .weak __gthrw_pthread_once -; CHECK: .set __gthrw_pthread_once, pthread_once +; CHECK: __gthrw_pthread_once = pthread_once diff --git a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll index 7050889d7102..527684f5a27d 100644 --- a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll +++ 
b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll @@ -125,31 +125,31 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp { ; CHECK-NEXT: retq ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .data_region jt32 -; CHECK-NEXT: .set L0_0_set_3, LBB0_3-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_4, LBB0_4-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_5, LBB0_5-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_6, LBB0_6-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_7, LBB0_7-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_8, LBB0_8-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_9, LBB0_9-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_10, LBB0_10-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_11, LBB0_11-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_12, LBB0_12-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_13, LBB0_13-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_14, LBB0_14-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_15, LBB0_15-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_16, LBB0_16-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_17, LBB0_17-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_18, LBB0_18-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_19, LBB0_19-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_20, LBB0_20-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_21, LBB0_21-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_22, LBB0_22-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_23, LBB0_23-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_24, LBB0_24-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_25, LBB0_25-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_26, LBB0_26-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_27, LBB0_27-LJTI0_0 +; CHECK-NEXT: L0_0_set_3 = LBB0_3-LJTI0_0 +; CHECK-NEXT: L0_0_set_4 = LBB0_4-LJTI0_0 +; CHECK-NEXT: L0_0_set_5 = LBB0_5-LJTI0_0 +; CHECK-NEXT: L0_0_set_6 = LBB0_6-LJTI0_0 +; CHECK-NEXT: L0_0_set_7 = LBB0_7-LJTI0_0 +; CHECK-NEXT: L0_0_set_8 = LBB0_8-LJTI0_0 +; CHECK-NEXT: L0_0_set_9 = LBB0_9-LJTI0_0 +; CHECK-NEXT: L0_0_set_10 = LBB0_10-LJTI0_0 +; CHECK-NEXT: L0_0_set_11 = LBB0_11-LJTI0_0 +; CHECK-NEXT: L0_0_set_12 = LBB0_12-LJTI0_0 +; CHECK-NEXT: L0_0_set_13 = LBB0_13-LJTI0_0 +; CHECK-NEXT: L0_0_set_14 = LBB0_14-LJTI0_0 +; CHECK-NEXT: L0_0_set_15 = LBB0_15-LJTI0_0 +; CHECK-NEXT: 
L0_0_set_16 = LBB0_16-LJTI0_0 +; CHECK-NEXT: L0_0_set_17 = LBB0_17-LJTI0_0 +; CHECK-NEXT: L0_0_set_18 = LBB0_18-LJTI0_0 +; CHECK-NEXT: L0_0_set_19 = LBB0_19-LJTI0_0 +; CHECK-NEXT: L0_0_set_20 = LBB0_20-LJTI0_0 +; CHECK-NEXT: L0_0_set_21 = LBB0_21-LJTI0_0 +; CHECK-NEXT: L0_0_set_22 = LBB0_22-LJTI0_0 +; CHECK-NEXT: L0_0_set_23 = LBB0_23-LJTI0_0 +; CHECK-NEXT: L0_0_set_24 = LBB0_24-LJTI0_0 +; CHECK-NEXT: L0_0_set_25 = LBB0_25-LJTI0_0 +; CHECK-NEXT: L0_0_set_26 = LBB0_26-LJTI0_0 +; CHECK-NEXT: L0_0_set_27 = LBB0_27-LJTI0_0 ; CHECK-NEXT: LJTI0_0: ; CHECK-NEXT: .long L0_0_set_3 ; CHECK-NEXT: .long L0_0_set_3 diff --git a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll index cf20cfaced5d..17df3e10fd3d 100644 --- a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll +++ b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll @@ -64,15 +64,15 @@ attributes #1 = { nounwind readnone } ; CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]] ; CHECK: Ldebug_loc0: -; CHECK-NEXT: .set [[SET1:.*]], Lfunc_begin0-Lfunc_begin0 +; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET1]] -; CHECK-NEXT: .set [[SET2:.*]], [[LABEL]]-Lfunc_begin0 +; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET2]] ; CHECK-NEXT: .short 1 ## Loc expr size ; CHECK-NEXT: .byte 85 -; CHECK-NEXT: .set [[SET3:.*]], [[LABEL]]-Lfunc_begin0 +; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET3]] -; CHECK-NEXT: .set [[SET4:.*]], [[CLOBBER]]-Lfunc_begin0 +; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET4]] ; CHECK-NEXT: .short 1 ## Loc expr size ; CHECK-NEXT: .byte 83 diff --git a/llvm/test/CodeGen/X86/alias-gep.ll b/llvm/test/CodeGen/X86/alias-gep.ll index 904a611f61d1..65d2ced6df5b 100644 --- a/llvm/test/CodeGen/X86/alias-gep.ll +++ b/llvm/test/CodeGen/X86/alias-gep.ll @@ -3,17 +3,17 @@ ;MACHO: .globl _offsetSym0 ;MACHO-NOT: .alt_entry -;MACHO: .set _offsetSym0, _s +;MACHO: _offsetSym0 = 
_s ;MACHO: .globl _offsetSym1 ;MACHO: .alt_entry _offsetSym1 -;MACHO: .set _offsetSym1, _s+8 +;MACHO: _offsetSym1 = _s+8 ;ELF: .globl offsetSym0 ;ELF-NOT: .alt_entry -;ELF: .set offsetSym0, s +;ELF: offsetSym0 = s ;ELF: .globl offsetSym1 ;ELF-NOT: .alt_entry -;ELF: .set offsetSym1, s+8 +;ELF: offsetSym1 = s+8 %struct.S1 = type { i32, i32, i32 } diff --git a/llvm/test/CodeGen/X86/aliases.ll b/llvm/test/CodeGen/X86/aliases.ll index 03ea2579d0f8..d36798820fe8 100644 --- a/llvm/test/CodeGen/X86/aliases.ll +++ b/llvm/test/CodeGen/X86/aliases.ll @@ -48,16 +48,16 @@ define i32 @foo_f() { ; CHECK-DAG: .protected bar_p @bar_p = protected alias i32, ptr @bar -; CHECK-DAG: .set test2, bar+4 +; CHECK-DAG: test2 = bar+4 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1) -; CHECK-DAG: .set test3, 42 +; CHECK-DAG: test3 = 42 @test3 = alias i32, inttoptr(i32 42 to ptr) -; CHECK-DAG: .set test4, bar +; CHECK-DAG: test4 = bar @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr) -; CHECK-DAG: .set test5, test2-bar +; CHECK-DAG: test5 = test2-bar @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32), i32 ptrtoint (ptr @bar to i32)) to ptr) diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll index 437d9698ee6b..ab9fa2287ffa 100644 --- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll +++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll @@ -44,7 +44,7 @@ return: ; preds = %catch, %entry ; CHECK: .LBB0_[[catch:[0-9]+]]: ; CHECK: .seh_handlerdata -; CHECK-NEXT: .set .Lfoo$parent_frame_offset, 32 +; CHECK-NEXT: .Lfoo$parent_frame_offset = 32 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL diff --git a/llvm/test/CodeGen/X86/coff-alias-type.ll b/llvm/test/CodeGen/X86/coff-alias-type.ll index a242cd2d77d7..6cc0638b2d4a 100644 --- a/llvm/test/CodeGen/X86/coff-alias-type.ll +++ 
b/llvm/test/CodeGen/X86/coff-alias-type.ll @@ -22,4 +22,4 @@ entry: ; CHECK-NEXT: .scl 2 ; CHECK-NEXT: .type 32 ; CHECK-NEXT: .endef -; CHECK-NEXT: .set _ZN8MyStructC1Ev, _ZN8MyStructC2Ev +; CHECK-NEXT: _ZN8MyStructC1Ev = _ZN8MyStructC2Ev diff --git a/llvm/test/CodeGen/X86/coff-comdat.ll b/llvm/test/CodeGen/X86/coff-comdat.ll index 99b3c0a687af..084a5a71125e 100644 --- a/llvm/test/CodeGen/X86/coff-comdat.ll +++ b/llvm/test/CodeGen/X86/coff-comdat.ll @@ -89,4 +89,4 @@ $vftable = comdat largest ; CHECK: .globl _f6 ; CHECK: .section .rdata,"dr",largest,_vftable ; CHECK: .globl _vftable -; CHECK: .set _vftable, L_some_name+4 +; CHECK: _vftable = L_some_name+4 diff --git a/llvm/test/CodeGen/X86/coff-feat00.ll b/llvm/test/CodeGen/X86/coff-feat00.ll index 21dd04ed34c7..1dcd4276399a 100644 --- a/llvm/test/CodeGen/X86/coff-feat00.ll +++ b/llvm/test/CodeGen/X86/coff-feat00.ll @@ -4,4 +4,4 @@ define i32 @foo() { ret i32 0 } -; CHECK: .set @feat.00, 1 +; CHECK: @feat.00 = 1 diff --git a/llvm/test/CodeGen/X86/dllexport-x86_64.ll b/llvm/test/CodeGen/X86/dllexport-x86_64.ll index 76add98314f5..b640e630e47e 100644 --- a/llvm/test/CodeGen/X86/dllexport-x86_64.ll +++ b/llvm/test/CodeGen/X86/dllexport-x86_64.ll @@ -105,23 +105,23 @@ define weak_odr dllexport void @weak1() { ; MINGW: .ascii " -export:blob_alias" ; CHECK: .globl alias -; CHECK: .set alias, notExported +; CHECK: alias = notExported @alias = dllexport alias void(), ptr @notExported ; CHECK: .globl aliasNotExported -; CHECK: .set aliasNotExported, f1 +; CHECK: aliasNotExported = f1 @aliasNotExported = alias void(), ptr @f1 ; CHECK: .globl alias2 -; CHECK: .set alias2, f1 +; CHECK: alias2 = f1 @alias2 = dllexport alias void(), ptr @f1 ; CHECK: .globl alias3 -; CHECK: .set alias3, notExported +; CHECK: alias3 = notExported @alias3 = dllexport alias void(), ptr @notExported ; CHECK: .weak weak_alias -; CHECK: .set weak_alias, f1 +; CHECK: weak_alias = f1 @weak_alias = weak_odr dllexport alias void(), ptr @f1 @blob = global 
[6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16 diff --git a/llvm/test/CodeGen/X86/dllexport.ll b/llvm/test/CodeGen/X86/dllexport.ll index 09cc03e7729d..53ecb8e7a1b4 100644 --- a/llvm/test/CodeGen/X86/dllexport.ll +++ b/llvm/test/CodeGen/X86/dllexport.ll @@ -135,17 +135,17 @@ define weak_odr dllexport void @weak1() { ; CHECK-GCC: .ascii " -export:weak_alias" ; CHECK: .globl _alias -; CHECK: .set _alias, _notExported +; CHECK: _alias = _notExported @alias = dllexport alias void(), ptr @notExported ; CHECK: .globl _alias2 -; CHECK: .set _alias2, _f1 +; CHECK: _alias2 = _f1 @alias2 = dllexport alias void(), ptr @f1 ; CHECK: .globl _alias3 -; CHECK: .set _alias3, _notExported +; CHECK: _alias3 = _notExported @alias3 = dllexport alias void(), ptr @notExported ; CHECK: .weak _weak_alias -; CHECK: .set _weak_alias, _f1 +; CHECK: _weak_alias = _f1 @weak_alias = weak_odr dllexport alias void(), ptr @f1 diff --git a/llvm/test/CodeGen/X86/ehcontguard.ll b/llvm/test/CodeGen/X86/ehcontguard.ll index 740621bc5d02..e868209babce 100644 --- a/llvm/test/CodeGen/X86/ehcontguard.ll +++ b/llvm/test/CodeGen/X86/ehcontguard.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s ; EHCont Guard is currently only available on Windows -; CHECK: .set @feat.00, 16384 +; CHECK: @feat.00 = 16384 ; CHECK: .section .gehcont$y diff --git a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll index 53b4bc8f1df2..4840308a5d49 100644 --- a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll +++ b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll @@ -33,5 +33,5 @@ define private x86_fastcallcc void @dontCrash() { } @alias = alias void(i64, i8, i8, i16), ptr @func -; CHECK32-LABEL: {{^}}.set @alias@20, @func@20 -; CHECK64-LABEL: {{^}}.set alias, func +; CHECK32-LABEL: {{^}}@alias@20 = @func@20 +; CHECK64-LABEL: {{^}}alias = func diff --git a/llvm/test/CodeGen/X86/ifunc-asm.ll b/llvm/test/CodeGen/X86/ifunc-asm.ll index 
a4c47da7f4c6..bc8e7e3d7d05 100644 --- a/llvm/test/CodeGen/X86/ifunc-asm.ll +++ b/llvm/test/CodeGen/X86/ifunc-asm.ll @@ -15,7 +15,7 @@ entry: @foo_ifunc = ifunc i32 (i32), ptr @foo_resolver ; ELF: .globl foo_ifunc ; ELF-NEXT: .type foo_ifunc,@gnu_indirect_function -; ELF-NEXT: .set foo_ifunc, foo_resolver +; ELF-NEXT: foo_ifunc = foo_resolver ; MACHO: .section __DATA,__data ; MACHO-NEXT: .p2align 3, 0x0 diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll index b8f0661225f8..5199b1519ebe 100644 --- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll +++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll @@ -47,9 +47,9 @@ entry: call fastcc void @"\01?fin$0@0@test2@@"(ptr %tmp0) ret void ; CHECK-LABEL: test2: -; CHECK: .set Ltest2$frame_escape_0, 8 -; CHECK: .set Ltest2$frame_escape_1, 4 -; CHECK: .set Ltest2$frame_escape_2, 0 +; CHECK: Ltest2$frame_escape_0 = 8 +; CHECK: Ltest2$frame_escape_1 = 4 +; CHECK: Ltest2$frame_escape_2 = 0 ; CHECK: calll "?fin$0@0@test2@@" } diff --git a/llvm/test/CodeGen/X86/linux-preemption.ll b/llvm/test/CodeGen/X86/linux-preemption.ll index 8e60b4787975..dc06a34e1c69 100644 --- a/llvm/test/CodeGen/X86/linux-preemption.ll +++ b/llvm/test/CodeGen/X86/linux-preemption.ll @@ -285,18 +285,18 @@ define dso_local ptr @comdat_any_local() comdat { ; CHECK-NEXT: .Lstrong_local_global$local: ; COMMON: .globl strong_default_alias -; COMMON-NEXT: .set strong_default_alias, aliasee +; COMMON-NEXT: strong_default_alias = aliasee ; COMMON-NEXT: .globl strong_hidden_alias ; COMMON-NEXT: .hidden strong_hidden_alias -; COMMON-NEXT: .set strong_hidden_alias, aliasee +; COMMON-NEXT: strong_hidden_alias = aliasee ; COMMON-NEXT: .weak weak_default_alias -; COMMON-NEXT: .set weak_default_alias, aliasee +; COMMON-NEXT: weak_default_alias = aliasee ; COMMON-NEXT: .globl strong_local_alias -; COMMON-NEXT: .set strong_local_alias, aliasee -; CHECK-NEXT: .set .Lstrong_local_alias$local, aliasee +; COMMON-NEXT: 
strong_local_alias = aliasee +; CHECK-NEXT: .Lstrong_local_alias$local = aliasee ; COMMON-NEXT: .weak weak_local_alias -; COMMON-NEXT: .set weak_local_alias, aliasee +; COMMON-NEXT: weak_local_alias = aliasee ; COMMON-NEXT: .globl strong_preemptable_alias -; COMMON-NEXT: .set strong_preemptable_alias, aliasee +; COMMON-NEXT: strong_preemptable_alias = aliasee ; COMMON-NEXT: .weak weak_preemptable_alias -; COMMON-NEXT: .set weak_preemptable_alias, aliasee +; COMMON-NEXT: weak_preemptable_alias = aliasee diff --git a/llvm/test/CodeGen/X86/localescape.ll b/llvm/test/CodeGen/X86/localescape.ll index aee7613273f7..57369be489af 100644 --- a/llvm/test/CodeGen/X86/localescape.ll +++ b/llvm/test/CodeGen/X86/localescape.ll @@ -76,8 +76,8 @@ define void @alloc_func(i32 %n) { ; X64: .seh_stackalloc 16 ; X64: leaq 16(%rsp), %rbp ; X64: .seh_setframe %rbp, 16 -; X64: .set .Lalloc_func$frame_escape_0, -4 -; X64: .set .Lalloc_func$frame_escape_1, -12 +; X64: .Lalloc_func$frame_escape_0 = -4 +; X64: .Lalloc_func$frame_escape_1 = -12 ; X64: movl $42, -4(%rbp) ; X64: movl $13, -12(%rbp) ; X64: movq %rbp, %rcx @@ -88,8 +88,8 @@ define void @alloc_func(i32 %n) { ; X86: pushl %ebp ; X86: movl %esp, %ebp ; X86: subl $12, %esp -; X86: .set Lalloc_func$frame_escape_0, -4 -; X86: .set Lalloc_func$frame_escape_1, -12 +; X86: Lalloc_func$frame_escape_0 = -4 +; X86: Lalloc_func$frame_escape_1 = -12 ; X86: movl $42, -4(%ebp) ; X86: movl $13, -12(%ebp) ; X86: pushl %ebp @@ -118,8 +118,8 @@ define void @alloc_func_no_frameaddr() { ; X64: subq $40, %rsp ; X64: .seh_stackalloc 40 ; X64: .seh_endprologue -; X64: .set .Lalloc_func_no_frameaddr$frame_escape_0, 36 -; X64: .set .Lalloc_func_no_frameaddr$frame_escape_1, 32 +; X64: .Lalloc_func_no_frameaddr$frame_escape_0 = 36 +; X64: .Lalloc_func_no_frameaddr$frame_escape_1 = 32 ; X64: movl $42, 36(%rsp) ; X64: movl $13, 32(%rsp) ; X64: xorl %ecx, %ecx @@ -131,8 +131,8 @@ define void @alloc_func_no_frameaddr() { ; X86-LABEL: alloc_func_no_frameaddr: ; 
X86: subl $8, %esp -; X86: .set Lalloc_func_no_frameaddr$frame_escape_0, 4 -; X86: .set Lalloc_func_no_frameaddr$frame_escape_1, 0 +; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4 +; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0 ; X86: movl $42, 4(%esp) ; X86: movl $13, (%esp) ; X86: pushl $0 diff --git a/llvm/test/CodeGen/X86/pr22019.ll b/llvm/test/CodeGen/X86/pr22019.ll index 4e78bae20442..262ee5fad737 100644 --- a/llvm/test/CodeGen/X86/pr22019.ll +++ b/llvm/test/CodeGen/X86/pr22019.ll @@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu" module asm "pselect = __pselect" module asm "var = __var" module asm "alias = __alias" -; CHECK: .set pselect, __pselect -; CHECK: .set var, __var -; CHECK: .set alias, __alias +; CHECK: pselect = __pselect +; CHECK: var = __var +; CHECK: alias = __alias ; CHECK: pselect: ; CHECK: retq @@ -19,5 +19,5 @@ define void @pselect() { ; CHECK: .long 0 @var = global i32 0 -; CHECK: .set alias, var +; CHECK: alias = var @alias = alias i32, ptr @var diff --git a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll index 3acf999fc423..bd51ca76c59d 100644 --- a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll +++ b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll @@ -58,7 +58,7 @@ entry: ; CHECK: pushl %edi ; CHECK: pushl %esi -; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]] +; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%ebp) ; CHECK: movl $L__ehtable$main, ; EH state 0 @@ -78,7 +78,7 @@ entry: ; CHECK: calll _printf ; CHECK: .section .xdata,"dr" -; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]] +; CHECK: Lmain$parent_frame_offset = [[reg_offs]] ; CHECK: .p2align 2 ; CHECK: L__ehtable$main ; CHECK-NEXT: .long -1 diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll index 7558c4389be5..d958580e5925 100644 --- a/llvm/test/CodeGen/X86/seh-catchpad.ll +++ b/llvm/test/CodeGen/X86/seh-catchpad.ll @@ 
-119,7 +119,7 @@ __except.ret: ; preds = %catch.dispatch.7 ; CHECK: jmp .LBB1_[[epilogue]] ; CHECK: .seh_handlerdata -; CHECK-NEXT: .set .Lmain$parent_frame_offset, 32 +; CHECK-NEXT: .Lmain$parent_frame_offset = 32 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll index 28e5cf68dd27..41823dfb38f0 100644 --- a/llvm/test/CodeGen/X86/seh-finally.ll +++ b/llvm/test/CodeGen/X86/seh-finally.ll @@ -26,7 +26,7 @@ lpad: ; preds = %entry ; X64: retq ; X64: .seh_handlerdata -; X64-NEXT: .set .Lmain$parent_frame_offset, 32 +; X64-NEXT: .Lmain$parent_frame_offset = 32 ; X64-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites ; X64-NEXT: .Llsda_begin0: ; X64-NEXT: .long .Ltmp0@IMGREL # LabelStart diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll index 99b81f0eb1bb..63e91d33d400 100644 --- a/llvm/test/CodeGen/X86/seh-no-invokes.ll +++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll @@ -15,7 +15,7 @@ ; label. This was PR30431. ; CHECK-LABEL: _f: # @f -; CHECK: .set Lf$parent_frame_offset, 0 +; CHECK: Lf$parent_frame_offset = 0 ; CHECK: retl ; CHECK-LABEL: "?filt$0@0@f@@": # @"\01?filt$0@0@f@@" diff --git a/llvm/test/CodeGen/X86/seh-stack-realign.ll b/llvm/test/CodeGen/X86/seh-stack-realign.ll index 2869bff82231..ae687343cc50 100644 --- a/llvm/test/CodeGen/X86/seh-stack-realign.ll +++ b/llvm/test/CodeGen/X86/seh-stack-realign.ll @@ -51,7 +51,7 @@ entry: ; Check that we can get the exception code from eax to the printf. 
; CHECK-LABEL: _main: -; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]] +; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%esi) ; CHECK: movl $L__ehtable$main, ; EH state 0 @@ -71,7 +71,7 @@ entry: ; CHECK: calll _printf ; CHECK: .section .xdata,"dr" -; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]] +; CHECK: Lmain$parent_frame_offset = [[reg_offs]] ; CHECK: L__ehtable$main ; CHECK-NEXT: .long -1 ; CHECK-NEXT: .long _filt$main diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll index d8fcf6d86fa4..ecbbaf3ab362 100644 --- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll +++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll @@ -34,12 +34,12 @@ define i32 @foo(i32 %x) nounwind ssp { ; CHECK-NEXT: retq ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .data_region jt32 -; CHECK-NEXT: .set L0_0_set_2, LBB0_2-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_3, LBB0_3-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_4, LBB0_4-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_5, LBB0_5-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_6, LBB0_6-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_7, LBB0_7-LJTI0_0 +; CHECK-NEXT: L0_0_set_2 = LBB0_2-LJTI0_0 +; CHECK-NEXT: L0_0_set_3 = LBB0_3-LJTI0_0 +; CHECK-NEXT: L0_0_set_4 = LBB0_4-LJTI0_0 +; CHECK-NEXT: L0_0_set_5 = LBB0_5-LJTI0_0 +; CHECK-NEXT: L0_0_set_6 = LBB0_6-LJTI0_0 +; CHECK-NEXT: L0_0_set_7 = LBB0_7-LJTI0_0 ; CHECK-NEXT: LJTI0_0: ; CHECK-NEXT: .long L0_0_set_2 ; CHECK-NEXT: .long L0_0_set_3 diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll index 16322cbe9980..9e44299083d4 100644 --- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll +++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: "?fin$0@0@main@@" ; CHECK: .seh_handlerdata -; CHECK: .set ".L?fin$0@0@main@@$parent_frame_offset", 48 +; CHECK: ".L?fin$0@0@main@@$parent_frame_offset" = 48 ; CHECK-NEXT: .long 
(.Llsda_end1-.Llsda_begin1)/16 ; CHECK-NEXT: .Llsda_begin1: ; CHECK-NEXT: .long .Ltmp diff --git a/llvm/test/CodeGen/XCore/globals.ll b/llvm/test/CodeGen/XCore/globals.ll index 134bbb3444b5..186cfda97104 100644 --- a/llvm/test/CodeGen/XCore/globals.ll +++ b/llvm/test/CodeGen/XCore/globals.ll @@ -127,4 +127,4 @@ entry: @array = global [10 x i16] zeroinitializer, align 2 ; CHECK: .globl array.globound -; CHECK: .set array.globound, 10 +; CHECK: array.globound = 10 diff --git a/llvm/test/CodeGen/XCore/linkage.ll b/llvm/test/CodeGen/XCore/linkage.ll index 93edf01cf8a9..5bfb83d964df 100644 --- a/llvm/test/CodeGen/XCore/linkage.ll +++ b/llvm/test/CodeGen/XCore/linkage.ll @@ -19,14 +19,14 @@ define protected void @test_protected() { } ; CHECK: .globl array.globound -; CHECK: .set array.globound, 2 +; CHECK: array.globound = 2 ; CHECK: .weak array.globound ; CHECK: .globl array ; CHECK: .weak array @array = weak global [2 x i32] zeroinitializer ; CHECK: .globl ac.globound -; CHECK: .set ac.globound, 2 +; CHECK: ac.globound = 2 ; CHECK: .weak ac.globound ; CHECK: .globl ac ; CHECK: .weak ac diff --git a/llvm/test/DebugInfo/X86/dbg-value-range.ll b/llvm/test/DebugInfo/X86/dbg-value-range.ll index 0d49b5eeefd1..a6ede2814aba 100644 --- a/llvm/test/DebugInfo/X86/dbg-value-range.ll +++ b/llvm/test/DebugInfo/X86/dbg-value-range.ll @@ -49,9 +49,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ;CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]] ;CHECK:Ldebug_loc0: -;CHECK-NEXT: .set Lset{{.*}}, +;CHECK-NEXT: Lset{{.*}} = ;CHECK-NEXT: .quad -;CHECK-NEXT: .set [[CLOBBER_OFF:Lset.*]], [[CLOBBER]]-{{.*}} +;CHECK-NEXT: [[CLOBBER_OFF:Lset.*]] = [[CLOBBER]]-{{.*}} ;CHECK-NEXT: .quad [[CLOBBER_OFF]] ;CHECK-NEXT: .short 1 ## Loc expr size ;CHECK-NEXT: .byte 85 ## DW_OP_reg diff --git a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll index 446f31f9a912..8d4d065641fc 100644 --- 
a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll +++ b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll @@ -64,11 +64,11 @@ ; PR15408 ; ASM: Lcu_begin0: ; ASM-NOT: Lcu_begin -; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list +; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list ; ASM-NEXT: .long Lset[[LT]] ; ASM: Lcu_begin1: ; ASM-NOT: Lcu_begin -; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list +; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list ; ASM-NEXT: .long Lset[[LT]] define i32 @test(i32 %a) nounwind uwtable ssp !dbg !5 { entry: diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s index 14ac11f581a5..b2ec5b6ac367 100644 --- a/llvm/test/MC/AArch64/basic-a64-instructions.s +++ b/llvm/test/MC/AArch64/basic-a64-instructions.s @@ -3349,7 +3349,7 @@ _func: .equ equvalue, 0x0001 movk x1, equvalue, lsl 16 -// CHECK: .set equvalue, 1 +// CHECK: equvalue = 1 // CHECK-NEXT: movk x1, #1, lsl #16 // encoding: [0x21,0x00,0xa0,0xf2] movz x2, #:abs_g0:sym diff --git a/llvm/test/MC/AsmParser/assignment.s b/llvm/test/MC/AsmParser/assignment.s index 6f84a1c338da..8c8984c12ac3 100644 --- a/llvm/test/MC/AsmParser/assignment.s +++ b/llvm/test/MC/AsmParser/assignment.s @@ -1,22 +1,22 @@ # RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s # CHECK: TEST0: -# CHECK: .set a, 0 +# CHECK: a = 0 TEST0: a = 0 # CHECK: TEST1: -# CHECK: .set b, 0 +# CHECK: b = 0 TEST1: - .set b, 0 + b = 0 # CHECK: .globl _f1 -# CHECK: .set _f1, 0 +# CHECK: _f1 = 0 .globl _f1 _f1 = 0 # CHECK: .globl _f2 -# CHECK: .set _f2, 0 +# CHECK: _f2 = 0 .globl _f2 - .set _f2, 0 + _f2 = 0 diff --git a/llvm/test/MC/AsmParser/directive_include.s b/llvm/test/MC/AsmParser/directive_include.s index 8d2ef2753b23..f53bc671fc64 100644 --- a/llvm/test/MC/AsmParser/directive_include.s +++ 
b/llvm/test/MC/AsmParser/directive_include.s @@ -2,7 +2,7 @@ # CHECK: TESTA: # CHECK: TEST0: -# CHECK: .set a, 0 +# CHECK: a = 0 # CHECK: TESTB: TESTA: .include "directive\137set.s" # "\137" is underscore "_" diff --git a/llvm/test/MC/AsmParser/directive_set.s b/llvm/test/MC/AsmParser/directive_set.s index 65dd33d1d54f..4b93de01b309 100644 --- a/llvm/test/MC/AsmParser/directive_set.s +++ b/llvm/test/MC/AsmParser/directive_set.s @@ -1,13 +1,13 @@ # RUN: llvm-mc -triple i386-unknown-elf %s | FileCheck %s # CHECK: TEST0: -# CHECK: .set a, 0 +# CHECK: a = 0 # CHECK-NOT: .no_dead_strip a TEST0: - .set a, 0 + a = 0 # CHECK: TEST1: -# CHECK: .set a, 0 +# CHECK: a = 0 # CHECK-NOT: .no_dead_strip a TEST1: .equ a, 0 diff --git a/llvm/test/MC/AsmParser/include.ll b/llvm/test/MC/AsmParser/include.ll index 3321f0a6a287..22c9eaf7a36e 100644 --- a/llvm/test/MC/AsmParser/include.ll +++ b/llvm/test/MC/AsmParser/include.ll @@ -10,5 +10,5 @@ entry: ret void } -; CHECK: .set MODULE, 1 -; CHECK: .set FUNCTION, 1 +; CHECK: MODULE = 1 +; CHECK: FUNCTION = 1 diff --git a/llvm/test/MC/AsmParser/labels.s b/llvm/test/MC/AsmParser/labels.s index 599ce72c44ee..6a9870b655f2 100644 --- a/llvm/test/MC/AsmParser/labels.s +++ b/llvm/test/MC/AsmParser/labels.s @@ -18,12 +18,12 @@ foo: // CHECK: addl $24, a$b+10(%eax) addl $24, ("a$b" + 10)(%eax) -// CHECK: .set b$c, 10 +// CHECK: b$c = 10 "b$c" = 10 // CHECK: addl $10, %eax addl $"b$c", %eax -// CHECK: .set "a 0", 11 +// CHECK: "a 0" = 11 .set "a 0", 11 // CHECK: .long 11 @@ -49,7 +49,7 @@ foo: // CHECX: .lsym "a 8",1 // .lsym "a 8", 1 -// CHECK: .set "a 9", a-b +// CHECK: "a 9" = a-b .set "a 9", a - b // CHECK: .long "a 9" diff --git a/llvm/test/MC/AsmParser/macro-arg-darwin.s b/llvm/test/MC/AsmParser/macro-arg-darwin.s index 8671107539ce..88c63dd488be 100644 --- a/llvm/test/MC/AsmParser/macro-arg-darwin.s +++ b/llvm/test/MC/AsmParser/macro-arg-darwin.s @@ -38,7 +38,7 @@ bar .endif .endm .macro bottom - .set fred, $0 + fred = $0 .endm .text @@ 
-49,7 +49,7 @@ top bar, 42 // CHECK: _foo: // CHECK-NOT: fred // CHECK: _bar -// CHECK-NEXT: .set fred, 42 +// CHECK-NEXT: fred = 42 .macro foo diff --git a/llvm/test/MC/AsmParser/motorola_integers.s b/llvm/test/MC/AsmParser/motorola_integers.s index c75d9a5e0cb1..1ec2e02e97f0 100644 --- a/llvm/test/MC/AsmParser/motorola_integers.s +++ b/llvm/test/MC/AsmParser/motorola_integers.s @@ -1,10 +1,10 @@ # RUN: llvm-mc -triple i386-unknown-unknown -motorola-integers %s | FileCheck %s -# CHECK: .set a, 2882400009 -.set a, $aBcDeF09 -# CHECK: .set b, 256 -.set b, $0100 -# CHECK: .set c, 10 -.set c, %01010 -# CHECK: .set d, 1 -.set d, %1 +# CHECK: a = 2882400009 +a = $aBcDeF09 +# CHECK: b = 256 +b = $0100 +# CHECK: c = 10 +c = %01010 +# CHECK: d = 1 +d = %1 diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s index 4a027c6e796a..f948d650da94 100644 --- a/llvm/test/MC/Mips/cpsetup.s +++ b/llvm/test/MC/Mips/cpsetup.s @@ -196,7 +196,7 @@ IMM_8 = 8 # ALL-LABEL: : # ASM-LABEL: t1b: -# ASM-NEXT: .set IMM_8, 8 +# ASM-NEXT: IMM_8 = 8 # O32-NOT: __cerror From 95bbaca6c1dcabb03bd67aabe3aaa4730a11200d Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Thu, 12 Jun 2025 10:54:01 +0530 Subject: [PATCH 0094/1322] [AArch64] Extend usage of `XAR` instruction for fixed-length operations (#139460) --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 102 +++++-- llvm/test/CodeGen/AArch64/xar.ll | 250 +++++++++++++++++- 2 files changed, 324 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 11cb91fbe02d..009d69b2b943 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4606,7 +4606,33 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { return false; } - if (!Subtarget->hasSHA3()) + // We have Neon SHA3 XAR operation for v2i64 but for types + // v4i32, v8i16, v16i8 we can use SVE operations when SVE2-SHA3 + // 
is available. + EVT SVT; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: + case MVT::v2i32: + SVT = MVT::nxv4i32; + break; + case MVT::v8i16: + case MVT::v4i16: + SVT = MVT::nxv8i16; + break; + case MVT::v16i8: + case MVT::v8i8: + SVT = MVT::nxv16i8; + break; + case MVT::v2i64: + case MVT::v1i64: + SVT = Subtarget->hasSHA3() ? MVT::v2i64 : MVT::nxv2i64; + break; + default: + return false; + } + + if ((!SVT.isScalableVector() && !Subtarget->hasSHA3()) || + (SVT.isScalableVector() && !Subtarget->hasSVE2())) return false; if (N0->getOpcode() != AArch64ISD::VSHL || @@ -4632,7 +4658,8 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { SDValue Imm = CurDAG->getTargetConstant( ShAmt, DL, N0.getOperand(1).getValueType(), false); - if (ShAmt + HsAmt != 64) + unsigned VTSizeInBits = VT.getScalarSizeInBits(); + if (ShAmt + HsAmt != VTSizeInBits) return false; if (!IsXOROperand) { @@ -4640,33 +4667,76 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { SDNode *MOV = CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, MVT::v2i64, Zero); SDValue MOVIV = SDValue(MOV, 0); + R1 = N1->getOperand(0); R2 = MOVIV; } - // If the input is a v1i64, widen to a v2i64 to use XAR. 
- assert((VT == MVT::v1i64 || VT == MVT::v2i64) && "Unexpected XAR type!"); - if (VT == MVT::v1i64) { - EVT SVT = MVT::v2i64; + if (SVT != VT) { SDValue Undef = - SDValue(CurDAG->getMachineNode(AArch64::IMPLICIT_DEF, DL, SVT), 0); - SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, SVT), 0); + + if (SVT.isScalableVector() && VT.is64BitVector()) { + EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext()); + + SDValue UndefQ = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, QVT), 0); + SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + + R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT, + UndefQ, R1, DSub), + 0); + if (R2.getValueType() == VT) + R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT, + UndefQ, R2, DSub), + 0); + } + + SDValue SubReg = CurDAG->getTargetConstant( + (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL, MVT::i32); + R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef, - R1, DSub), + R1, SubReg), 0); - if (R2.getValueType() == MVT::v1i64) + + if (SVT.isScalableVector() || R2.getValueType() != SVT) R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, - Undef, R2, DSub), + Undef, R2, SubReg), 0); } SDValue Ops[] = {R1, R2, Imm}; - SDNode *XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops); + SDNode *XAR = nullptr; - if (VT == MVT::v1i64) { - SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); - XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, - SDValue(XAR, 0), DSub); + if (SVT.isScalableVector()) { + if (auto Opc = SelectOpcodeFromVT( + SVT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S, + AArch64::XAR_ZZZI_D})) + XAR = CurDAG->getMachineNode(Opc, DL, SVT, Ops); + } else { + XAR = CurDAG->getMachineNode(AArch64::XAR, DL, SVT, Ops); + } + + assert(XAR && "Unexpected 
NULL value for XAR instruction in DAG"); + + if (SVT != VT) { + if (VT.is64BitVector() && SVT.isScalableVector()) { + EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext()); + + SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32); + SDNode *Q = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, QVT, + SDValue(XAR, 0), ZSub); + + SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, + SDValue(Q, 0), DSub); + } else { + SDValue SubReg = CurDAG->getTargetConstant( + (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL, + MVT::i32); + XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, + SDValue(XAR, 0), SubReg); + } } ReplaceNode(N, XAR); return true; diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll index d682f4f4a1bf..652617b58eaf 100644 --- a/llvm/test/CodeGen/AArch64/xar.ll +++ b/llvm/test/CodeGen/AArch64/xar.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s ; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s +; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s + +/* 128-bit vectors */ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) { ; SHA3-LABEL: xar: @@ -14,6 +17,14 @@ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) { ; NOSHA3-NEXT: shl v0.2d, v1.2d, #10 ; NOSHA3-NEXT: usra v0.2d, v1.2d, #54 ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #54 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret %a = xor <2 x i64> %x, %y %b = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) ret <2 x i64> %b @@ -34,24 +45,40 @@ 
define <1 x i64> @xar_v1i64(<1 x i64> %a, <1 x i64> %b) { ; NOSHA3-NEXT: shl d0, d1, #1 ; NOSHA3-NEXT: usra d0, d1, #63 ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v1i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #63 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret %v.val = xor <1 x i64> %a, %b %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1)) ret <1 x i64> %fshl } -define <2 x i64> @xar_instead_of_or1(<2 x i64> %r) { -; SHA3-LABEL: xar_instead_of_or1: +define <2 x i64> @xar_instead_of_or_v2i64(<2 x i64> %r) { +; SHA3-LABEL: xar_instead_of_or_v2i64: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: movi v1.2d, #0000000000000000 ; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #39 ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or1: +; NOSHA3-LABEL: xar_instead_of_or_v2i64: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: shl v1.2d, v0.2d, #25 ; NOSHA3-NEXT: usra v1.2d, v0.2d, #39 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v2i64: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #39 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 25)) ret <2 x i64> %or @@ -72,67 +99,266 @@ define <1 x i64> @xar_instead_of_or_v1i64(<1 x i64> %v.val) { ; NOSHA3-NEXT: usra d1, d0, #63 ; NOSHA3-NEXT: fmov d0, d1 ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v1i64: +; SVE2: // %bb.0: +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #63 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, 
<1 x i64> %v.val, <1 x i64> splat (i64 1)) ret <1 x i64> %fshl } -define <4 x i32> @xar_instead_of_or2(<4 x i32> %r) { -; SHA3-LABEL: xar_instead_of_or2: +define <4 x i32> @xar_instead_of_or_v4i32(<4 x i32> %r) { +; SHA3-LABEL: xar_instead_of_or_v4i32: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: shl v1.4s, v0.4s, #25 ; SHA3-NEXT: usra v1.4s, v0.4s, #7 ; SHA3-NEXT: mov v0.16b, v1.16b ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or2: +; NOSHA3-LABEL: xar_instead_of_or_v4i32: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: shl v1.4s, v0.4s, #25 ; NOSHA3-NEXT: usra v1.4s, v0.4s, #7 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v4i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.s, z0.s, z1.s, #7 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <4 x i32> @llvm.fshl.v2i32(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 25)) ret <4 x i32> %or } -define <8 x i16> @xar_instead_of_or3(<8 x i16> %r) { -; SHA3-LABEL: xar_instead_of_or3: +define <8 x i16> @xar_instead_of_or_v8i16(<8 x i16> %r) { +; SHA3-LABEL: xar_instead_of_or_v8i16: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: shl v1.8h, v0.8h, #9 ; SHA3-NEXT: usra v1.8h, v0.8h, #7 ; SHA3-NEXT: mov v0.16b, v1.16b ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or3: +; NOSHA3-LABEL: xar_instead_of_or_v8i16: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: shl v1.8h, v0.8h, #9 ; NOSHA3-NEXT: usra v1.8h, v0.8h, #7 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v8i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.h, z0.h, z1.h, #7 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <8 x i16> @llvm.fshl.v2i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 25)) ret <8 x i16> %or } 
-define <16 x i8> @xar_instead_of_or4(<16 x i8> %r) { -; SHA3-LABEL: xar_instead_of_or4: +define <16 x i8> @xar_instead_of_or_v16i8(<16 x i8> %r) { +; SHA3-LABEL: xar_instead_of_or_v16i8: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: add v1.16b, v0.16b, v0.16b ; SHA3-NEXT: usra v1.16b, v0.16b, #7 ; SHA3-NEXT: mov v0.16b, v1.16b ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or4: +; NOSHA3-LABEL: xar_instead_of_or_v16i8: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: add v1.16b, v0.16b, v0.16b ; NOSHA3-NEXT: usra v1.16b, v0.16b, #7 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v16i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.b, z0.b, z1.b, #7 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <16 x i8> @llvm.fshl.v2i8(<16 x i8> %r, <16 x i8> %r, <16 x i8> splat (i8 25)) ret <16 x i8> %or } +/* 64 bit vectors */ + +define <2 x i32> @xar_v2i32(<2 x i32> %x, <2 x i32> %y) { +; SHA3-LABEL: xar_v2i32: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; SHA3-NEXT: shl v0.2s, v1.2s, #25 +; SHA3-NEXT: usra v0.2s, v1.2s, #7 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v2i32: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: shl v0.2s, v1.2s, #25 +; NOSHA3-NEXT: usra v0.2s, v1.2s, #7 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v2i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.s, z0.s, z1.s, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %a = xor <2 x i32> %x, %y + %b = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %a, <2 x i32> ) + ret <2 x i32> %b +} + +define <2 x i32> @xar_instead_of_or_v2i32(<2 x i32> %r) { +; SHA3-LABEL: xar_instead_of_or_v2i32: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: shl v1.2s, 
v0.2s, #25 +; SHA3-NEXT: usra v1.2s, v0.2s, #7 +; SHA3-NEXT: fmov d0, d1 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_instead_of_or_v2i32: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: shl v1.2s, v0.2s, #25 +; NOSHA3-NEXT: usra v1.2s, v0.2s, #7 +; NOSHA3-NEXT: fmov d0, d1 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v2i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.s, z0.s, z1.s, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %or = call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 25)) + ret <2 x i32> %or +} + +define <4 x i16> @xar_v4i16(<4 x i16> %x, <4 x i16> %y) { +; SHA3-LABEL: xar_v4i16: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; SHA3-NEXT: shl v0.4h, v1.4h, #9 +; SHA3-NEXT: usra v0.4h, v1.4h, #7 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v4i16: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: shl v0.4h, v1.4h, #9 +; NOSHA3-NEXT: usra v0.4h, v1.4h, #7 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v4i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.h, z0.h, z1.h, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %a = xor <4 x i16> %x, %y + %b = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %a, <4 x i16> splat (i16 25)) + ret <4 x i16> %b +} + +define <4 x i16> @xar_instead_of_or_v4i16(<4 x i16> %r) { +; SHA3-LABEL: xar_instead_of_or_v4i16: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: shl v1.4h, v0.4h, #9 +; SHA3-NEXT: usra v1.4h, v0.4h, #7 +; SHA3-NEXT: fmov d0, d1 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_instead_of_or_v4i16: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: shl v1.4h, v0.4h, #9 +; NOSHA3-NEXT: usra v1.4h, v0.4h, #7 +; NOSHA3-NEXT: fmov d0, d1 +; NOSHA3-NEXT: ret +; +; 
SVE2-LABEL: xar_instead_of_or_v4i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.h, z0.h, z1.h, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %or = call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 25)) + ret <4 x i16> %or +} + +define <8 x i8> @xar_v8i8(<8 x i8> %x, <8 x i8> %y) { +; SHA3-LABEL: xar_v8i8: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; SHA3-NEXT: add v0.8b, v1.8b, v1.8b +; SHA3-NEXT: usra v0.8b, v1.8b, #7 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v8i8: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: add v0.8b, v1.8b, v1.8b +; NOSHA3-NEXT: usra v0.8b, v1.8b, #7 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v8i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.b, z0.b, z1.b, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %a = xor <8 x i8> %x, %y + %b = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a, <8 x i8> splat (i8 25)) + ret <8 x i8> %b +} + +define <8 x i8> @xar_instead_of_or_v8i8(<8 x i8> %r) { +; SHA3-LABEL: xar_instead_of_or_v8i8: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: add v1.8b, v0.8b, v0.8b +; SHA3-NEXT: usra v1.8b, v0.8b, #7 +; SHA3-NEXT: fmov d0, d1 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_instead_of_or_v8i8: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: add v1.8b, v0.8b, v0.8b +; NOSHA3-NEXT: usra v1.8b, v0.8b, #7 +; NOSHA3-NEXT: fmov d0, d1 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v8i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.b, z0.b, z1.b, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %or = call <8 x i8> @llvm.fshl(<8 x 
i8> %r, <8 x i8> %r, <8 x i8> splat (i8 25)) + ret <8 x i8> %or +} + declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) From 2efff47363f18966cd37461323b5db5418183534 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Wed, 11 Jun 2025 22:43:06 -0700 Subject: [PATCH 0095/1322] [NFCI][msan] Show that shadow for partially undefined constant vectors is computed as fully initialized (#143823) This happens because `getShadow(Value *V)` has a special case for fully undefined/poisoned values, but partially undefined values fall-through and are given a clean shadow. This leads to false negatives (no false positives). Note: MSan correctly handles InsertElementInst, but the shadow of the initial constant vector may still be wrong and be propagated. Showing that the same approximation happens for other composite types is left as an exercise for the reader. --- .../Instrumentation/MemorySanitizer.cpp | 4 + .../MemorySanitizer/partial-poison.ll | 78 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index c2315d5de704..d3c6a7151ec3 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2085,6 +2085,10 @@ struct MemorySanitizerVisitor : public InstVisitor { assert(ShadowPtr && "Could not find shadow for an argument"); return ShadowPtr; } + + // TODO: Partially undefined vectors are handled by the fall-through case + // below (see partial-poison.ll); this causes false negatives. + // For everything else the shadow is zero. 
return getCleanShadow(V); } diff --git a/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll new file mode 100644 index 000000000000..5164441c17e1 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -S -passes='msan' 2>&1 | FileCheck %s +; +; Test case to show that MSan computes shadows for partially poisoned vectors +; as fully initialized, resulting in false negatives. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <2 x i64> @left_poison(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @left_poison( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @right_poison(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @right_poison( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @full_poison(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @full_poison( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> poison +; + ret <2 x i64> +} + +define <2 x i64> @no_poison_or_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @no_poison_or_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) 
#[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> splat (i64 42) +; + ret <2 x i64> +} + +define <2 x i64> @left_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @left_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @right_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @right_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @full_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @full_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> undef +; + ret <2 x i64> +} From bec85f3b187f57713e01191381c88134e122bd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 12 Jun 2025 08:58:26 +0300 Subject: [PATCH 0096/1322] [LLD] [COFF] [test] Readd lto-late-arm.ll (#143494) This testcase was removed in 4cafd28b7dd92080103d11cccc78d9a2f01e1242, as a082f665f85b1002ab22af263eeafceca5288657 had made it no longer trigger the error that it was supposed to do. (Because the latter of those two commits makes the symbol "__rt_sdiv" be included among the potential libcalls listed by lto::LTO::getRuntimeLibcallSymbols().) Readd the test as a positive test, making sure that such libcalls can get linked. 
We do have preexisting test coverage for LTO libcalls overall in libcall-archive.ll, but readd this test to cover specifically the ARM division helper functions as well. --- lld/test/COFF/lto-late-arm.ll | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 lld/test/COFF/lto-late-arm.ll diff --git a/lld/test/COFF/lto-late-arm.ll b/lld/test/COFF/lto-late-arm.ll new file mode 100644 index 000000000000..1070fc52a013 --- /dev/null +++ b/lld/test/COFF/lto-late-arm.ll @@ -0,0 +1,38 @@ +; REQUIRES: arm + +;; A bitcode file can generate undefined references to symbols that weren't +;; listed as undefined on the bitcode file itself, when lowering produces +;; calls to e.g. builtin helper functions. Ideally all those functions are +;; listed by lto::LTO::getRuntimeLibcallSymbols(), then we successfully +;; can link cases when the helper functions are provided as bitcode too. +;; (In practice, compiler-rt builtins are always compiled with -fno-lto, so +;; this shouldn't really happen anyway.) 
+ +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: llvm-as %t.dir/main.ll -o %t.main.obj +; RUN: llvm-as %t.dir/sdiv.ll -o %t.sdiv.obj +; RUN: llvm-ar rcs %t.sdiv.lib %t.sdiv.obj + +; RUN: lld-link /entry:entry %t.main.obj %t.sdiv.lib /out:%t.exe /subsystem:console + +;--- main.ll +target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7-w64-windows-gnu" + +@num = dso_local global i32 100 + +define dso_local arm_aapcs_vfpcc i32 @entry(i32 %param) { +entry: + %0 = load i32, ptr @num + %div = sdiv i32 %0, %param + ret i32 %div +} +;--- sdiv.ll +target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7-w64-windows-gnu" + +define dso_local arm_aapcs_vfpcc void @__rt_sdiv() { +entry: + ret void +} From 9d491bc602c2d9730cb42fe25f0753471a3af389 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 12 Jun 2025 07:03:09 +0100 Subject: [PATCH 0097/1322] [AArch64][GlobalISel] Enable extract_vec_elt_combines postlegalization. 
--- llvm/lib/Target/AArch64/AArch64Combine.td | 2 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 51 +++++++------------ 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 571e2692cbff..ca09598464d1 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -361,7 +361,7 @@ def AArch64PostLegalizerCombiner ptr_add_immed_chain, overlapping_and, split_store_zero_128, undef_combines, select_to_minmax, or_to_bsp, combine_concat_vector, - commute_constant_to_rhs, + commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, combine_mul_cmlt, combine_use_vector_truncate, extmultomull]> { } diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 77483ebb2235..d6d323530946 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -596,23 +596,15 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { ; CHECK-GI-NEXT: mov.b v1[3], w8 ; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 ; CHECK-GI-NEXT: mvn.8b v0, v0 -; CHECK-GI-NEXT: umov.b w8, v0[0] -; CHECK-GI-NEXT: umov.b w9, v0[1] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: umov.b w8, v0[2] -; CHECK-GI-NEXT: mov.s v1[1], w9 -; CHECK-GI-NEXT: umov.b w9, v0[3] -; CHECK-GI-NEXT: mov.s v1[2], w8 -; CHECK-GI-NEXT: mov.s v1[3], w9 -; CHECK-GI-NEXT: mov.s w8, v1[1] -; CHECK-GI-NEXT: mov.s w9, v1[2] -; CHECK-GI-NEXT: fmov w11, s1 -; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] ; CHECK-GI-NEXT: and w8, w8, #0x1 -; CHECK-GI-NEXT: bfi w11, w8, #1, #31 -; CHECK-GI-NEXT: and w8, w9, #0x1 -; CHECK-GI-NEXT: and w9, w10, #0x1 -; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 
+; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 ; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 ; CHECK-GI-NEXT: strb w8, [sp, #15] ; CHECK-GI-NEXT: and w0, w8, #0xff @@ -871,28 +863,19 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; CHECK-GI-NEXT: cmtst.4s v1, v1, v1 ; CHECK-GI-NEXT: mov.s w8, v1[1] ; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 ; CHECK-GI-NEXT: mov.s w10, v1[3] -; CHECK-GI-NEXT: mov.h v1[1], w8 -; CHECK-GI-NEXT: mov.s w8, v0[1] -; CHECK-GI-NEXT: mov.h v1[2], w9 -; CHECK-GI-NEXT: mov.h v1[3], w10 -; CHECK-GI-NEXT: mov.h v1[4], v0[0] -; CHECK-GI-NEXT: mov.h v1[5], w8 -; CHECK-GI-NEXT: umov.h w8, v1[1] -; CHECK-GI-NEXT: umov.h w9, v1[0] -; CHECK-GI-NEXT: umov.h w10, v1[2] -; CHECK-GI-NEXT: umov.h w11, v1[3] ; CHECK-GI-NEXT: and w8, w8, #0x1 -; CHECK-GI-NEXT: bfi w9, w8, #1, #31 -; CHECK-GI-NEXT: and w8, w10, #0x1 -; CHECK-GI-NEXT: umov.h w10, v1[4] -; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 -; CHECK-GI-NEXT: and w9, w11, #0x1 -; CHECK-GI-NEXT: umov.h w11, v1[5] -; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 ; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: mov.s w10, v0[1] +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: and w9, w9, #0x1 ; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 -; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 ; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 ; CHECK-GI-NEXT: and w8, w8, #0x3f ; CHECK-GI-NEXT: strb w8, [sp, #15] From 3f0cf742ac4eb3437450f8f263081ea951248851 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 14:40:38 +0800 Subject: [PATCH 0098/1322] [C++20] [Modules] [Reduced BMI] Don't write specializations with local args Close https://github.com/llvm/llvm-project/issues/119947 As discussed in the above thread, we 
shouldn't write specializations with local args in reduced BMI, since users can't find such specializations anyway. --- clang/lib/Serialization/ASTWriterDecl.cpp | 45 +++++++++++++++++++ clang/test/Modules/pr119947.cppm | 54 +++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 clang/test/Modules/pr119947.cppm diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 8f82324a2753..052cb5a253bf 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -221,6 +221,48 @@ namespace clang { Record.AddDeclRef(F.second); } + template bool shouldSkipWritingSpecializations(T *Spec) { + // Now we will only avoid writing specializations if we're generating + // reduced BMI. + if (!GeneratingReducedBMI) + return false; + + assert((isa(Spec))); + + ArrayRef Args; + if (auto *CTSD = dyn_cast(Spec)) + Args = CTSD->getTemplateArgs().asArray(); + else if (auto *VTSD = dyn_cast(Spec)) + Args = VTSD->getTemplateArgs().asArray(); + else + Args = cast(Spec) + ->getTemplateSpecializationArgs() + ->asArray(); + + // If there is any template argument is TULocal, we can avoid writing the + // specialization since the consumers of reduced BMI won't get the + // specialization anyway. + for (const TemplateArgument &TA : Args) { + switch (TA.getKind()) { + case TemplateArgument::Type: { + Linkage L = TA.getAsType()->getLinkage(); + if (!isExternallyVisible(L)) + return true; + break; + } + case TemplateArgument::Declaration: + if (!TA.getAsDecl()->isExternallyVisible()) + return true; + break; + default: + break; + } + } + + return false; + } + /// Add to the record the first template specialization from each module /// file that provides a declaration of D. 
We store the DeclId and an /// ODRHash of the template arguments of D which should provide enough @@ -235,6 +277,9 @@ namespace clang { CollectFirstDeclFromEachModule(D, /*IncludeLocal*/ true, Firsts); for (const auto &F : Firsts) { + if (shouldSkipWritingSpecializations(F.second)) + continue; + if (isa(F.second)) PartialSpecsInMap.push_back(F.second); diff --git a/clang/test/Modules/pr119947.cppm b/clang/test/Modules/pr119947.cppm new file mode 100644 index 000000000000..40de2cad3c0d --- /dev/null +++ b/clang/test/Modules/pr119947.cppm @@ -0,0 +1,54 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -emit-llvm -o - + + +//--- a.cppm +export module a; + +struct a_inner { + ~a_inner() { + } + void f(auto) { + } +}; + +export template +struct a { + a() { + struct local {}; + inner.f(local()); + } +private: + a_inner inner; +}; + + +namespace { + +struct s { +}; + +} // namespace + +void f() { + a x; +} + +//--- use.cpp +import a; + +namespace { + +struct s { +}; + +} // namespace + +void g() { + a x; +} + From 6157028fea93ff14af18b173dd01eb431cfb6aef Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 12 Jun 2025 09:19:50 +0200 Subject: [PATCH 0099/1322] [BasicAA][ValueTracking] Increase depth for underlying object search (#143714) This depth limits a linear search (rather than the usual potentially exponential one) and is not particularly important for compile-time in practice. The change in #137297 is going to increase the length of GEP chains, so I'd like to increase this limit a bit to reduce the chance of regressions (https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2419 showed a 13% increase in SearchLimitReached). There is no particular significance to the new value of 10. Compile-time is neutral. 
--- llvm/include/llvm/Analysis/ValueTracking.h | 2 +- .../BasicAA/gep-decomposition-limit.ll | 38 +++++++++++-------- .../underlying-objects-2.ll | 5 ++- .../inline-noalias-unidentify-object.ll | 22 +++++++---- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 32ab9733d13c..e215c90b5a72 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -47,7 +47,7 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; /// The max limit of the search depth in DecomposeGEPExpression() and /// getUnderlyingObject(). -constexpr unsigned MaxLookupSearchDepth = 6; +constexpr unsigned MaxLookupSearchDepth = 10; /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. diff --git a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll index 23a96ebca848..a256ececbe56 100644 --- a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll +++ b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll @@ -2,22 +2,22 @@ ; CHECK-LABEL: Function: test ;; Before limit: -; CHECK-DAG: MustAlias: i8* %gep.add5, i8* %gep.inc5 -; CHECK-DAG: NoAlias: i8* %gep.inc3, i8* %gep.inc5 -; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc5 +; CHECK-DAG: MustAlias: i8* %gep.add9, i8* %gep.inc9 +; CHECK-DAG: NoAlias: i8* %gep.inc7, i8* %gep.inc9 +; CHECK-DAG: NoAlias: i8* %gep.inc8, i8* %gep.inc9 ;; At limit: -; CHECK-DAG: MustAlias: i8* %gep.add6, i8* %gep.inc6 -; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc6 -; CHECK-DAG: NoAlias: i8* %gep.inc5, i8* %gep.inc6 +; CHECK-DAG: MustAlias: i8* %gep.add10, i8* %gep.inc10 +; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc8 +; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc9 ;; After limit: -; CHECK-DAG: MayAlias: i8* %gep.add7, i8* %gep.inc7 -; CHECK-DAG: MayAlias: i8* %gep.inc5, i8* %gep.inc7 
-; CHECK-DAG: NoAlias: i8* %gep.inc6, i8* %gep.inc7 +; CHECK-DAG: MayAlias: i8* %gep.add11, i8* %gep.inc11 +; CHECK-DAG: MayAlias: i8* %gep.inc11, i8* %gep.inc9 +; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc11 define void @test(ptr %base) { - %gep.add5 = getelementptr i8, ptr %base, i64 5 - %gep.add6 = getelementptr i8, ptr %base, i64 6 - %gep.add7 = getelementptr i8, ptr %base, i64 7 + %gep.add9 = getelementptr i8, ptr %base, i64 9 + %gep.add10 = getelementptr i8, ptr %base, i64 10 + %gep.add11 = getelementptr i8, ptr %base, i64 11 %gep.inc1 = getelementptr i8, ptr %base, i64 1 %gep.inc2 = getelementptr i8, ptr %gep.inc1, i64 1 @@ -26,15 +26,23 @@ define void @test(ptr %base) { %gep.inc5 = getelementptr i8, ptr %gep.inc4, i64 1 %gep.inc6 = getelementptr i8, ptr %gep.inc5, i64 1 %gep.inc7 = getelementptr i8, ptr %gep.inc6, i64 1 + %gep.inc8 = getelementptr i8, ptr %gep.inc7, i64 1 + %gep.inc9 = getelementptr i8, ptr %gep.inc8, i64 1 + %gep.inc10 = getelementptr i8, ptr %gep.inc9, i64 1 + %gep.inc11 = getelementptr i8, ptr %gep.inc10, i64 1 - load i8, ptr %gep.add5 - load i8, ptr %gep.add6 - load i8, ptr %gep.add7 + load i8, ptr %gep.add9 + load i8, ptr %gep.add10 + load i8, ptr %gep.add11 load i8, ptr %gep.inc3 load i8, ptr %gep.inc4 load i8, ptr %gep.inc5 load i8, ptr %gep.inc6 load i8, ptr %gep.inc7 + load i8, ptr %gep.inc8 + load i8, ptr %gep.inc9 + load i8, ptr %gep.inc10 + load i8, ptr %gep.inc11 ret void } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll index abfdff79dc11..1d3512128678 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll @@ -127,9 +127,12 @@ for_j.body: %gepB7 = getelementptr inbounds i8, ptr %gepB6, i64 0 %gepB8 = getelementptr inbounds i8, ptr %gepB7, i64 0 %gepB9 = getelementptr inbounds i8, ptr %gepB8, i64 0 + %gepB10 = getelementptr inbounds i8, ptr 
%gepB9, i64 0 + %gepB11 = getelementptr inbounds i8, ptr %gepB10, i64 0 + %gepB12 = getelementptr inbounds i8, ptr %gepB11, i64 0 %loadPrev = load i8, ptr %gepPrev, align 1 - %loadB = load i8, ptr %gepB9, align 1 + %loadB = load i8, ptr %gepB12, align 1 %mul = mul i8 %loadPrev, %loadB diff --git a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll index 54e9ee0918ae..b7ba1b32238a 100644 --- a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll +++ b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll @@ -3,15 +3,18 @@ define i32 @caller(ptr %p) { ; CHECK-LABEL: define i32 @caller(ptr %p) { ; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) -; CHECK-NEXT: [[P_8_I:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8 -; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P_8_I]], align 4, !alias.scope !0 -; CHECK-NEXT: [[P_1_I:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: [[P_11_I:%.*]] = getelementptr i8, ptr %p, i64 11 +; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P_11_I]], align 4, !alias.scope !0 +; CHECK-NEXT: [[P_1_I:%.*]] = getelementptr i8, ptr %p, i64 1 ; CHECK-NEXT: [[P_2_I:%.*]] = getelementptr i8, ptr [[P_1_I]], i64 1 ; CHECK-NEXT: [[P_3_I:%.*]] = getelementptr i8, ptr [[P_2_I]], i64 1 ; CHECK-NEXT: [[P_4_I:%.*]] = getelementptr i8, ptr [[P_3_I]], i64 1 ; CHECK-NEXT: [[P_5_I:%.*]] = getelementptr i8, ptr [[P_4_I]], i64 1 ; CHECK-NEXT: [[P_6_I:%.*]] = getelementptr i8, ptr [[P_5_I]], i64 1 -; CHECK-NEXT: [[P_7_I:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1 +; CHECK-NEXT: [[P_7_I1:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1 +; CHECK-NEXT: [[P_8_I:%.*]] = getelementptr i8, ptr [[P_7_I1]], i64 1 +; CHECK-NEXT: [[P_9_I:%.*]] = getelementptr i8, ptr [[P_8_I]], i64 1 +; CHECK-NEXT: [[P_7_I:%.*]] = getelementptr i8, ptr [[P_9_I]], i64 1 ; CHECK-NEXT: [[P_8_ALIAS_I:%.*]] = getelementptr i8, ptr [[P_7_I]], i64 1 ; CHECK-NEXT: 
store i32 42, ptr [[P_8_ALIAS_I]], align 4 ; CHECK-NEXT: ret i32 [[V_I]] @@ -21,8 +24,8 @@ define i32 @caller(ptr %p) { } define internal i32 @callee(ptr noalias %p) { - %p.8 = getelementptr i8, ptr %p, i64 8 - %v = load i32, ptr %p.8 + %p.11 = getelementptr i8, ptr %p, i64 11 + %v = load i32, ptr %p.11 %p.1 = getelementptr i8, ptr %p, i64 1 %p.2 = getelementptr i8, ptr %p.1, i64 1 %p.3 = getelementptr i8, ptr %p.2, i64 1 @@ -30,7 +33,10 @@ define internal i32 @callee(ptr noalias %p) { %p.5 = getelementptr i8, ptr %p.4, i64 1 %p.6 = getelementptr i8, ptr %p.5, i64 1 %p.7 = getelementptr i8, ptr %p.6, i64 1 - %p.8.alias = getelementptr i8, ptr %p.7, i64 1 - store i32 42, ptr %p.8.alias + %p.8 = getelementptr i8, ptr %p.7, i64 1 + %p.9 = getelementptr i8, ptr %p.8, i64 1 + %p.10 = getelementptr i8, ptr %p.9, i64 1 + %p.11.alias = getelementptr i8, ptr %p.10, i64 1 + store i32 42, ptr %p.11.alias ret i32 %v } From 77062244ed56be61aecda28d6fede3432545f741 Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Thu, 12 Jun 2025 09:29:40 +0200 Subject: [PATCH 0100/1322] Fix two instances of -Wparentheses warnings [NFC] Add parentheses around the assert conditions. 
Without this gcc warned like ../lib/Target/AMDGPU/GCNSchedStrategy.cpp:2250: warning: suggest parentheses around '&&' within '||' [-Wparentheses] 2250 | NewMI != RegionBounds.second && "cannot remove at region end"); and ../../clang/lib/Sema/SemaOverload.cpp:11326:39: warning: suggest parentheses around '&&' within '||' [-Wparentheses] 11326 | DeferredCandidatesCount == 0 && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~ 11327 | "Unexpected deferred template candidates"); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- clang/lib/Sema/SemaOverload.cpp | 6 +++--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index cf455f4588de..89e86f49a3ca 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -11322,9 +11322,9 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, iterator &Best) { - assert(shouldDeferTemplateArgumentDeduction(S.getLangOpts()) || - DeferredCandidatesCount == 0 && - "Unexpected deferred template candidates"); + assert((shouldDeferTemplateArgumentDeduction(S.getLangOpts()) || + DeferredCandidatesCount == 0) && + "Unexpected deferred template candidates"); bool TwoPhaseResolution = DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 0f80462050cd..7165cf89ca45 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -2246,8 +2246,8 @@ void PreRARematStage::finalizeGCNSchedStage() { void GCNScheduleDAGMILive::updateRegionBoundaries( RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, MachineInstr *NewMI) { - assert(!NewMI || - NewMI != RegionBounds.second && "cannot remove at region end"); + assert((!NewMI || NewMI != RegionBounds.second) && + "cannot remove at region end"); 
if (RegionBounds.first == RegionBounds.second) { assert(NewMI && "cannot remove from an empty region"); From 2d35b568ef949717e35df664d4d9352eddbffbfd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 12 Jun 2025 09:27:24 +0100 Subject: [PATCH 0101/1322] [X86] bsf.ll - add icmp_ne coverage to bsf passthrough tests --- llvm/test/CodeGen/X86/bsf.ll | 56 ++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll index 58929115baf5..312f94c04123 100644 --- a/llvm/test/CodeGen/X86/bsf.ll +++ b/llvm/test/CodeGen/X86/bsf.ll @@ -38,13 +38,13 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB1_1 +; X86-NEXT: jne .LBB1_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; X86-NEXT: .LBB1_1: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -56,8 +56,8 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind { ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true) - %2 = icmp eq i8 %x, 0 - %3 = select i1 %2, i8 %y, i8 %1 + %2 = icmp ne i8 %x, 0 + %3 = select i1 %2, i8 %1, i8 %y ret i8 %3 } @@ -66,14 +66,14 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB2_1 +; X86-NEXT: jne .LBB2_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: orl $65536, %eax # imm = 0x10000 -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; X86-NEXT: .LBB2_1: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $65536, %eax # imm = 
0x10000 +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -87,8 +87,8 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false) - %2 = icmp eq i16 %x, 0 - %3 = select i1 %2, i16 %y, i16 %1 + %2 = icmp ne i16 %x, 0 + %3 = select i1 %2, i16 %1, i16 %y ret i16 %3 } @@ -157,12 +157,12 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB5_1 +; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; X86-NEXT: .LBB5_1: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: cmov_bsf32_undef: @@ -171,8 +171,8 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind { ; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) - %2 = icmp eq i32 %x, 0 - %3 = select i1 %2, i32 %y, i32 %1 + %2 = icmp ne i32 %x, 0 + %3 = select i1 %2, i32 %1, i32 %y ret i32 %3 } @@ -199,7 +199,7 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl $64, %eax ; X86-NEXT: orl %ecx, %esi ; X86-NEXT: jne .LBB6_7 -; X86-NEXT: .LBB6_6: +; X86-NEXT: .LBB6_6: # %cond.end ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .LBB6_7: # %cond.end @@ -218,8 +218,8 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind { ; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false) - %2 = icmp eq i64 %x, 0 - %3 = select i1 %2, i64 %y, i64 %1 + %2 = icmp ne i64 %x, 0 + %3 = select i1 %2, i64 %1, i64 %y ret i64 %3 } @@ -375,10 +375,10 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: orl %edi, %ebp ; X86-NEXT: je .LBB9_11 -; 
X86-NEXT: # %bb.1: # %select.false.sink +; X86-NEXT: # %bb.1: # %select.true.sink ; X86-NEXT: testl %edx, %edx ; X86-NEXT: jne .LBB9_2 -; X86-NEXT: # %bb.3: # %select.false.sink +; X86-NEXT: # %bb.3: # %select.true.sink ; X86-NEXT: rep bsfl %ecx, %edi ; X86-NEXT: addl $32, %edi ; X86-NEXT: testl %ebx, %ebx @@ -402,20 +402,20 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-NEXT: rep bsfl %edx, %edi ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: jne .LBB9_5 -; X86-NEXT: .LBB9_6: # %select.false.sink +; X86-NEXT: .LBB9_6: # %select.true.sink ; X86-NEXT: rep bsfl %esi, %esi ; X86-NEXT: addl $32, %esi ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: jne .LBB9_9 -; X86-NEXT: .LBB9_8: # %select.false.sink +; X86-NEXT: .LBB9_8: # %select.true.sink ; X86-NEXT: addl $64, %esi ; X86-NEXT: movl %esi, %edi -; X86-NEXT: .LBB9_9: # %select.false.sink +; X86-NEXT: .LBB9_9: # %select.true.sink ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) ; X86-NEXT: movl $0, 4(%eax) -; X86-NEXT: .LBB9_10: # %select.false.sink +; X86-NEXT: .LBB9_10: # %select.true.sink ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -427,7 +427,7 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: orq %rsi, %rax ; X64-NEXT: je .LBB9_2 -; X64-NEXT: # %bb.1: # %select.false.sink +; X64-NEXT: # %bb.1: # %select.true.sink ; X64-NEXT: rep bsfq %rdi, %rcx ; X64-NEXT: rep bsfq %rsi, %rax ; X64-NEXT: addq $64, %rax @@ -440,8 +440,8 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true) - %2 = icmp eq i128 %x, 0 - %3 = select i1 %2, i128 %y, i128 %1 + %2 = icmp ne i128 %x, 0 + %3 = select i1 %2, i128 %1, i128 %y ret i128 %3 } From 6e5a1423b752c66273bfcff35aaa8083075788a8 Mon Sep 17 00:00:00 2001 From: Ian Wood Date: Thu, 12 Jun 2025 01:28:27 -0700 Subject: [PATCH 0102/1322] [mlir] Reapply "Loosen 
restrictions on folding dynamic reshapes" (#142827) The original PR https://github.com/llvm/llvm-project/pull/137963 had a nvidia bot failure. This appears to be a flaky test because rerunning the build was successful. This change needs commit 6f2ba47 to fix incorrect usage of `getReassociationIndicesForCollapse`. Reverts llvm/llvm-project#142639 Co-authored-by: Artem Gindinson --- mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 374 +++++++++++++++--- .../Dialect/Linalg/simplify-pack-unpack.mlir | 4 +- mlir/test/Dialect/Tensor/canonicalize.mlir | 39 +- mlir/unittests/Dialect/Utils/CMakeLists.txt | 1 + .../Dialect/Utils/ReshapeOpsUtilsTest.cpp | 203 ++++++++++ 5 files changed, 561 insertions(+), 60 deletions(-) create mode 100644 mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index 1a04d702e055..3b1fdb69e8ef 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -10,6 +10,10 @@ #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/LogicalResult.h" #include #include @@ -28,67 +32,329 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType, return std::nullopt; } +namespace { +/// A simple struct to represent ReassociationIndices as an inclusive interval. +/// It's designed to be feasibly minimal, so the call sites should manage the +/// validity of the range manually. +struct ReassociationIndexRange { + /// FIXME: Signed type is used for consistency with ReassociationIndices. + /// We should consider refactoring all reassociation utilities to use unsigned + /// types. + int64_t leftIdx = 0, rightIdx = 0; + + /// Util for manual checks of the range's validity + LogicalResult verify() const { + return leftIdx >= 0 && (leftIdx <= rightIdx) ? 
success() : failure(); + } + + /// Checks range's containment within another range. Treats the edges + /// non-exclusively. + bool isInRange(const ReassociationIndexRange &outerRange) const { + return leftIdx >= outerRange.leftIdx && rightIdx <= outerRange.rightIdx; + } + + unsigned size() const { + assert(succeeded(verify())); + return rightIdx - leftIdx + 1; + } + bool containsSingleIndex() const { return size() == 1; } + + /// Collects indices that do not overlap between this and another range. + ReassociationIndices + getNonOverlappingIndicesWith(ReassociationIndexRange &rhs) const { + if (rightIdx < rhs.leftIdx) { + // The intervals do not overlap - concatenate the indices from both. + auto jointFullIndices = getFullIndices(); + jointFullIndices.append(rhs.getFullIndices()); + return jointFullIndices; + } + ReassociationIndices result; + // Handle the chunk left of the overlapping range. + int64_t leftStart = std::min(leftIdx, rhs.leftIdx); + int64_t leftEnd = std::max(leftIdx, rhs.leftIdx); + llvm::append_range(result, llvm::seq(leftStart, leftEnd)); + // Handle the chunk right of the overlapping range. Symmetrically, we should + // skip the edge of the overlap AND include the rightmost index. + int64_t rightStart = std::min(rightIdx, rhs.rightIdx) + 1; + int64_t rightEnd = std::max(rightIdx, rhs.rightIdx); + if (rightStart < rightEnd) + llvm::append_range(result, llvm::seq_inclusive(rightStart, rightEnd)); + return result; + } + + /// Converts the range into ReassociationIndices. + ReassociationIndices getFullIndices() const { + ReassociationIndices result; + for (int64_t idx = leftIdx; idx <= rightIdx; ++idx) { + result.push_back(idx); + } + return result; + } +}; +} // namespace + +/// Starting from `sourceStartIdx`, searches `sourceShape` for the first +/// sequence that can be collapsed into a dynamic dimension (at least one must +/// be present in the source). +/// By default, lazily returns once the first dynamic dimension has been found. 
+/// Setting `matchGreedily` as `true` will also mark all subsequent +/// source dimensions for collapsing into the target. +static FailureOr +findReassociationRangeForDynamicDim(ArrayRef sourceShape, + int64_t sourceStartIdx, + bool matchGreedily = false) { + const unsigned numSourceDims = sourceShape.size(); + ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1}; + std::optional resultRange = std::nullopt; + + ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx}; + for (; iterationRange.isInRange(sourceShapeAsRange); + iterationRange.rightIdx++) { + int64_t sourceSize = sourceShape[iterationRange.rightIdx]; + if (sourceSize == ShapedType::kDynamic) { + resultRange = iterationRange; + break; + } + } + if (!resultRange) + return failure(); + if (matchGreedily) + resultRange->rightIdx = sourceShapeAsRange.rightIdx; + return *resultRange; +} + +/// Starting from `sourceStartIdx`, searches `sourceShape` for the first +/// sequence of static dimensions such that their product matches `targetSize`. +/// By default, lazily returns once the product matches the target size. Setting +/// `matchGreedily` as `true` will append all neighboring unit dimensions +/// (dimensions of 1) to the match. +static FailureOr +findReassociationRangeForSize(ArrayRef sourceShape, + int64_t sourceStartIdx, int64_t targetSize, + bool matchGreedily = false) { + const unsigned numSourceDims = sourceShape.size(); + ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1}; + std::optional resultRange = std::nullopt; + + ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx}; + int64_t prodOfCollapsedDims = 1; + while (iterationRange.isInRange(sourceShapeAsRange)) { + int64_t sourceSize = sourceShape[iterationRange.rightIdx]; + if (sourceSize == ShapedType::kDynamic) { + // Reassociation for a static dim cannot include a dynamic dim. Reset + // induction variables to essentially restart the loop from the next + // source dimension. 
+ prodOfCollapsedDims = 1; + iterationRange = {iterationRange.rightIdx + 1, + iterationRange.rightIdx + 1}; + continue; + } + prodOfCollapsedDims *= sourceSize; + // If the target size has been exceeded without matching, we need to shift + // the range start right. From the start of the range, roll back the + // multiplication until the target size exceeds the product again. + while (prodOfCollapsedDims > targetSize && + !iterationRange.containsSingleIndex()) { + int64_t frontSourceSize = sourceShape[iterationRange.leftIdx]; + prodOfCollapsedDims /= frontSourceSize; + // Shrink the range rightwards + iterationRange.leftIdx++; + } + // We could've reached the target size with the current dimension, + // also as a result of the above shift to right. + if (prodOfCollapsedDims == targetSize) { + resultRange = iterationRange; + break; + } + // Increment the iteration range + iterationRange.rightIdx++; + } + if (!resultRange) + return failure(); + if (matchGreedily) { + // We now want to collect all unit dimensions directly after the target + // product match. Advance the iterator to avoid OOB when the product match + // happens at the last element. + iterationRange.rightIdx++; + while (iterationRange.isInRange(sourceShapeAsRange) && + sourceShape[iterationRange.rightIdx] == 1) { + resultRange = iterationRange; + iterationRange.rightIdx++; + } + } + return *resultRange; +} + +/// Attempts to find a valid collapsing reassociation of `sourceShape` into +/// `targetShape` through a simple traversal. If successful, an array of source +/// index ranges is returned, correspondingly to each dimension in the target +/// shape. The resulting indices shall fully cover the `sourceShape` without +/// overlaps. +/// +/// The algorithm is essentially a lazy one, searching for non-greedy matches - +/// it will only yield a greedy match for the last target dimension. 
+/// FIXME: The algorithm can only backtrack when it needs to append an offset +/// for a static target dimension to the preceding dynamic one (this retains the +/// linear complexity). As feasible, consider adding further backtracking +/// routines to enable more reassociations, e.g.: +/// - ?x2x?x2 into ?x2 +static FailureOr> +findReassociationRangesForCollapse(ArrayRef sourceShape, + ArrayRef targetShape) { + unsigned numSourceDims = sourceShape.size(), + numTargetDims = targetShape.size(); + assert(numSourceDims > numTargetDims); + ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1}; + + SmallVector reassocRanges; + reassocRanges.reserve(numTargetDims); + // We'll iterate in strides of 2 to enable pseudo-backtracking for simple + // cases, e.g.: + // - ?x2x3x5 into ?x15 + std::optional prevTargetSize = std::nullopt; + for (unsigned targetDimIdx = 0, sourceDimIdx = 0; + targetDimIdx < numTargetDims; ++targetDimIdx) { + int64_t targetSize = targetShape[targetDimIdx]; + // Simply check if there are any subsequent target dimensions left - if not, + // the match must be made greedily. + bool shouldMatchGreedily = targetDimIdx == numTargetDims - 1; + FailureOr sourceRange; + if (targetSize == ShapedType::kDynamic) { + sourceRange = findReassociationRangeForDynamicDim( + sourceShape, sourceDimIdx, shouldMatchGreedily); + } else { + sourceRange = findReassociationRangeForSize( + sourceShape, sourceDimIdx, targetSize, shouldMatchGreedily); + } + + // Run sanity checks on the returned index range. + if (failed(sourceRange) || failed(sourceRange->verify()) || + !sourceRange->isInRange(sourceShapeAsRange)) + return failure(); + if (sourceRange->leftIdx > sourceDimIdx) { + // If some source dimensions had to be skipped in order to find a match, + // they must be collapsed into the directly preceding dynamic dimension. 
+ if (!prevTargetSize || prevTargetSize != ShapedType::kDynamic) + return failure(); + reassocRanges.back().rightIdx = sourceRange->leftIdx - 1; + } + + // Store the gathered information as required for the next iteration. + prevTargetSize = targetSize; + sourceDimIdx = sourceRange->rightIdx + 1; + reassocRanges.push_back(*sourceRange); + } + // Fail if the source shape wasn't a full match for the target shape. We only + // need to check the last recorded index - any other gaps should have been + // mended by the main loop. + if (reassocRanges.back().rightIdx < sourceShapeAsRange.rightIdx) + return failure(); + return reassocRanges; +} + +/// A variant of `findReassociationRangesForCollapse(...)` that can also scan +/// the shapes right-to-left. +static FailureOr> +findReassociationRangesForCollapse(ArrayRef sourceShape, + ArrayRef targetShape, + bool iterateRightToLeft) { + if (!iterateRightToLeft) + return findReassociationRangesForCollapse(sourceShape, targetShape); + // NB: To iterate right-to-left, we currently reverse the shapes and then + // reverse the result back. The reversed shapes must not be temporary, as + // we're passing through an ArrayRef. + // FIXME: It would be preferable to avoid the expensive copies. At the moment, + // this approach is chosen for readability of the main implementation. + std::vector sourceToReverse = sourceShape.vec(), + targetToReverse = targetShape.vec(); + std::reverse(sourceToReverse.begin(), sourceToReverse.end()); + std::reverse(targetToReverse.begin(), targetToReverse.end()); + auto invertedRanges = + findReassociationRangesForCollapse(sourceToReverse, targetToReverse); + if (failed(invertedRanges)) + return failure(); + SmallVector &rangesToInvert = *invertedRanges; + unsigned numSourceDims = sourceShape.size(); + // We have received the ranges for inverted shapes. Now we have to invert + // the ranges back to correspond with the original source shape. 
+ for (auto &range : rangesToInvert) { + int64_t invLeftIdx = range.leftIdx, invRightIdx = range.rightIdx; + range.leftIdx = numSourceDims - 1 - invRightIdx; + range.rightIdx = numSourceDims - 1 - invLeftIdx; + } + // Also invert the ordering of the ranges to correspond with the original + // target shape. + std::reverse(rangesToInvert.begin(), rangesToInvert.end()); + return rangesToInvert; +} + std::optional> mlir::getReassociationIndicesForCollapse(ArrayRef sourceShape, ArrayRef targetShape) { - if (sourceShape.size() <= targetShape.size()) + unsigned numSourceDims = sourceShape.size(), + numTargetDims = targetShape.size(); + // We're supposed to search for a collapsing reassociation. If the sizes + // match, there's no actual collapsing taking place - it's either a no-op or a + // `tensor.reshape`-style reassociation (that would be beyond the scope of + // this utility). + if (numSourceDims <= numTargetDims) return std::nullopt; - unsigned sourceDim = 0; - SmallVector reassociationMap; - reassociationMap.reserve(targetShape.size()); - - ReassociationIndices currIndices; - int64_t prodOfCollapsedDims = 1; - while (sourceDim < sourceShape.size()) { - unsigned targetDim = reassociationMap.size(); - // If we have mapped all the target dimensions stop and handle the remaining - // tail of size-1 dimensions explicitly. - if (targetDim == targetShape.size()) - break; - - int64_t currTargetShape = targetShape[targetDim]; - while (sourceDim < (sourceShape.size() - 1) && - sourceShape[sourceDim] != ShapedType::kDynamic && - prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) { - prodOfCollapsedDims *= sourceShape[sourceDim]; - currIndices.push_back(sourceDim++); + // Early handling for scalar target types. 
+ if (numTargetDims == 0) { + ReassociationIndices allSourceIndices; + allSourceIndices.reserve(numSourceDims); + for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims; + ++sourceDimIdx) { + int64_t sourceSize = sourceShape[sourceDimIdx]; + // All source dimensions must be unit or dynamic. + if (sourceSize != 1 && sourceSize != ShapedType::kDynamic) + return std::nullopt; + allSourceIndices.push_back(sourceDimIdx); } - - // If the current expanded dimension is dynamic, then the collapsed - // dimensions should also be dynamic and product of all previous unprocessed - // dimensions of the expanded shape should be 1. - if (sourceShape[sourceDim] == ShapedType::kDynamic && - (currTargetShape != ShapedType::kDynamic || prodOfCollapsedDims != 1)) - return std::nullopt; - - // If the collapsed dim is dynamic, the current expanded dim should also - // be dynamic. - if (currTargetShape == ShapedType::kDynamic && - sourceShape[sourceDim] != ShapedType::kDynamic) - return std::nullopt; - - // For static shapes, if the product of dimensions of the expanded shape - // should match the collapsed dimension shape. - if (prodOfCollapsedDims * sourceShape[sourceDim] != currTargetShape) - return std::nullopt; - - currIndices.push_back(sourceDim++); - reassociationMap.emplace_back(ReassociationIndices{}); - std::swap(reassociationMap.back(), currIndices); - prodOfCollapsedDims = 1; + return SmallVector{allSourceIndices}; } - // All the dimensions in the target must have been processed. - if (reassociationMap.size() != targetShape.size()) + + // Collect source ranges by iterating over the target shape left-to-right. + FailureOr> maybeForwardRanges = + findReassociationRangesForCollapse(sourceShape, targetShape); + if (failed(maybeForwardRanges)) return std::nullopt; - // Process any remaining entries in the source shape. They all need to be - // 1 or dynamic. 
- for (; sourceDim < sourceShape.size(); sourceDim++) { - if (sourceShape[sourceDim] != ShapedType::kDynamic && - sourceShape[sourceDim] != 1) - return std::nullopt; - // The map is empty when the target type is a scalar. - if (!reassociationMap.empty()) - reassociationMap.back().push_back(sourceDim); + auto &ranges = *maybeForwardRanges; + // Now do the same in reverse. We need to get another valid reassociation + // through some other strategy, and then compare the results in order to + // disambiguate mixed subshapes, such as: + // ?x?x? into ?x?, ?x2x? into ?x?, ?x2x3x6x? into ?x6x? + // This leads us to lose some of the reassociation opportunities that can only + // be found by iterating in a certain direction, e.g. 2x2x? into 2x? - without + // backtracking, the algorithm will fail right-to-left. However, this is the + // best way to preserve correctness. + FailureOr> maybeReverseRanges = + findReassociationRangesForCollapse(sourceShape, targetShape, + /*iterateRightToLeft=*/true); + if (failed(maybeReverseRanges)) + return std::nullopt; + auto &reverseRanges = *maybeReverseRanges; + + if (ranges.size() != numTargetDims || reverseRanges.size() != numTargetDims) + return std::nullopt; + // Now we can check for ambiguity of each target dimension's reassociation. If + // successful, we put the full indices into our result map for the target + // shape. + SmallVector reassociationMap(numTargetDims); + for (unsigned targetDimIdx = 0; targetDimIdx < numTargetDims; + ++targetDimIdx) { + ReassociationIndexRange &range = ranges[targetDimIdx]; + ReassociationIndexRange &reverseRange = reverseRanges[targetDimIdx]; + // Get non-overlapping indices between the ranges + ReassociationIndices nonMatchingIndices = + range.getNonOverlappingIndicesWith(reverseRange); + // Unit dimensions can be collapsed wherever - this is the only ambiguity + // that we allow. 
+ for (int64_t sourceDimIdx : nonMatchingIndices) { + if (sourceShape[sourceDimIdx] != 1) + return std::nullopt; + } + reassociationMap[targetDimIdx] = range.getFullIndices(); } return reassociationMap; } diff --git a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir index 51350e5bc849..6979770154ba 100644 --- a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir +++ b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir @@ -158,8 +158,8 @@ func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { // ----- // CHECK-LABEL: func.func @unpack_dynamic -// CHECK-NOT: tensor.collapse -// CHECK: linalg.unpack +// CHECK: tensor.collapse +// CHECK-NOT: linalg.unpack func.func @unpack_dynamic(%arg0: tensor) -> tensor { %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 67b03b0a3485..3251c5a4a2bf 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -1101,7 +1101,7 @@ func.func @fold_expand_of_collapse(%arg0 : tensor<3x4x4xf32>) -> tensor<3x4x4xf3 // ----- -func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: index, %arg2: index) +func.func @fold_expand_of_collapse_mixed_subshape(%arg0 : tensor, %arg1: index, %arg2: index) -> tensor { %0 = tensor.collapse_shape %arg0 [[0, 1], [2]] : tensor into tensor @@ -1109,12 +1109,28 @@ func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: ind : tensor into tensor return %1 : tensor } -// CHECK-LABEL: @fold_expand_of_collapse_dynamic +// CHECK-LABEL: @fold_expand_of_collapse_mixed_subshape // CHECK-NOT: tensor.{{.*}}_shape // ----- -func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: index, %arg2: index, %arg3: index) +func.func @fold_expand_of_collapse_mixed_target_subshape(%arg0 : tensor, %arg1: index, %arg2: index) + -> tensor { + %0 = 
tensor.collapse_shape %arg0 [[0, 1], [2, 3]] + : tensor into tensor + %1 = tensor.expand_shape %0 [[0, 1], [2]] output_shape [%arg1, 4, %arg2] + : tensor into tensor + return %1 : tensor +} +// CHECK-LABEL: @fold_expand_of_collapse_mixed_target_subshape +// CHECK-NOT: tensor.expand_shape +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %arg0 {{\[}}[0], [1], [2, 3]] +// CHECK-SAME: : tensor into tensor +// CHECK-NEXT: return %[[COLLAPSE]] + +// ----- + +func.func @no_fold_expand_of_collapse_fully_dynamic(%arg0 : tensor, %arg1: index, %arg2: index, %arg3: index) -> tensor { %0 = tensor.collapse_shape %arg0 [[0, 1], [2]] : tensor into tensor @@ -1122,7 +1138,22 @@ func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: : tensor into tensor return %1 : tensor } -// CHECK-LABEL: @no_fold_expand_of_collapse_dynamic +// CHECK-LABEL: @no_fold_expand_of_collapse_fully_dynamic +// CHECK: tensor.collapse_shape +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape +// CHECK: return %[[EXPAND]] + +// ----- + +func.func @no_fold_expand_of_collapse_adjacent_dynamic(%arg0 : tensor, %arg1: index, %arg2: index) + -> tensor { + %0 = tensor.collapse_shape %arg0 [[0, 1, 2]] + : tensor into tensor + %1 = tensor.expand_shape %0 [[0, 1]] output_shape [%arg1, %arg2] + : tensor into tensor + return %1 : tensor +} +// CHECK-LABEL: @no_fold_expand_of_collapse_adjacent_dynamic // CHECK: tensor.collapse_shape // CHECK: %[[EXPAND:.+]] = tensor.expand_shape // CHECK: return %[[EXPAND]] diff --git a/mlir/unittests/Dialect/Utils/CMakeLists.txt b/mlir/unittests/Dialect/Utils/CMakeLists.txt index 61b9cdcb3b8f..e921c8bcfb4e 100644 --- a/mlir/unittests/Dialect/Utils/CMakeLists.txt +++ b/mlir/unittests/Dialect/Utils/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIRDialectUtilsTests StructuredOpsUtilsTest.cpp + ReshapeOpsUtilsTest.cpp IndexingUtilsTest.cpp ) mlir_target_link_libraries(MLIRDialectUtilsTests diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp 
b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp new file mode 100644 index 000000000000..db1a87a4de2d --- /dev/null +++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp @@ -0,0 +1,203 @@ +//===- ReshapeOpsUtilsTest.cpp - ReshapeOpsUtils unit tests ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "llvm/ADT/STLExtras.h" +#include "gtest/gtest.h" +#include + +using namespace mlir; + +/// Helper to make constructing +/// `std::optional>` more readable. +static std::optional> +makeOptionalIndices(std::initializer_list list) { + return std::optional>(list); +} + +TEST(ReassociationIndicesForCollapse, ScalarTest) { + EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}), + makeOptionalIndices({{0}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}), + makeOptionalIndices({{0}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, + ShapedType::kDynamic, 1, + ShapedType::kDynamic}, + {}), + makeOptionalIndices({{0, 1, 2, 3, 4}})); +} + +TEST(ReassociationIndicesForCollapse, ScalarTestFailure) { + EXPECT_EQ(getReassociationIndicesForCollapse({}, {}), std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({}, {1}), std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({2}, {}), std::nullopt); + EXPECT_EQ( + getReassociationIndicesForCollapse({1, 2, ShapedType::kDynamic, 1}, {}), + std::nullopt); +} + +TEST(ReassociationIndicesForCollapse, StaticTest) { + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {200}), + makeOptionalIndices({{0, 
1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {10, 600}), + makeOptionalIndices({{0}, {1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 30}), + makeOptionalIndices({{0, 1}, {2}})); +} + +TEST(ReassociationIndicesForCollapse, StaticTestFailure) { + // No-op reassociation + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10, 20}), + std::nullopt); + // Invalid static reassociations + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10}), std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 300}), + std::nullopt); + // Non-collapsing (expanding) reassociation + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {1, 10, 20, 30}), + std::nullopt); +} + +TEST(ReassociationIndicesForCollapse, StaticTestUnitDims) { + EXPECT_EQ(getReassociationIndicesForCollapse({10, 1}, {10}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 20, 30}, {600}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1}, {1}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1, 1}, {1, 1, 1}), + makeOptionalIndices({{0}, {1}, {2, 3}})); +} + +TEST(ReassociationIndicesForCollapse, DynamicTest) { + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1, 1}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, 1, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}, {2, 3, 4}})); + EXPECT_EQ( + getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic}, {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, 
ShapedType::kDynamic, ShapedType::kDynamic}, + {1, ShapedType::kDynamic}), + makeOptionalIndices({{0}, {1, 2}})); + + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, ShapedType::kDynamic}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 1, 2, ShapedType::kDynamic, 10}, + {ShapedType::kDynamic, 10}), + makeOptionalIndices({{0, 1, 2, 3}, {4}})); + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20}, + {ShapedType::kDynamic, 20}), + makeOptionalIndices({{0, 1}, {2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic, 20}, + {ShapedType::kDynamic, 20}), + makeOptionalIndices({{0, 1}, {2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 3, 2, 5, 2}, {ShapedType::kDynamic, 20}), + makeOptionalIndices({{0, 1}, {2, 3, 4}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {10, ShapedType::kDynamic, 20, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic, 20, ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}, {2}, {3, 4}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0}, {1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}, {2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 1, 
ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0}, {1, 2}})); +} + +TEST(ReassociationIndicesForCollapse, DynamicTestFailure) { + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20}, + {ShapedType::kDynamic, 10}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {20, ShapedType::kDynamic, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 5, 3, 2, 2}, {ShapedType::kDynamic, 20}), + std::nullopt); + EXPECT_EQ( + getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic, 10, 1, + ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, 10, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, 2, 2, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 3, 4, 3, ShapedType::kDynamic}, + {ShapedType::kDynamic, 12, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 8, 4, 2, 16, ShapedType::kDynamic}, + {ShapedType::kDynamic, 32, ShapedType::kDynamic}), + std::nullopt); + + //===----------------------------------------------------------------------===// + // TODO: Reassociation for the following 
examples can be computed, but isn't + // supported by `getReassociationIndicesForCollapse`. + //===----------------------------------------------------------------------===// + + // TODO: Fails because there's no backtracking when some source dimensions + // remain unmatched at either edge. + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, ShapedType::kDynamic, 10}, + {ShapedType::kDynamic, 10}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 2, 2}, + {1, ShapedType::kDynamic, 2}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({2, 2, ShapedType::kDynamic, 1}, + {2, ShapedType::kDynamic}), + std::nullopt); +} From edaac11df3f82268e8ca34bf34b3e9d115b7d475 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 12 Jun 2025 09:29:41 +0100 Subject: [PATCH 0103/1322] [X86] combineSelect - attempt to combine with shuffles (#143753) Before legalization we will convert to a vector_shuffle node - but afterward we can try to combine the select into an existing target shuffle chain --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 +- .../CodeGen/X86/combine-mask-with-shuffle.ll | 32 +- llvm/test/CodeGen/X86/pr132844.ll | 11 +- .../vector-interleaved-load-i8-stride-7.ll | 1166 ++++--- .../vector-interleaved-store-i16-stride-8.ll | 2864 ++++++++--------- .../vector-interleaved-store-i8-stride-5.ll | 30 +- .../vector-interleaved-store-i8-stride-6.ll | 2026 ++++++------ .../vector-interleaved-store-i8-stride-7.ll | 231 +- .../vector-interleaved-store-i8-stride-8.ll | 1096 +++---- .../X86/vector-shuffle-combining-avx512f.ll | 40 +- 10 files changed, 3610 insertions(+), 3902 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 96714adf78e4..b0553aa4b819 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -47785,13 +47785,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, DL, 
DAG, Subtarget)) return V; - // Convert vselects with constant condition into shuffles. - if (CondConstantVector && DCI.isBeforeLegalizeOps() && - (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) { + if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) { SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond, - N->getOpcode() == X86ISD::BLENDV)) - return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); + N->getOpcode() == X86ISD::BLENDV)) { + // Convert vselects with constant condition into shuffles. + if (DCI.isBeforeLegalizeOps()) + return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); + + // Attempt to combine as shuffle. + SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } } // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y)) diff --git a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll index 268ac3dd31b8..7564e65a428b 100644 --- a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll +++ b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll @@ -67,11 +67,9 @@ define <16 x i32> @combine_mask_with_abs(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_umin: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpminud %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpminud %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} @@ -88,11 +86,9 @@ define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) { define <16 
x i32> @combine_mask_with_umax(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_umax: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpmaxud %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpmaxud %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} @@ -109,11 +105,9 @@ define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_smin: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpminsd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpminsd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} @@ -130,11 +124,9 @@ define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_smax(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_smax: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = 
zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpmaxsd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/pr132844.ll b/llvm/test/CodeGen/X86/pr132844.ll index ded100b2accc..dc9f006d93d1 100644 --- a/llvm/test/CodeGen/X86/pr132844.ll +++ b/llvm/test/CodeGen/X86/pr132844.ll @@ -4,12 +4,11 @@ define { ptr, i8 } @PR132844(<4 x ptr> %0, <4 x ptr> %1) { ; CHECK-LABEL: PR132844: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: movb $10, %al -; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vinserti64x2 $1, 16, %ymm2, %ymm0 {%k1} -; CHECK-NEXT: vmovdqu %ymm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vinsertf128 $1, 16, %ymm2, %ymm2 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index c132c5ea2ef4..82481269022b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -13723,364 +13723,361 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = 
[24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-FCP-NEXT: 
vextracti128 $1, %ymm1, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: kmovq %k1, %k3 -; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm9 {%k6} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 ; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm1 
{%k5} +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm8, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm11 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm11, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k7 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm21 {%k7} +; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm18 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm22 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = 
xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm0, %ymm22 ; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm23 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm7 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq 
%xmm23, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [0,0,0,0,1,3,4,6] +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm23 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm8 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = 
zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm13, %xmm23 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} 
ymm23 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm25 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm24, %xmm25, %xmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 ; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm24, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: 
vporq %xmm15, %xmm19, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} -; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k4} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm19 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 
%ymm18, %ymm19 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm19 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm18 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm17 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm18 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm17 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = 
xmm17[u,u,u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm17 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm16 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm17 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm16, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm17, %zmm17 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, 
%zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 -; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm18 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,0,7,14],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm18 {%k7} +; AVX512BW-FCP-NEXT: 
vpblendmw %ymm10, %ymm9, %ymm17 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm19 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm19 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, 
%ymm15 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm19 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm16, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm16 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u] +; 
AVX512BW-FCP-NEXT: vporq %xmm21, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm17 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm22, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm21 {%k7} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm22, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm16 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm21 {%k6} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,xmm22[3,10],zero,zero,zero,xmm22[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u,5,12],zero,zero,xmm21[1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm18 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, 
%zmm18 +; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm21 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm20 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm20, %xmm20 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; 
AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] -; AVX512BW-FCP-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 -; 
AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm3 {%k5} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm6 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k7} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm21, %zmm20, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k5} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm7 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm9 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; 
AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm2 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm3 {%k7} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm0 +; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k5} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%r9) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rdi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -14453,362 +14450,359 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm7 ; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 -; 
AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3 -; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm9 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm1 {%k5} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm8, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm11 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} 
xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm11, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm21 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm18 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm22 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm22 ; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm23 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm16[u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,3,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: movl 
$261632, %r10d # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; 
AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,3,5,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k4} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm25 = xmm16[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm25, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 ; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm13, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 
%ymm22, %ymm21 {%k3} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = 
ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm18 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm17 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, 
%ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; 
AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,0,7,14],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm17 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm18 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm19 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; 
AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm18 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm16[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm22, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm21 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm22, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm14 {%k5} +; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm21 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 
$1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm17 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm20 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm20, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: 
vinserti32x4 $1, %xmm9, %ymm0, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] -; 
AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm3 {%k5} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; 
AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm16[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm21, %zmm20, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm9 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: 
vextracti128 $1, %ymm6, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm16[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm3 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k5} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; 
AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 9c9dca82f60c..f626dfe5daf0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -4093,139 +4093,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512-NEXT: vmovdqa (%r10), %xmm0 +; AVX512-NEXT: vmovdqa (%rax), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm23 
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: movb $-86, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512-NEXT: kmovw %r11d, %k2 -; AVX512-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} -; AVX512-NEXT: vmovdqa 32(%r10), %ymm15 -; AVX512-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] -; AVX512-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = 
ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-NEXT: vpermd %zmm6, %zmm20, %zmm14 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX512-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] -; AVX512-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX512-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX512-NEXT: vpermd %zmm12, %zmm19, %zmm17 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX512-NEXT: 
vmovdqa 32(%rcx), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa 32(%r10), %ymm5 +; AVX512-NEXT: vmovdqa 32(%rax), %ymm10 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm5 +; AVX512-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] +; AVX512-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512-NEXT: vpermd %zmm4, %zmm26, %zmm23 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm4, %zmm28, %zmm22 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} -; AVX512-NEXT: vpunpckhwd 
{{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512-NEXT: vpermd %zmm6, %zmm26, %zmm25 -; AVX512-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpermd %zmm0, %zmm28, %zmm24 -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} -; AVX512-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-NEXT: vmovdqa (%rax), %ymm1 -; AVX512-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa (%r10), %ymm8 +; AVX512-NEXT: vmovdqa (%rax), %ymm7 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX512-NEXT: vmovdqa (%r9), %ymm3 ; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512-NEXT: vpermd %zmm6, %zmm19, %zmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vpermd %zmm2, %zmm19, %zmm2 -; AVX512-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] -; AVX512-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX512-NEXT: vpermd %zmm4, %zmm20, %zmm4 -; AVX512-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512-NEXT: vpermd %zmm5, %zmm26, %zmm5 -; AVX512-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} -; AVX512-NEXT: movb $-86, %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] +; AVX512-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm10 +; AVX512-NEXT: vmovdqa64 
%zmm6, %zmm10 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512-NEXT: 
vmovdqa64 %zmm22, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -4234,139 +4220,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25 -; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; 
AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm19 +; AVX512-FCP-NEXT: movb $-86, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512-FCP-NEXT: kmovw %r11d, %k2 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512-FCP-NEXT: 
vmovdqa64 %zmm5, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm11 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = 
ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm16, %zmm28 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm15 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512-FCP-NEXT: 
vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm31 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; AVX512-FCP-NEXT: 
vpermd %zmm0, %zmm26, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm20 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm26, %zmm23 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm22 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm22 {%k2} -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm13, %zmm18, %zmm13 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm15 -; 
AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm19, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm18, %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm3 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm4 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm4 {%k2} -; AVX512-FCP-NEXT: movb $-86, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm20, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -4374,139 +4344,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-NEXT: 
vpermt2d %zmm10, %zmm17, %zmm9 +; AVX512DQ-NEXT: movb $-86, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512DQ-NEXT: kmovw %r11d, %k2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} -; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm15 -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; 
AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm14 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] -; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX512DQ-NEXT: vpermd %zmm12, %zmm19, %zmm17 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm5 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm10 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13 +; 
AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] +; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-NEXT: vmovdqa 
32(%r9), %xmm4 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm26, %zmm23 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm28, %zmm22 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm28, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} -; 
AVX512DQ-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm7 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm19, %zmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm19, %zmm2 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] -; AVX512DQ-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm20, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512DQ-NEXT: vpermd %zmm5, %zmm26, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} -; 
AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} -; AVX512DQ-NEXT: movb $-86, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; 
AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 
320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4515,139 +4471,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25 -; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; 
AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm19 +; AVX512DQ-FCP-NEXT: movb $-86, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = 
ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512DQ-FCP-NEXT: vmovdqa 
32(%rax), %xmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm16, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = 
ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; 
AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm26, %zmm23 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm22 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; 
AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; 
AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm18, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm19, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm4 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: movb $-86, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm20, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -7777,1095 +7717,959 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512-NEXT: subq $392, 
%rsp # imm = 0x188 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%r10), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 64(%r10), %xmm4 -; AVX512-NEXT: vmovdqa (%rax), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512-NEXT: vmovdqa (%r9), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %xmm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512-NEXT: kmovw %r11d, %k2 -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512-NEXT: vmovdqa (%r10), %xmm1 +; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512-NEXT: vmovdqa (%r9), %xmm2 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512-NEXT: movb $-86, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vmovdqa 96(%r10), %ymm2 -; AVX512-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512-NEXT: vmovdqa 96(%r9), %ymm8 -; AVX512-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 96(%rcx), %ymm10 -; AVX512-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512-NEXT: vmovdqa 96(%rax), %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512-NEXT: vpunpcklwd 
{{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-NEXT: vpermd %zmm1, %zmm16, %zmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512-NEXT: vpermd %zmm5, %zmm19, %zmm0 -; AVX512-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512-NEXT: vpermd %zmm5, %zmm16, %zmm31 -; AVX512-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512-NEXT: vmovdqa 96(%rax), %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-NEXT: vpermd %zmm12, %zmm30, %zmm0 -; AVX512-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpermd %zmm8, %zmm30, %zmm0 -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512-NEXT: vpermd %zmm12, %zmm19, %zmm0 -; AVX512-NEXT: vpermd %zmm8, %zmm18, 
%zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512-NEXT: vpunpcklwd 
{{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] ; AVX512-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 -; 
AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-NEXT: vpermd %zmm8, %zmm16, %zmm26 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX512-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} -; AVX512-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX512-NEXT: vmovdqa 64(%r8), %xmm9 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512-NEXT: vpermd %zmm10, %zmm19, %zmm5 -; AVX512-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, 
%zmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-NEXT: vpermd %zmm0, %zmm16, %zmm24 -; AVX512-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512-NEXT: vpermd %zmm1, %zmm30, %zmm1 -; AVX512-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm9, %zmm19, %zmm28 -; AVX512-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] -; AVX512-NEXT: vpermd %zmm3, %zmm16, %zmm23 -; AVX512-NEXT: 
vmovdqa 32(%rcx), %ymm12 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] -; AVX512-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX512-NEXT: vpermd %zmm1, %zmm19, %zmm25 -; AVX512-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] -; AVX512-NEXT: vpermd %zmm0, %zmm16, %zmm21 -; AVX512-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa (%r10), %ymm3 -; AVX512-NEXT: vmovdqa (%r9), %ymm4 -; AVX512-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm8, %zmm19, %zmm27 -; AVX512-NEXT: vmovdqa (%rax), %ymm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] -; AVX512-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] 
-; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] -; AVX512-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-NEXT: vpermd %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm8, %zmm16, %zmm18 -; AVX512-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512-NEXT: vpermd %zmm3, %zmm16, %zmm16 +; AVX512-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%r10), %xmm7 +; AVX512-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512-NEXT: 
vmovdqa 64(%rsi), %xmm14 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-NEXT: vmovdqa 32(%r10), %xmm10 ; AVX512-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512-NEXT: vpermd %zmm10, %zmm30, %zmm19 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-NEXT: vpermd %zmm2, %zmm30, %zmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512-NEXT: vmovdqa (%rdx), %xmm13 -; 
AVX512-NEXT: vpermd %zmm4, %zmm30, %zmm17 -; AVX512-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX512-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX512-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512-NEXT: vpermd %zmm6, %zmm30, %zmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm14, %zmm30, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm5 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa 
64(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm3, %zmm29, %zmm4 -; AVX512-NEXT: vmovdqa 64(%rcx), %xmm14 -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm3 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm14, %zmm29, %zmm14 -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm1 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} -; AVX512-NEXT: movb $-86, %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; 
AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa (%r10), %ymm10 +; AVX512-NEXT: vmovdqa (%rax), %ymm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm7 +; AVX512-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 
448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; 
AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm1 -; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512-FCP-NEXT: kmovw %r11d, %k2 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} 
xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm20 +; AVX512-FCP-NEXT: movb $-86, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm15 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm22, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm20, %zmm10 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm13 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 
%zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512-FCP-NEXT: vpermt2d %zmm12, 
%zmm18, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm12 ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = 
ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm16 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm19 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm19 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm18 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm25 -; 
AVX512-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm25 {%k2} -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm24 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm24 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm27 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm27 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm26 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm26 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm23, %zmm28 -; 
AVX512-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm28 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm23 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm21, %zmm22 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rax), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm21 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; 
AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm29 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm30 -; AVX512-FCP-NEXT: vmovdqa 64(%r10), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm30 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm14, %zmm6 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm2 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm5 -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm12 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 
= xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm11 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm11 {%k1} -; AVX512-FCP-NEXT: movb $-86, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 64(%r10), %xmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm13 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] 
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm4, 
%zmm17, %zmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512-FCP-NEXT: 
vmovdqa64 %zmm18, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512DQ-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r10), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512DQ-NEXT: kmovw %r11d, %k2 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: movb $-86, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vmovdqa 96(%r10), %ymm2 -; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm8 -; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm10 -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512DQ-NEXT: vpermd %zmm5, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512DQ-NEXT: vpunpckhwd 
{{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512DQ-NEXT: vpermd %zmm5, %zmm16, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512DQ-NEXT: vmovdqa 96(%rax), %xmm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-NEXT: vpermd %zmm12, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-NEXT: vpermd %zmm12, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm12 -; 
AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512DQ-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; 
AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} 
ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm26 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm9 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512DQ-NEXT: vpermd %zmm10, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm5, 
%zmm18, %zmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm16, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm30, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] -; 
AVX512DQ-NEXT: vpermd %zmm3, %zmm16, %zmm23 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm16, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r10), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm27 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm18 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r10), %xmm7 +; AVX512DQ-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = 
ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm18, %zmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm10 ; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm30, %zmm19 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm30, %zmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} 
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm30, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm30, %zmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpermd 
%zmm14, %zmm30, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm29, %zmm4 -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm14 -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512DQ-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm14, %zmm29, %zmm14 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512DQ-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 
%xmm22, %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} -; AVX512DQ-NEXT: movb $-86, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm10 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; 
AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, 
%zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, 
%xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: movb $-86, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm22, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = 
ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm13 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = 
[0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 
96(%rax), %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm19 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm18 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm25 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm27 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm27 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm26 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm23, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm28 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: 
vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm21, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX512DQ-FCP-NEXT: 
vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm29 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; 
AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm14, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd 
{{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; 
AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: movb $-86, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), 
%ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512DQ-FCP-NEXT: 
vmovdqa 32(%r8), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), 
%ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 86efcf9c5761..ad9db98711a6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -1190,8 +1190,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero ; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] ; 
AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -1233,8 +1232,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -1461,20 +1459,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero ; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero ; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vporq %zmm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 @@ -1531,20 +1527,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; 
AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vporq %zmm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movabsq 
$595056260442243600, %rax # imm = 0x842108421084210 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 6d499e17bfbc..03f5b90002d3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -2996,94 +2996,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = 
[5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512BW-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-NEXT: movl $1227114788, %r10d # imm = 0x49244924 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm8 = 
ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512BW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512BW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512BW-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512BW-NEXT: kmovd %r10d, %k2 +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm11 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm13, %zmm14 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 -; AVX512BW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm13, %zmm16 -; AVX512BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512BW-NEXT: vmovdqa (%r9), %xmm10 +; 
AVX512BW-NEXT: vpshufb %xmm9, %xmm10, %xmm13 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512BW-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -3092,93 +3086,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; 
AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: movl $1227114788, %r10d # imm = 0x49244924 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512BW-FCP-NEXT: vpunpcklbw 
{{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FCP-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm10 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = 
[1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermi2w %zmm10, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm10, %zmm14 -; AVX512BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm13 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm10, %zmm16 -; AVX512BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; 
AVX512BW-FCP-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512BW-FCP-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-FCP-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; 
AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-FCP-NEXT: 
vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -3187,94 +3176,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: movl $1227114788, %r10d # imm = 0x49244924 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm8 = 
ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512DQ-BW-NEXT: vpunpcklbw 
{{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512DQ-BW-NEXT: vpunpckhbw 
{{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; 
AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm13, %zmm16 -; AVX512DQ-BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; 
AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm10, %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; 
AVX512DQ-BW-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -3283,93 +3266,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm2, 
%ymm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; 
AVX512DQ-BW-FCP-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movl $1227114788, %r10d # imm = 0x49244924 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw 
{{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vpermq 
{{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm10, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm10, %zmm14 -; AVX512DQ-BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb 
%xmm9, %xmm10, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm10, %zmm16 -; AVX512DQ-BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = 
[2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -6368,726 +6346,770 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-LABEL: store_i8_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %ymm12, %ymm3, %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm4, %ymm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = 
[6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512BW-NEXT: vpshufb %zmm14, %zmm5, %zmm5 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; 
AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] ; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512BW-NEXT: kmovd %r10d, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512BW-NEXT: vpermt2w %ymm8, %ymm13, %ymm12 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm20, %zmm14 +; AVX512BW-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512BW-NEXT: kmovq %r10, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm3 {%k2} +; AVX512BW-NEXT: vpshufb %xmm5, %xmm15, %xmm14 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm18, %xmm5 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = 
xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm14 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm17, %xmm7 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-NEXT: vprold $16, %xmm7, %xmm7 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] +; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512BW-NEXT: vpermi2w %ymm7, %ymm14, %ymm13 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm13 +; AVX512BW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = 
ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm18, %zmm15 +; AVX512BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k2} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: movabsq 
$-9076969306111049208, %r10 # imm = 0x8208208208208208 -; AVX512BW-NEXT: kmovq %r10, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm10, %ymm6 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX512BW-NEXT: vpermw %ymm6, %ymm8, %ymm6 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm17, %ymm5 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm19, %ymm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31] -; AVX512BW-NEXT: vpermw %ymm7, %ymm11, %ymm7 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %zmm14, %zmm13, %zmm6 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm21 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512BW-NEXT: vpshufb %xmm20, %xmm8, %xmm14 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm23 
-; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm15, %xmm14 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm24 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm18, %xmm16 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512BW-NEXT: vprold $16, %xmm16, %xmm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm27, %zmm16 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm16 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm27, %zmm26 -; AVX512BW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 -; AVX512BW-NEXT: kmovq %rcx, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm26, %zmm6 {%k3} -; AVX512BW-NEXT: vpshufb %xmm20, %xmm21, %xmm26 -; 
AVX512BW-NEXT: vpshufb %xmm20, %xmm22, %xmm20 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm20 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm23, %xmm25 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm24, %xmm12 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-NEXT: vprold $16, %xmm25, %xmm25 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm25, %zmm12 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k2} -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512BW-NEXT: vmovdqu8 %zmm12, %zmm20 {%k3} -; AVX512BW-NEXT: 
vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm20, %ymm5, %ymm22 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512BW-NEXT: kmovd %r10d, %k3 +; AVX512BW-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-NEXT: vpermt2w %ymm22, %ymm23, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm19, %ymm8, %ymm22 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512BW-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 
+; AVX512BW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512BW-NEXT: kmovq %r10, %k4 +; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm15 {%k4} +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm4 {%k2} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512BW-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %ymm9, %ymm23, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, 
%ymm6, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512BW-NEXT: vpshufb %ymm19, %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm9 +; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k4} +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm11, %ymm9 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm21, %ymm10 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm9, %ymm13, %ymm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm13 -; AVX512BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm13, %ymm11, %ymm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm11 -; AVX512BW-NEXT: movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm12, %ymm18, 
%ymm12 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512BW-NEXT: kmovq %rcx, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512BW-NEXT: vpshufb %ymm6, %ymm16, %ymm1 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-NEXT: vpunpckhbw 
{{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512BW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512BW-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vpshufb %zmm9, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; 
AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm24 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm16, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm18, %zmm13 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm15, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm19, %xmm14 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm28 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm28, %xmm17 -; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-FCP-NEXT: vpermt2w %zmm17, %zmm30, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm22, %xmm31 -; AVX512BW-FCP-NEXT: 
vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm24, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm26, %xmm23 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm29, %xmm25 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm30, %zmm23 -; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm8, %xmm27 -; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero -; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm30, %zmm25 -; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm9, %xmm31 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm30, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm30 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = 
ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm24, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23] -; 
AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm31 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm11 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512BW-FCP-NEXT: vpermt2w %ymm3, %ymm14, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb 
%xmm20, %xmm11, %xmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm12, %xmm4 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512BW-FCP-NEXT: kmovq %r10, %k2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm16, %xmm22 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm17, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm5 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} 
xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512BW-FCP-NEXT: vpermi2w %ymm7, %ymm5, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm4, %xmm7 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm19, %zmm5 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm23, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm17, %ymm19 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm22 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-FCP-NEXT: vpermt2w %ymm22, %ymm23, %ymm17 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm24, %zmm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm11 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: 
vmovdqa 32(%r9), %ymm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm26, %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512BW-FCP-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512BW-FCP-NEXT: kmovq %r10, %k4 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm17 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = 
ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm18, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpermt2w %ymm10, %ymm23, %ymm9 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm25, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm6 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm21, %ymm11 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = 
ymm10[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FCP-NEXT: vpermw %ymm8, %ymm11, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpermq 
{{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512BW-FCP-NEXT: kmovq %rcx, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31] +; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512BW-FCP-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm24, %ymm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512BW-FCP-NEXT: vpshufb %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; 
AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm5, %ymm3 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm30, %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31] -; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm24, %ymm3 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %zmm1, %zmm8, 
%zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm21 {%k3} -; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm21 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm12 {%k5} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm18 {%k5} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm3, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm4, %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = 
[13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512DQ-BW-NEXT: vpshufb %zmm14, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQ-BW-NEXT: 
vpshufb %xmm0, %xmm11, %xmm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] ; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512DQ-BW-NEXT: vpermt2w %ymm8, %ymm13, %ymm12 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm20, %zmm14 +; 
AVX512DQ-BW-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512DQ-BW-NEXT: kmovq %r10, %k2 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm15, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm18, %xmm5 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm17, %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm7, %xmm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512DQ-BW-NEXT: vpermi2w %ymm7, %ymm14, %ymm13 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm7 +; 
AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = 
[4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm18, %zmm15 +; AVX512DQ-BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208 -; AVX512DQ-BW-NEXT: kmovq %r10, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm10, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX512DQ-BW-NEXT: vpermw %ymm6, %ymm8, %ymm6 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm17, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm19, %ymm7 
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm11, %ymm7 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %zmm14, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm21 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm8, %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = 
xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm23 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm18, %xmm16 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm16, %xmm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-NEXT: vpermt2w 
%zmm26, %zmm27, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm16 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm26, %zmm6 {%k3} -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm21, %xmm26 -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm22, %xmm20 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm20 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm23, %xmm25 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm24, %xmm12 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm25, %xmm25 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm25, %zmm12 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm12 = 
xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm12, %zmm20 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm19 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm5, %ymm22 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512DQ-BW-NEXT: kmovd %r10d, 
%k3 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-NEXT: vpermt2w %ymm22, %ymm23, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQ-BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm8, %ymm22 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512DQ-BW-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512DQ-BW-NEXT: kmovq %r10, %k4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm15 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = 
ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %ymm9, %ymm23, %ymm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm11, %ymm9 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm21, %ymm10 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm13, %ymm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm13 -; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm11, %ymm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm11 -; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = 
[12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512DQ-BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; 
AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm16, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm2 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; 
AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vpshufb %zmm9, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} 
zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; 
AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm16, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm18, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm15, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm19, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = 
xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm28, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm17, %zmm30, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm22, %xmm31 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm24, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm26, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = 
xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm29, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm30, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm8, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm30, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm9, %xmm31 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm30, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = 
ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm24, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm31 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = 
ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm3, %ymm14, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm11, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm12, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-FCP-NEXT: 
vpermt2w %zmm4, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm16, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm17, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm7, %ymm5, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm4, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw 
{{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm23, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm17, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm22, %ymm23, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512DQ-BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm24, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm26, %ymm1 -; AVX512DQ-BW-FCP-NEXT: 
vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23] +; 
AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm18, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm10, %ymm23, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm25, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm6 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm21, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = 
ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm8, %ymm11, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = 
zmm2[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm24, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; 
AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm30, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm24, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm1, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm21 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm12 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm18 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} 
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm10 {%k4} -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index f4055a953bad..25e489eef9d1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -925,16 +925,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vmovq %xmm0, 48(%rax) @@ -967,16 +965,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FP-NEXT: vpor %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FP-NEXT: vmovq %xmm0, 48(%rax) @@ -1205,24 +1201,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 -; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, 48(%rax) +; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1283,24 +1276,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2 +; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, 48(%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1824,8 +1814,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero @@ -1903,8 +1892,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 +; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero @@ -2323,19 +2311,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 -; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] -; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512BW-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] @@ -2445,12 +2431,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] -; AVX512BW-FCP-NEXT: vpor %xmm1, 
%xmm2, %xmm1 -; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -2470,19 +2453,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} 
ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] @@ -2592,12 +2573,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -3598,24 +3576,24 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-NEXT: vmovdqa (%r9), %ymm6 +; AVX2-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] ; AVX2-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = 
ymm7[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] ; AVX2-NEXT: # ymm10 = mem[0,1,0,1] @@ -3623,13 +3601,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29] +; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero ; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; 
AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 @@ -3698,68 +3676,67 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; 
AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27] ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero -; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm3 -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero -; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,2,3,0,1,14],zero,ymm5[0,1,0,1,14,15],zero,ymm5[15,16,17,18,19,16],zero,ymm5[30,31,16,17,16,17],zero,ymm5[31,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero +; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18],zero +; AVX2-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm1, 160(%rax) @@ -3905,22 +3882,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero ; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm12, 128(%rax) @@ -4067,22 +4043,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero +; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero ; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; 
AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm10, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index a9da7abaa945..3acc94d6e1fc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -2071,9 +2071,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 -; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpord %zmm6, %zmm9, %zmm4 {%k1} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] ; AVX512BW-NEXT: vpshufb %zmm5, %zmm0, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] @@ -2083,9 +2081,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7] ; AVX512BW-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512BW-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2117,23 +2115,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 -; 
AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2167,9 +2163,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vpord %zmm6, %zmm9, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] ; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] @@ -2179,9 +2173,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7] ; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-BW-NEXT: 
vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2213,23 +2207,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 
64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 @@ -8050,128 +8042,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm19 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm22 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512BW-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vpermw %zmm4, %zmm6, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa (%r10), %xmm4 -; AVX512BW-NEXT: vmovdqa64 48(%r10), %xmm23 -; AVX512BW-NEXT: vmovdqa (%rax), %xmm7 -; AVX512BW-NEXT: vmovdqa64 48(%rax), %xmm24 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm10 -; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm26 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512BW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512BW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vpermw %zmm9, %zmm13, %zmm11 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm15, %zmm6, %zmm9 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-NEXT: vpermw %zmm15, 
%zmm12, %zmm15 -; AVX512BW-NEXT: vpermw %zmm27, %zmm13, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm27 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm28 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm17, %zmm6, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-NEXT: vpermw %zmm19, %zmm12, %zmm19 -; AVX512BW-NEXT: vpermw %zmm17, %zmm13, %zmm19 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = 
xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm22, %zmm6, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512BW-NEXT: vpermw %zmm22, %zmm12, %zmm22 -; AVX512BW-NEXT: vpermw %zmm23, %zmm13, %zmm22 {%k2} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm23 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm21 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm18, %zmm6, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = 
xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512BW-NEXT: vpermw %zmm18, %zmm12, %zmm18 -; AVX512BW-NEXT: vpermw %zmm20, %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm26, %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] -; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm24 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm21, %zmm6, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 16(%rax), %xmm21 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512BW-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512BW-NEXT: vpermw %zmm6, %zmm12, %zmm6 -; AVX512BW-NEXT: vpermw %zmm1, %zmm13, %zmm6 {%k2} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512BW-NEXT: vpermw %zmm2, %zmm12, %zmm2 -; AVX512BW-NEXT: vpermw %zmm1, %zmm13, %zmm2 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512BW-NEXT: vpermw %zmm4, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermw %zmm1, %zmm13, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa (%r10), %xmm1 +; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm16 +; AVX512BW-NEXT: vmovdqa 48(%r10), %xmm14 +; AVX512BW-NEXT: vmovdqa 
(%rax), %xmm3 +; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm17 +; AVX512BW-NEXT: vmovdqa 48(%rax), %xmm15 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm18 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm21 +; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm22 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm25 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512BW-NEXT: vpunpcklbw 
{{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm13 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm12, %zmm11 +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm26 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm27 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm28 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm7, %zmm15 +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm29 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512BW-NEXT: vpermt2w %zmm18, %zmm12, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = 
xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512BW-NEXT: vpermt2w %zmm18, %zmm7, %zmm20 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-NEXT: vpermt2w %zmm22, %zmm12, %zmm18 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm22 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-NEXT: vmovdqa64 16(%rax), %xmm19 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm21 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm24 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm12, %zmm17 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = 
xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm7, %zmm25 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] +; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm22 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm21 +; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm19 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm7 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm5 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; 
AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -8179,172 +8150,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm16 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854] +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm6 = [1284,1798] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 
= xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm18 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25 -; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm24 +; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26 -; 
AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm13 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = 
xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] -; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm28 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm30 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm11 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm15 +; 
AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm25 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm13, %ymm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm25 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm16, %xmm17 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm25, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm25 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm29 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] +; 
AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm27 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31 -; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1} -; 
AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7] -; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24 -; AVX512BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21 +; AVX512BW-FCP-NEXT: vpermt2w %zmm17, %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm31 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm18 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm18, %zmm14, %zmm17 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm24 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; 
AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm18, %ymm18 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm9, %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm14, %zmm23 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm19, %xmm20 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm24 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm22 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, 
%zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23 -; AVX512BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm9, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21 -; AVX512BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm21 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm21, %zmm14, %zmm20 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = 
xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm21, %xmm25 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm27, %ymm25 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23 -; AVX512BW-FCP-NEXT: 
vpshufb %ymm4, %ymm23, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 +; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm22, %xmm24 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm25, %ymm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm9, %zmm22 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm2 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm9 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm4 ; AVX512BW-FCP-NEXT: movw 
$-21846, %ax # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -8352,128 +8324,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: 
movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512DQ-BW-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm22 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vpermw %zmm4, %zmm6, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r10), %xmm23 -; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rax), %xmm24 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), 
%xmm26 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512DQ-BW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-NEXT: vpermw %zmm9, %zmm13, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm15, %zmm6, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512DQ-BW-NEXT: vpermw %zmm15, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpermw %zmm27, %zmm13, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm27 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm28 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm17, %zmm6, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512DQ-BW-NEXT: vpermw %zmm19, %zmm12, %zmm19 -; AVX512DQ-BW-NEXT: vpermw %zmm17, %zmm13, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm22, %zmm6, %zmm17 {%k1} 
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512DQ-BW-NEXT: vpermw %zmm22, %zmm12, %zmm22 -; AVX512DQ-BW-NEXT: vpermw %zmm23, %zmm13, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm23 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm21 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm18, %zmm6, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512DQ-BW-NEXT: vpermw %zmm18, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermw %zmm20, %zmm13, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = 
xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm26, %zmm6, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm24 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm6, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rax), %xmm21 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-BW-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm3 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512DQ-BW-NEXT: vpermw %zmm6, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm13, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm13, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpermw %zmm4, %zmm12, %zmm4 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm13, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa 48(%r10), %xmm14 +; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa 48(%rax), %xmm15 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm18 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm22 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm25 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = 
xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm12, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm26 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm27 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm28 +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm7, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm29 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm18, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = 
xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm18, %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm22, %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm22 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rax), %xmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm17, %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm21 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm24 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = 
xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm22 +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm19 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -8481,172 +8432,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854] +; 
AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm6 = [1284,1798] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = 
[0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm30 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = 
[0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm13, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm16, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm25, %ymm17 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm25 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm27 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm24, 
%ymm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm17, %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm31 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm18, %zmm14, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm23, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm18, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm9, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm14, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm19, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm24 = 
xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm22 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm9, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] -; 
AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm21, %zmm14, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm21, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm27, %ymm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = 
xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm22, %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm25, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm9, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, 
%zmm14, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; 
AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index 68967c2ce653..c33776daf18f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -964,41 +964,11 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) { } define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8x i64> %a1) { -; X86-AVX512F-LABEL: blend_of_permutes_v16i32: -; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X86-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X86-AVX512F-NEXT: kmovw %eax, %k1 -; X86-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X86-AVX512F-NEXT: retl -; -; X86-AVX512BW-LABEL: blend_of_permutes_v16i32: -; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; 
X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X86-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X86-AVX512BW-NEXT: kmovd %eax, %k1 -; X86-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X86-AVX512BW-NEXT: retl -; -; X64-AVX512F-LABEL: blend_of_permutes_v16i32: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X64-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X64-AVX512F-NEXT: kmovw %eax, %k1 -; X64-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: blend_of_permutes_v16i32: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X64-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X64-AVX512BW-NEXT: kmovd %eax, %k1 -; X64-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X64-AVX512BW-NEXT: retq +; CHECK-LABEL: blend_of_permutes_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,21,6,23,16,1,2,19,12,29,14,31,24,9,10,27] +; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> %x0 = bitcast <8 x i64> %s0 to <16 x i32> From 4079ed3c9e72d64746c5d3f05fc585d844c1e8a7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 12 Jun 2025 17:35:55 +0900 Subject: [PATCH 0104/1322] ARM: Move setting of more runtime libcalls to RuntimeLibcallInfo (#143826) These are the easy cases that do not really depend on the subtarget, other than for the deceptive predicates on the subtarget class. Most of the rest of the cases here also do not, but this is obscured by going through helper predicates added onto the subtarget which hide dependence on TargetOptions. 
--- llvm/lib/IR/RuntimeLibcalls.cpp | 28 +++++++++++++++++++++++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 30 ------------------------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 31013310a746..331b319511ae 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -79,6 +79,34 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { } } } + + if (TT.isOSWindows()) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + {RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP}, + }; + + for (const auto &LC : LibraryCalls) { + Info.setLibcallName(LC.Op, LC.Name); + Info.setLibcallCallingConv(LC.Op, LC.CC); + } + } + + // Use divmod compiler-rt calls for iOS 5.0 and later. 
+ if (TT.isOSBinFormatMachO() && (!TT.isiOS() || !TT.isOSVersionLT(5, 0))) { + Info.setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + Info.setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + } } static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) { diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8455eef9bad3..d2e910a248f2 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -708,36 +708,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - if (Subtarget->isTargetWindows()) { - static const struct { - const RTLIB::Libcall Op; - const char * const Name; - const CallingConv::ID CC; - } LibraryCalls[] = { - { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallName(LC.Op, LC.Name); - setLibcallCallingConv(LC.Op, LC.CC); - } - } - - // Use divmod compiler-rt calls for iOS 5.0 and later. - if (Subtarget->isTargetMachO() && - !(Subtarget->isTargetIOS() && - Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { - setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); - setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); - } - // The half <-> float conversion functions are always soft-float on // non-watchos platforms, but are needed for some targets which use a // hard-float calling convention by default. 
From 5434b85d2c7a83d9cebae06dad2f9d630e9a3927 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 12 Jun 2025 17:38:52 +0900 Subject: [PATCH 0105/1322] ARM: Remove fake entries for divrem libcalls (#143832) This was defining aliases of the i32 divrem functions for the i8 and i16 cases. This is unnecessary and was unused. The divrem candidate cases wouldn't have formed with illegal types in the first place, so codegen wouldn't even query these. --- llvm/lib/IR/RuntimeLibcalls.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 331b319511ae..d84c56f0af5c 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -41,13 +41,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { const char *const Name; const CallingConv::ID CC; } LibraryCalls[] = { - {RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS}, - {RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS}, - - {RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS}, - {RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS}, }; @@ -62,13 +57,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { const char *const Name; const CallingConv::ID CC; } LibraryCalls[] = { - {RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS}, - {RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS}, - - {RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS}, - {RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I32, "__aeabi_uidivmod", 
CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS}, }; From ce621041c2f162c50d630810491c2feee8eb6c64 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Thu, 12 Jun 2025 16:39:57 +0800 Subject: [PATCH 0106/1322] [RISCV] Get host CPU name via hwprobe (#142745) We can get the `mvendorid/marchid/mimpid` via hwprobe and then we can compare these IDs with those defined in processors to find the CPU name. With this change, `-mcpu/-mtune=native` can set the proper name. --- .../llvm/TargetParser/RISCVTargetParser.h | 8 +++++ llvm/lib/TargetParser/Host.cpp | 30 +++++++++++++++---- llvm/lib/TargetParser/RISCVTargetParser.cpp | 15 +++++++--- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index 41fdab6012aa..19a8af0cb956 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -29,6 +29,13 @@ struct CPUModel { uint32_t MVendorID; uint64_t MArchID; uint64_t MImpID; + + bool isValid() const { return MVendorID != 0 && MArchID != 0 && MImpID != 0; } + + bool operator==(const CPUModel &Other) const { + return MVendorID == Other.MVendorID && MArchID == Other.MArchID && + MImpID == Other.MImpID; + } }; struct CPUInfo { @@ -58,6 +65,7 @@ LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU); LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU); LLVM_ABI bool hasValidCPUModel(StringRef CPU); LLVM_ABI CPUModel getCPUModel(StringRef CPU); +LLVM_ABI StringRef getCPUNameFromCPUModel(const CPUModel &Model); } // namespace RISCV diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 14acef116708..5957e1befe2d 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -18,6 +18,7 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include 
"llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/Triple.h" #include "llvm/TargetParser/X86TargetParser.h" #include @@ -1672,8 +1673,32 @@ StringRef sys::getHostCPUName() { return "generic"; } #elif defined(__riscv) +#if defined(__linux__) +// struct riscv_hwprobe +struct RISCVHwProbe { + int64_t Key; + uint64_t Value; +}; +#endif + StringRef sys::getHostCPUName() { #if defined(__linux__) + // Try the hwprobe way first. + RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_MVENDORID=*/0, 0}, + {/*RISCV_HWPROBE_KEY_MARCHID=*/1, 0}, + {/*RISCV_HWPROBE_KEY_MIMPID=*/2, 0}}; + int Ret = syscall(/*__NR_riscv_hwprobe=*/258, /*pairs=*/Query, + /*pair_count=*/std::size(Query), /*cpu_count=*/0, + /*cpus=*/0, /*flags=*/0); + if (Ret == 0) { + RISCV::CPUModel Model{static_cast(Query[0].Value), Query[1].Value, + Query[2].Value}; + StringRef Name = RISCV::getCPUNameFromCPUModel(Model); + if (!Name.empty()) + return Name; + } + + // Then try the cpuinfo way. std::unique_ptr P = getProcCpuinfoContent(); StringRef Content = P ? 
P->getBuffer() : ""; StringRef Name = detail::getHostCPUNameForRISCV(Content); @@ -2148,11 +2173,6 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && defined(__riscv) -// struct riscv_hwprobe -struct RISCVHwProbe { - int64_t Key; - uint64_t Value; -}; const StringMap sys::getHostCPUFeatures() { RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0}, {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0}, diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index 2e5e8f4e50c9..9957ec0c28d8 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -57,10 +57,7 @@ bool hasFastVectorUnalignedAccess(StringRef CPU) { return Info && Info->FastVectorUnalignedAccess; } -bool hasValidCPUModel(StringRef CPU) { - const CPUModel Model = getCPUModel(CPU); - return Model.MVendorID != 0 && Model.MArchID != 0 && Model.MImpID != 0; -} +bool hasValidCPUModel(StringRef CPU) { return getCPUModel(CPU).isValid(); } CPUModel getCPUModel(StringRef CPU) { const CPUInfo *Info = getCPUInfoByName(CPU); @@ -69,6 +66,16 @@ CPUModel getCPUModel(StringRef CPU) { return Info->Model; } +StringRef getCPUNameFromCPUModel(const CPUModel &Model) { + if (!Model.isValid()) + return ""; + + for (auto &C : RISCVCPUInfo) + if (C.Model == Model) + return C.Name; + return ""; +} + bool parseCPU(StringRef CPU, bool IsRV64) { const CPUInfo *Info = getCPUInfoByName(CPU); From 4551e5035565606eb04253a35f31d51685657436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Thu, 12 Jun 2025 10:49:23 +0200 Subject: [PATCH 0107/1322] [clang] Reset FileID based diag state mappings (#143695) When sharing same compiler instance for multiple compilations, we reset source manager's file id tables in between runs. Diagnostics engine keeps a cache based on these file ids, that became dangling references across compilations. 
This patch makes sure we reset those whenever sourcemanager is trashing its FileIDs. --- clang/include/clang/Basic/Diagnostic.h | 13 +++-- clang/lib/Basic/Diagnostic.cpp | 4 +- clang/lib/Basic/SourceManager.cpp | 3 ++ .../Frontend/CompilerInstanceTest.cpp | 51 +++++++++++++++++++ 4 files changed, 67 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index efee8302e750..7ae4ef7df138 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -424,10 +424,13 @@ private: bool empty() const { return Files.empty(); } /// Clear out this map. - void clear() { + void clear(bool Soft) { + // Just clear the cache when in soft mode. Files.clear(); - FirstDiagState = CurDiagState = nullptr; - CurDiagStateLoc = SourceLocation(); + if (!Soft) { + FirstDiagState = CurDiagState = nullptr; + CurDiagStateLoc = SourceLocation(); + } } /// Produce a debugging dump of the diagnostic state. @@ -920,6 +923,10 @@ public: /// Reset the state of the diagnostic object to its initial configuration. /// \param[in] soft - if true, doesn't reset the diagnostic mappings and state void Reset(bool soft = false); + /// We keep a cache of FileIDs for diagnostics mapped by pragmas. These might + /// get invalidated when diagnostics engine is shared across different + /// compilations. Provide users with a way to reset that. + void ResetPragmas(); //===--------------------------------------------------------------------===// // DiagnosticsEngine classification and reporting interfaces. 
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 95d86cb153b4..a30bfa28eca7 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -119,6 +119,8 @@ bool DiagnosticsEngine::popMappings(SourceLocation Loc) { return true; } +void DiagnosticsEngine::ResetPragmas() { DiagStatesByLoc.clear(/*Soft=*/true); } + void DiagnosticsEngine::Reset(bool soft /*=false*/) { ErrorOccurred = false; UncompilableErrorOccurred = false; @@ -135,7 +137,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) { if (!soft) { // Clear state related to #pragma diagnostic. DiagStates.clear(); - DiagStatesByLoc.clear(); + DiagStatesByLoc.clear(false); DiagStateOnPushStack.clear(); // Create a DiagState and DiagStatePoint representing diagnostic changes diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 09e5c6547fb5..053e82683a4a 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -344,6 +344,9 @@ void SourceManager::clearIDTables() { NextLocalOffset = 0; CurrentLoadedOffset = MaxLoadedOffset; createExpansionLoc(SourceLocation(), SourceLocation(), SourceLocation(), 1); + // Diagnostics engine keeps some references to fileids, mostly for dealing + // with diagnostic pragmas, make sure they're reset as well. 
+ Diag.ResetPragmas(); } bool SourceManager::isMainFile(const FileEntry &SourceFile) { diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp index a7b258d5e537..459a3864887e 100644 --- a/clang/unittests/Frontend/CompilerInstanceTest.cpp +++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp @@ -9,9 +9,12 @@ #include "clang/Frontend/CompilerInstance.h" #include "clang/Basic/FileManager.h" #include "clang/Frontend/CompilerInvocation.h" +#include "clang/Frontend/FrontendActions.h" #include "clang/Frontend/TextDiagnosticPrinter.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/VirtualFileSystem.h" #include "gtest/gtest.h" @@ -97,4 +100,52 @@ TEST(CompilerInstance, AllowDiagnosticLogWithUnownedDiagnosticConsumer) { ASSERT_EQ(DiagnosticOutput, "error: expected no crash\n"); } +TEST(CompilerInstance, MultipleInputsCleansFileIDs) { + auto VFS = makeIntrusiveRefCnt(); + VFS->addFile("a.cc", /*ModificationTime=*/{}, + MemoryBuffer::getMemBuffer(R"cpp( + #include "a.h" + )cpp")); + // Paddings of `void foo();` in the sources below are "important". We're + // testing against source locations from previous compilations colliding. + // Hence the `unused` variable in `b.h` needs to be within `#pragma clang + // diagnostic` block from `a.h`. 
+ VFS->addFile("a.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp( + #include "b.h" + #pragma clang diagnostic push + #pragma clang diagnostic warning "-Wunused" + void foo(); + #pragma clang diagnostic pop + )cpp")); + VFS->addFile("b.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp( + void foo(); void foo(); void foo(); void foo(); + inline void foo() { int unused = 2; } + )cpp")); + + DiagnosticOptions DiagOpts; + IntrusiveRefCntPtr Diags = + CompilerInstance::createDiagnostics(*VFS, DiagOpts); + + CreateInvocationOptions CIOpts; + CIOpts.Diags = Diags; + + const char *Args[] = {"clang", "-xc++", "a.cc"}; + std::shared_ptr CInvok = + createInvocation(Args, std::move(CIOpts)); + ASSERT_TRUE(CInvok) << "could not create compiler invocation"; + + CompilerInstance Instance(std::move(CInvok)); + Instance.setDiagnostics(Diags.get()); + Instance.createFileManager(VFS); + + // Run once for `a.cc` and then for `a.h`. This makes sure we get the same + // file ID for `b.h` in the second run as `a.h` from first run. + const auto &OrigInputKind = Instance.getFrontendOpts().Inputs[0].getKind(); + Instance.getFrontendOpts().Inputs.emplace_back("a.h", OrigInputKind); + + SyntaxOnlyAction Act; + EXPECT_TRUE(Instance.ExecuteAction(Act)) << "Failed to execute action"; + EXPECT_FALSE(Diags->hasErrorOccurred()); + EXPECT_EQ(Diags->getNumWarnings(), 0u); +} } // anonymous namespace From db8d34db26e9ea92c08d6e813eca9cce40c48478 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 12 Jun 2025 10:04:08 +0100 Subject: [PATCH 0108/1322] [VPlan] Set branch weight metadata on middle term in VPlan (NFC) (#143035) Manage branch weights for the BranchOnCond in the middle block in VPlan. This requires updating VPInstruction to inherit from VPIRMetadata, which in general makes sense as there are a number of opcodes that could take metadata. There are other branches (part of the skeleton) that also need branch weights adding. 
PR: https://github.com/llvm/llvm-project/pull/143035 --- .../Transforms/Vectorize/LoopVectorize.cpp | 48 ++++++++++------- llvm/lib/Transforms/Vectorize/VPlan.h | 53 ++++++++++--------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++- 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d23611183639..93ab3353a296 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7273,6 +7273,33 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); } +/// Add branch weight metadata, if the \p Plan's middle block is terminated by a +/// BranchOnCond recipe. +static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, + Loop *OrigLoop) { + // 4. Adjust branch weight of the branch in the middle block. + Instruction *LatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + if (!hasBranchWeightMD(*LatchTerm)) + return; + + VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock(); + auto *MiddleTerm = + dyn_cast_or_null(MiddleVPBB->getTerminator()); + // Only add branch metadata if there is a (conditional) terminator. + if (!MiddleTerm) + return; + + assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond && + "must have a BranchOnCond"); + // Assume that `Count % VectorTripCount` is equally distributed. 
+ unsigned TripCount = Plan.getUF() * VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + MDBuilder MDB(LatchTerm->getContext()); + MDNode *BranchWeights = + MDB.createBranchWeights({1, TripCount - 1}, /*IsExpected=*/false); + MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); +} + DenseMap LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) { @@ -7295,11 +7322,8 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::convertToConcreteRecipes(BestVPlan, *Legal->getWidestInductionType()); - // Retrieve and store the middle block before dissolving regions. Regions are - // dissolved after optimizing for VF and UF, which completely removes unneeded - // loop regions first. - VPBasicBlock *MiddleVPBB = - BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr; + + addBranchWeightToMiddleTerminator(BestVPlan, BestVF, OrigLoop); VPlanTransforms::dissolveLoopRegions(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7442,20 +7466,6 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); - // 4. Adjust branch weight of the branch in the middle block. - if (HeaderVPBB) { - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. 
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); - } - } - return ExpandedSCEVs; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index acc861b99197..468284168e9c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -882,11 +882,39 @@ protected: unsigned getUnrollPart(VPUser &U) const; }; +/// Helper to manage IR metadata for recipes. It filters out metadata that +/// cannot be propagated. +class VPIRMetadata { + SmallVector> Metadata; + +public: + VPIRMetadata() {} + + /// Adds metatadata that can be preserved from the original instruction + /// \p I. + VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } + + /// Adds metatadata that can be preserved from the original instruction + /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. + VPIRMetadata(Instruction &I, LoopVersioning *LVer); + + /// Copy constructor for cloning. + VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + + /// Add all metadata to \p I. + void applyMetadata(Instruction &I) const; + + void addMetadata(unsigned Kind, MDNode *Node) { + Metadata.emplace_back(Kind, Node); + } +}; + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
class VPInstruction : public VPRecipeWithIRFlags, + public VPIRMetadata, public VPUnrollPartAccessor<1> { friend class VPlanSlp; @@ -976,7 +1004,7 @@ public: VPInstruction(unsigned Opcode, ArrayRef Operands, DebugLoc DL = {}, const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), - Opcode(Opcode), Name(Name.str()) {} + VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, DebugLoc DL = {}, @@ -1268,29 +1296,6 @@ protected: const VPRecipeBase *getAsRecipe() const override { return this; } }; -/// Helper to manage IR metadata for recipes. It filters out metadata that -/// cannot be propagated. -class VPIRMetadata { - SmallVector> Metadata; - -public: - VPIRMetadata() {} - - /// Adds metatadata that can be preserved from the original instruction - /// \p I. - VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } - - /// Adds metatadata that can be preserved from the original instruction - /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. - VPIRMetadata(Instruction &I, LoopVersioning *LVer); - - /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} - - /// Add all metadata to \p I. - void applyMetadata(Instruction &I) const; -}; - /// VPWidenRecipe is a recipe for producing a widened instruction using the /// opcode and operands of the recipe. 
This recipe covers most of the /// traditional vectorization cases where each recipe transforms into a diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 62b99d98a2b5..f5a2533727b3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -410,7 +410,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, DebugLoc DL, const Twine &Name) : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), - Opcode(Opcode), Name(Name.str()) { + VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { assert(flagsValidForOpcode(getOpcode()) && "Set flags not supported for the provided opcode"); } @@ -591,7 +591,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::BranchOnCond: { Value *Cond = State.get(getOperand(0), VPLane(0)); - return createCondBranch(Cond, getParent(), State); + auto *Br = createCondBranch(Cond, getParent(), State); + applyMetadata(*Br); + return Br; } case VPInstruction::BranchOnCount: { // First create the compare. 
From 2a27c059eccd96b6e46464dbdf69fd2f6237a56c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 12 Jun 2025 10:46:08 +0100 Subject: [PATCH 0109/1322] [X86] Use BSR passthrough behaviour to fold (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X) (#143662) Make use of targets that support BSR "pass through behaviour" on a zero input to remove a CMOV that's performing the same function. BSF will be a trickier patch as we need to make sure it works with the "REP BSF" hack in X86MCInstLower. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++++++ llvm/test/CodeGen/X86/bsr.ll | 10 ++++------ llvm/test/CodeGen/X86/pr40090.ll | 11 ++++------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b0553aa4b819..f0fbf55e97be 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49398,6 +49398,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) + // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X) + // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X) if ((CC == X86::COND_NE || CC == X86::COND_E) && Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { SDValue Add = TrueOp; @@ -49406,6 +49408,14 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_E) std::swap(Add, Const); + // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack. + if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR && + Add.getResNo() == 0 && Add.hasOneUse() && + Add.getOperand(1) == Cond.getOperand(0)) { + return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const, + Add.getOperand(1)); + } + // We might have replaced the constant in the cmov with the LHS of the // compare. If so change it to the RHS of the compare. 
if (Const == Cond.getOperand(0)) diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll index 1247b3ec5932..fbca4af425ea 100644 --- a/llvm/test/CodeGen/X86/bsr.ll +++ b/llvm/test/CodeGen/X86/bsr.ll @@ -162,9 +162,8 @@ define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind { ; ; X64-LABEL: cmov_bsr32: ; X64: # %bb.0: -; X64-NEXT: movl $63, %eax +; X64-NEXT: movl %esi, %eax ; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false) %2 = xor i32 %1, 31 @@ -188,8 +187,8 @@ define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind { ; ; X64-LABEL: cmov_bsr32_undef: ; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax ; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true) %2 = xor i32 %1, 31 @@ -239,9 +238,8 @@ define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind { ; ; X64-LABEL: cmov_bsr64: ; X64: # %bb.0: -; X64-NEXT: movl $127, %eax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: bsrq %rdi, %rax -; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false) %2 = xor i64 %1, 63 @@ -279,8 +277,8 @@ define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind { ; ; X64-LABEL: cmov_bsr64_undef: ; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: bsrq %rdi, %rax -; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true) %2 = xor i64 %1, 63 diff --git a/llvm/test/CodeGen/X86/pr40090.ll b/llvm/test/CodeGen/X86/pr40090.ll index 24e957ac59f5..af933c950e11 100644 --- a/llvm/test/CodeGen/X86/pr40090.ll +++ b/llvm/test/CodeGen/X86/pr40090.ll @@ -4,10 +4,9 @@ define i64 @foo(i64 %x, i64 %y) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: bsrq %rdi, %rax -; CHECK-NEXT: orq $64, %rax +; CHECK-NEXT: bsrq %rdi, %rcx +; CHECK-NEXT: orq $64, %rcx ; CHECK-NEXT: bsrq %rsi, %rcx -; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: movl $63, %eax ; CHECK-NEXT: subq %rcx, %rax ; CHECK-NEXT: 
retq @@ -25,11 +24,9 @@ define i64 @bar(i64 %x, i64 %y) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $127, %ecx -; CHECK-NEXT: movl $127, %eax -; CHECK-NEXT: bsrq %rdi, %rax -; CHECK-NEXT: xorq $64, %rax +; CHECK-NEXT: bsrq %rdi, %rcx +; CHECK-NEXT: xorq $64, %rcx ; CHECK-NEXT: bsrq %rsi, %rcx -; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: movl $63, %eax ; CHECK-NEXT: subq %rcx, %rax ; CHECK-NEXT: retq From 1d1f9afe911c360b9505b5fd2c712cb112c8aa5f Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 17:42:00 +0800 Subject: [PATCH 0110/1322] [C++20] [Modules] Treat directly imported internal partition unit as reachable Close https://github.com/llvm/llvm-project/issues/143788 See the discussion for details. --- clang/lib/Sema/SemaLookup.cpp | 23 ++++++++++++++++++----- clang/lib/Sema/SemaModule.cpp | 13 +++++++------ clang/test/Modules/pr143788.cppm | 28 ++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 clang/test/Modules/pr143788.cppm diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index eef134b15843..91822909f1fd 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -1978,6 +1978,8 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) { if (D->isModulePrivate()) return false; + Module *DeclTopModule = DeclModule->getTopLevelModule(); + // [module.reach]/p1 // A translation unit U is necessarily reachable from a point P if U is a // module interface unit on which the translation unit containing P has an @@ -1996,17 +1998,28 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) { // // Here we only check for the first condition. Since we couldn't see // DeclModule if it isn't (transitively) imported. 
- if (DeclModule->getTopLevelModule()->isModuleInterfaceUnit()) + if (DeclTopModule->isModuleInterfaceUnit()) return true; - // [module.reach]/p2 + // [module.reach]/p1,2 + // A translation unit U is necessarily reachable from a point P if U is a + // module interface unit on which the translation unit containing P has an + // interface dependency, or the translation unit containing P imports U, in + // either case prior to P + // // Additional translation units on // which the point within the program has an interface dependency may be // considered reachable, but it is unspecified which are and under what // circumstances. - // - // The decision here is to treat all additional tranditional units as - // unreachable. + Module *CurrentM = SemaRef.getCurrentModule(); + + // Directly imported module are necessarily reachable. + // Since we can't export import a module implementation partition unit, we + // don't need to count for Exports here. + if (CurrentM && CurrentM->getTopLevelModule()->Imports.count(DeclTopModule)) + return true; + + // Then we treat all module implementation partition unit as unreachable. return false; } diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 6c4df0aa35af..9fcaad48d305 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -712,7 +712,13 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) { Diag(ExportLoc, diag::err_export_partition_impl) << SourceRange(ExportLoc, Path.back().getLoc()); - } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) { + } else if (ExportLoc.isValid() && + (ModuleScopes.empty() || currentModuleIsImplementation())) { + // [module.interface]p1: + // An export-declaration shall inhabit a namespace scope and appear in the + // purview of a module interface unit. 
+ Diag(ExportLoc, diag::err_export_not_in_module_interface); + } else if (!ModuleScopes.empty()) { // Re-export the module if the imported module is exported. // Note that we don't need to add re-exported module to Imports field // since `Exports` implies the module is imported already. @@ -720,11 +726,6 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, getCurrentModule()->Exports.emplace_back(Mod, false); else getCurrentModule()->Imports.insert(Mod); - } else if (ExportLoc.isValid()) { - // [module.interface]p1: - // An export-declaration shall inhabit a namespace scope and appear in the - // purview of a module interface unit. - Diag(ExportLoc, diag::err_export_not_in_module_interface); } return Import; diff --git a/clang/test/Modules/pr143788.cppm b/clang/test/Modules/pr143788.cppm new file mode 100644 index 000000000000..5ae36d8d0e85 --- /dev/null +++ b/clang/test/Modules/pr143788.cppm @@ -0,0 +1,28 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm +// RUN: %clang_cc1 -std=c++20 %t/P.cppm -emit-module-interface -o %t/P.pcm +// RUN: %clang_cc1 -std=c++20 %t/I.cpp -fmodule-file=M:P=%t/P.pcm -fmodule-file=M=%t/M.pcm -fsyntax-only -verify + +//--- H.hpp +struct S{}; + +//--- M.cppm +export module M; + + +//--- P.cppm +module; +#include "H.hpp" +module M:P; + +using T = S; + +//--- I.cpp +// expected-no-diagnostics +module M; +import :P; + +T f() { return {}; } From 8e4fdff6f02161d878a63900abb35aaa32ff85e9 Mon Sep 17 00:00:00 2001 From: Omair Javaid Date: Thu, 12 Jun 2025 14:48:13 +0500 Subject: [PATCH 0111/1322] [X86] Update tailcc-ssp.ll assertions using update_llc_test_checks.py (#143500) The assertions in llvm/test/CodeGen/X86/tailcc-ssp.ll were outdated. The initial comment indicated they were generated with `utils/update_llc_test_checks.py UTC_ARGS: --version 5`, but this was not accurate based on the file's content. 
Running `utils/update_llc_test_checks.py` regenerated the assertions, aligning them with the current `llc` output. This commit ensures that the test's claimed behavior accurately reflects the actual `llc` output, even though the tests were already passing. This was identified by @efriedma-quic during review of #136290. Submitting a separate PR to make sure these changes stay isolated. --- llvm/test/CodeGen/X86/tailcc-ssp.ll | 55 ++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll index 5211e4fe9eef..7ea5dd49f024 100644 --- a/llvm/test/CodeGen/X86/tailcc-ssp.ll +++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll @@ -78,7 +78,7 @@ define void @tailcall_unrelated_frame() sspreq { ; WINDOWS-NEXT: callq __security_check_cookie ; WINDOWS-NEXT: int3 ; WINDOWS-NEXT: .seh_endproc - +; ; LINUX-LABEL: tailcall_unrelated_frame: ; LINUX: # %bb.0: ; LINUX-NEXT: pushq %rax @@ -97,6 +97,7 @@ define void @tailcall_unrelated_frame() sspreq { ; LINUX-NEXT: .cfi_def_cfa_offset 16 ; LINUX-NEXT: callq __stack_chk_fail@PLT + call void @bar() tail call void @bar() ret void @@ -105,18 +106,48 @@ define void @tailcall_unrelated_frame() sspreq { declare void @callee() define void @caller() sspreq { ; WINDOWS-LABEL: caller: -; WINDOWS: callq callee -; WINDOWS: callq callee -; WINDOWS: cmpq __security_cookie(%rip), %rcx -; WINDOWS: jne -; WINDOWS: callq __security_check_cookie - +; WINDOWS: # %bb.0: +; WINDOWS-NEXT: subq $40, %rsp +; WINDOWS-NEXT: .seh_stackalloc 40 +; WINDOWS-NEXT: .seh_endprologue +; WINDOWS-NEXT: movq __security_cookie(%rip), %rax +; WINDOWS-NEXT: xorq %rsp, %rax +; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: callq callee +; WINDOWS-NEXT: callq callee +; WINDOWS-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; WINDOWS-NEXT: xorq %rsp, %rcx +; WINDOWS-NEXT: cmpq __security_cookie(%rip), %rcx +; WINDOWS-NEXT: jne .LBB2_2 +; WINDOWS-NEXT: # %bb.1: +; WINDOWS-NEXT: 
.seh_startepilogue +; WINDOWS-NEXT: addq $40, %rsp +; WINDOWS-NEXT: .seh_endepilogue +; WINDOWS-NEXT: retq +; WINDOWS-NEXT: .LBB2_2: +; WINDOWS-NEXT: callq __security_check_cookie +; WINDOWS-NEXT: int3 +; WINDOWS-NEXT: .seh_endproc +; ; LINUX-LABEL: caller: -; LINUX: callq callee@PLT -; LINUX: callq callee@PLT -; LINUX: cmpq -; LINUX: jne -; LINUX: callq __stack_chk_fail@PLT +; LINUX: # %bb.0: +; LINUX-NEXT: pushq %rax +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: movq %fs:40, %rax +; LINUX-NEXT: movq %rax, (%rsp) +; LINUX-NEXT: callq callee@PLT +; LINUX-NEXT: callq callee@PLT +; LINUX-NEXT: movq %fs:40, %rax +; LINUX-NEXT: cmpq (%rsp), %rax +; LINUX-NEXT: jne .LBB2_2 +; LINUX-NEXT: # %bb.1: # %SP_return +; LINUX-NEXT: popq %rax +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: retq +; LINUX-NEXT: .LBB2_2: # %CallStackCheckFailBlk +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: callq __stack_chk_fail@PLT + tail call void @callee() call void @callee() From 3e5d50f9c61bb266ab17919ab5209c7b08520aff Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Thu, 12 Jun 2025 15:20:39 +0530 Subject: [PATCH 0112/1322] [NVPTX] Add cta_group support to TMA G2S intrinsics (#143178) This patch extends the TMA G2S intrinsics with the support for cta_group::1/2 available from Blackwell onwards. The existing intrinsics are auto-upgraded with a default value of '0' for the `cta_group` flag operand. * lit tests are added for all combinations of the newer variants. * Negative tests are added to validate the error-handling when the value of the cta_group flag falls out-of-range. * The generated PTX is verified with a 12.8 ptxas executable. 
Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 32 +- llvm/include/llvm/IR/IntrinsicsNVVM.td | 32 +- llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 9 + llvm/lib/IR/AutoUpgrade.cpp | 104 ++++- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 19 + .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 1 + llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 19 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 17 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 8 + .../Assembler/auto_upgrade_nvvm_intrinsics.ll | 16 +- .../NVPTX/cp-async-bulk-tensor-g2s-1cta.ll | 435 ++++++++++++++++++ .../NVPTX/cp-async-bulk-tensor-g2s-2cta.ll | 435 ++++++++++++++++++ .../NVPTX/cp-async-bulk-tensor-g2s-invalid.ll | 15 + 13 files changed, 1078 insertions(+), 64 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index d51686c0b830..abd7ca545364 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1016,7 +1016,7 @@ Syntax: .. code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) @@ -1034,18 +1034,26 @@ source tensor is preserved at the destination. 
The dimension of the tensor data ranges from 1d to 5d with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments. -* The last two arguments to these intrinsics are boolean flags - indicating support for cache_hint and/or multicast modifiers. - These flag arguments must be compile-time constants. The backend - looks through these flags and lowers the intrinsics appropriately. +* The last three arguments to these intrinsics are flags + indicating support for multicast, cache_hint and cta_group::1/2 + modifiers. These flag arguments must be compile-time constants. + The backend looks through these flags and lowers the intrinsics + appropriately. -* The Nth argument (denoted by ``i1 flag_ch``) when set, indicates +* The argument denoted by ``i1 %flag_ch`` when set, indicates a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` variant of the PTX instruction. -* The [N-1]th argument (denoted by ``i1 flag_mc``) when set, indicates - the presence of a multicast mask (``i16 %mc``) and generates the PTX - instruction with the ``.multicast::cluster`` modifier. +* The argument denoted by ``i1 %flag_mc`` when set, indicates + the presence of a multicast mask (``i16 %mc``) and generates + the PTX instruction with the ``.multicast::cluster`` modifier. + +* The argument denoted by ``i32 %flag_cta_group`` takes values within + the range [0, 3) i.e. {0,1,2}. When the value of ``%flag_cta_group`` + is not within the range, it may raise an error from the Verifier. + The default value is '0' with no cta_group modifier in the + instruction. The values of '1' and '2' lower to ``cta_group::1`` + and ``cta_group::2`` variants of the PTX instruction respectively. For more information, refer PTX ISA ``_. @@ -1058,7 +1066,7 @@ Syntax: .. 
code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...) @@ -1074,8 +1082,8 @@ are unrolled into a single dimensional column at the destination. In this mode, the tensor has to be at least three-dimensional. Along with the tensor coordinates, im2col offsets are also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less -than the number of dimensions of the tensor operation. The last two arguments -to these intrinsics are boolean flags, with the same functionality as described +than the number of dimensions of the tensor operation. The last three arguments +to these intrinsics are flags, with the same functionality as described in the ``tile`` mode intrinsics above. 
For more information, refer PTX ISA diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 8c8e778b5706..4efdff71c016 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2020,20 +2020,26 @@ foreach dim = 1...5 in { defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0); defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets); + defvar g2s_params = !listconcat( + [llvm_shared_cluster_ptr_ty, // dst_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + im2col_offsets_args, // im2col offsets + [llvm_i16_ty, // cta_mask + llvm_i64_ty]); // cache_hint + defvar g2s_flags = [llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty, // Flag for cache_hint + llvm_i32_ty]; // Flag for cta_group + defvar cta_group_idx = !add( + !size(g2s_params), + !sub(!size(g2s_flags), 1)); + defvar g2s_props = [IntrConvergent, + WriteOnly>, ReadOnly>, + // Allowed values for cta_group are {0,1,2} i.e [0, 3). 
+ Range, 0, 3>]; def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d : - DefaultAttrsIntrinsicFlags<[], - !listconcat([llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr - llvm_shared_ptr_ty, // mbarrier_smem_ptr - llvm_ptr_ty], // tensormap_ptr - tensor_dim_args, // actual tensor dims - im2col_offsets_args, // im2col offsets - [llvm_i16_ty, // cta_mask - llvm_i64_ty]), // cache_hint - [llvm_i1_ty, // Flag for cta_mask - llvm_i1_ty], // Flag for cache_hint - [IntrConvergent, - WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, NoCapture>]>; + DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>; def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d : DefaultAttrsIntrinsicFlags<[], diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index ce794e257363..737610b73b08 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -38,6 +38,15 @@ enum class TMAReductionOp : uint8_t { XOR = 7, }; +// Enum to represent the cta_group::1 and +// cta_group::2 variants in TMA/TCGEN05 family of +// PTX instructions. +enum class CTAGroupKind : uint8_t { + CG_NONE = 0, // default with no cta_group modifier + CG_1 = 1, // cta_group::1 modifier + CG_2 = 2, // cta_group::2 modifier +}; + inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { case Intrinsic::nvvm_f2i_rm_ftz: diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index a0886776ff93..6e7254ec3e31 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -945,6 +945,53 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // No other 'arm.*', 'aarch64.*'. 
} +static Intrinsic::ID shouldUpgradeNVPTXTMAG2SIntrinsics(Function *F, + StringRef Name) { + if (Name.consume_front("cp.async.bulk.tensor.g2s.")) { + Intrinsic::ID ID = + StringSwitch(Name) + .Case("im2col.3d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d) + .Case("im2col.4d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d) + .Case("im2col.5d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d) + .Case("tile.1d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d) + .Case("tile.2d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d) + .Case("tile.3d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d) + .Case("tile.4d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d) + .Case("tile.5d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d) + .Default(Intrinsic::not_intrinsic); + + if (ID == Intrinsic::not_intrinsic) + return ID; + + // These intrinsics may need upgrade for two reasons: + // (1) When the address-space of the first argument is shared[AS=3] + // (and we upgrade it to use shared_cluster address-space[AS=7]) + if (F->getArg(0)->getType()->getPointerAddressSpace() == + NVPTXAS::ADDRESS_SPACE_SHARED) + return ID; + + // (2) When there are only two boolean flag arguments at the end: + // + // The last three parameters of the older version of these + // intrinsics are: arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag + // + // The newer version reads as: + // arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag, i32 cta_group_flag + // + // So, when the type of the [N-3]rd argument is "not i1", then + // it is the older version and we need to upgrade. 
+ size_t FlagStartIndex = F->getFunctionType()->getNumParams() - 3; + Type *ArgType = F->getFunctionType()->getParamType(FlagStartIndex); + if (!ArgType->isIntegerTy(1)) + return ID; + } + + return Intrinsic::not_intrinsic; +} + static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F, StringRef Name) { if (Name.consume_front("mapa.shared.cluster")) @@ -959,22 +1006,6 @@ static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F, Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster) .Case("shared.cta.to.cluster", Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster) - .Case("tensor.g2s.im2col.3d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d) - .Case("tensor.g2s.im2col.4d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d) - .Case("tensor.g2s.im2col.5d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d) - .Case("tensor.g2s.tile.1d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d) - .Case("tensor.g2s.tile.2d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d) - .Case("tensor.g2s.tile.3d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d) - .Case("tensor.g2s.tile.4d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d) - .Case("tensor.g2s.tile.5d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) @@ -1339,6 +1370,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, return true; } + // Upgrade TMA copy G2S Intrinsics + IID = shouldUpgradeNVPTXTMAG2SIntrinsics(F, Name); + if (IID != Intrinsic::not_intrinsic) { + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); + return true; + } + // The following nvvm intrinsics correspond exactly to an LLVM idiom, but // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall. 
// @@ -4831,7 +4870,18 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: - case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: + case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: { + // Create a new call with the correct address space. + SmallVector Args(CI->args()); + Args[0] = Builder.CreateAddrSpaceCast( + Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)); + + NewCall = Builder.CreateCall(NewFn, Args); + NewCall->takeName(CI); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); + return; + } case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: @@ -4840,10 +4890,22 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: { - // Create a new call with the correct address space. - SmallVector Args(CI->args()); - Args[0] = Builder.CreateAddrSpaceCast( - Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)); + SmallVector Args(CI->args()); + + // Create AddrSpaceCast to shared_cluster if needed. + // This handles case (1) in shouldUpgradeNVPTXTMAG2SIntrinsics(). + unsigned AS = CI->getArgOperand(0)->getType()->getPointerAddressSpace(); + if (AS == NVPTXAS::ADDRESS_SPACE_SHARED) + Args[0] = Builder.CreateAddrSpaceCast( + Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)); + + // Attach the flag argument for cta_group, with a + // default value of 0. This handles case (2) in + // shouldUpgradeNVPTXTMAG2SIntrinsics(). 
+ size_t NumArgs = CI->arg_size(); + Value *FlagArg = CI->getArgOperand(NumArgs - 3); + if (!FlagArg->getType()->isIntegerTy(1)) + Args.push_back(ConstantInt::get(Builder.getInt32Ty(), 0)); NewCall = Builder.CreateCall(NewFn, Args); NewCall->takeName(CI); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index b4616b64bad1..732950deca9f 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -437,3 +437,22 @@ void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum, llvm_unreachable( "Invalid Reduction Op in printCpAsyncBulkTensorReductionMode"); } + +void NVPTXInstPrinter::printCTAGroup(const MCInst *MI, int OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + using CGTy = nvvm::CTAGroupKind; + + switch (static_cast(MO.getImm())) { + case CGTy::CG_NONE: + O << ""; + return; + case CGTy::CG_1: + O << ".cta_group::1"; + return; + case CGTy::CG_2: + O << ".cta_group::2"; + return; + } + llvm_unreachable("Invalid cta_group in printCTAGroup"); +} diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index a2dd772cd86d..f73af7a3f2c6 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -51,6 +51,7 @@ public: void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O); void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O); void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O); + void printCTAGroup(const MCInst *MI, int OpNum, raw_ostream &O); }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 32223bf3d601..a20099788d09 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2556,19 
+2556,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2} // multicast, cache_hint, - // multicast_flag, cache_hint_flag} + // multicast_flag, cache_hint_flag, cta_group_flag} // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {7 + dims + im2col_offsets} + // = {2} + {8 + dims + im2col_offsets} size_t NumOps = N->getNumOperands(); size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 9); + : (NumOps - 10); // Offsets is always 'NumDims - 2' and only for im2col mode size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1; + bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1; + bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1; size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src} size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID + unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1); + if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()) + report_fatal_error( + formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}", + Subtarget->getSmVersion())); + SDLoc DL(N); SmallVector Ops(N->ops().slice(2, NumBaseArgs)); @@ -2580,6 +2586,9 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, if (IsCacheHint) Ops.push_back(N->getOperand(MultiCastIdx + 1)); + // Flag for CTA Group + Ops.push_back(getI32Imm(CTAGroupVal, DL)); + // Finally, the chain operand Ops.push_back(N->getOperand(0)); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 83d7defe6d9a..f52ff39c3e1a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -578,10 +578,14 @@ class 
G2S_STRINGS { # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); } +def CTAGroupFlags : Operand { + let PrintMethod = "printCTAGroup"; +} + multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR { defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i)); defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); - defvar asm_str_default = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; defvar rc = !if(is_shared32, Int32Regs, Int64Regs); defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); @@ -595,19 +599,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR !strconcat(asm_str_default, im2col_asm_str), asm_str_default); def "" : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, + (ins Int16Regs:$mc, CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; def _CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, + (ins Int64Regs:$ch, CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC_CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, + (ins Int16Regs:$mc, Int64Regs:$ch, CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc, 
$ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 5136b1ee2850..d2eae4882682 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -117,6 +117,14 @@ public: return HasTcgen05 && PTXVersion >= 86; } + // TMA G2S copy with cta_group::1/2 support + bool hasCpAsyncBulkTensorCTAGroupSupport() const { + // TODO: Update/tidy-up after the family-conditional support arrives + return ((FullSmVersion == 1001 || FullSmVersion == 1011) && + PTXVersion >= 86) || + (FullSmVersion == 1031 && PTXVersion >= 88); + } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. Instead, it would assume that control flow // continued to the next instruction. The next instruction could be in the diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index b7bdca42d559..a17f11a680aa 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -307,9 +307,9 @@ define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace( ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) { -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr 
addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false, i32 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0) @@ -318,11 +318,11 @@ define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrs ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false, i32 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 0, i1 0) call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0) diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll new file mode 100644 index 000000000000..5cfa25dfe55f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -0,0 +1,435 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +declare void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d +define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1; +; 
CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d +define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], 
[%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d +define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; 
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d +define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, 
i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d +define void @test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, 
{%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d +define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, 
[test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 
%d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d +define void @test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; 
CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d +define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 
%mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll 
b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll new file mode 100644 index 000000000000..a7e6bec6aef1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -0,0 +1,435 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d +define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, 
ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d +define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], 
[%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr 
addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d +define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, 
%rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], 
%rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d +define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d +define void 
@test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 
[%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d +define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, 
[test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d +define void 
@test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, 
{%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 
[%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d +define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; 
CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: 
test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll new file mode 100644 index 000000000000..1c35fbead389 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll @@ -0,0 +1,15 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -o /dev/null 2>&1 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) writeonly, ptr addrspace(3), ptr readonly, i32, i16, i64, i1 immarg, i1 immarg, i32 immarg 
range(i32 0, 3)) + +define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { + ; CHECK: immarg value 3 out of range [0, 3) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 3) + + ; CHECK: immarg value -1 out of range [0, 3) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 -1) + + ret void +} From a8c6fb4cb8e686f733e022afc549bc085d1558f4 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 12 Jun 2025 11:53:32 +0200 Subject: [PATCH 0113/1322] [MemCpyOpt] Fix lifetime marker sizes in tests (NFC) As pointed out in https://github.com/llvm/llvm-project/pull/143782, these tests were specifying the size in bits instead of bytes. In order to preserve the intent of the tests, add a use of %src, which prevents stack-move optimization. These are supposed to test the handling of scoped alias metadata in call slot optimization. 
--- .../test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll | 7 +++++-- llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll index 989049ab67a0..840a5172561d 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll @@ -1,17 +1,20 @@ ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s +declare void @use(ptr) + ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains define i8 @test(i8 %input) { %tmp = alloca i8 %dst = alloca i8 %src = alloca i8 ; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]] - call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4 + call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4 store i8 %input, ptr %src call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0 - call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4 + call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4 %ret_value = load i8, ptr %dst + call void @use(ptr %src) ret i8 %ret_value } diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll index efdbdce401b7..601498e36a7a 100644 --- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll +++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll @@ -1,9 +1,11 @@ ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s +declare void @use(ptr) + ; Make sure callslot optimization merges alias.scope metadata correctly when it merges 
instructions. ; Merging here naively generates: ; call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3 -; call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0 +; call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0 ; ... ; !0 = !{!1} ; !1 = distinct !{!1, !2, !"callee1: %a"} @@ -18,12 +20,13 @@ define i8 @test(i8 %input) { %src = alloca i8 ; NOTE: we're matching the full line and looking for the lack of !alias.scope here ; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false) - call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3 + call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3 store i8 %input, ptr %src call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0 - call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3 + call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3 %ret_value = load i8, ptr %dst + call void @use(ptr %src) ret i8 %ret_value } From 5987f1ee5cc59a05961156c04010ab0f3c857628 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Thu, 12 Jun 2025 11:52:28 +0200 Subject: [PATCH 0114/1322] [InstCombine] Regenerate `narrow-switch.ll` test (NFC) `narrow-switch.ll` test has been regenerated via latest UTC using `--prefix-filecheck-ir-name _`, so as to avoid conflicts with scripted variable names. 
--- .../Transforms/InstCombine/narrow-switch.ll | 194 +++++++++++++----- 1 file changed, 148 insertions(+), 46 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/narrow-switch.ll b/llvm/test/Transforms/InstCombine/narrow-switch.ll index 05a30b910e5e..90f56a61fa41 100644 --- a/llvm/test/Transforms/InstCombine/narrow-switch.ll +++ b/llvm/test/Transforms/InstCombine/narrow-switch.ll @@ -1,15 +1,27 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name _ --version 5 ; Vary legal integer types in data layout. ; RUN: opt < %s -passes=instcombine -S -data-layout=n32 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32 ; RUN: opt < %s -passes=instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64 define i32 @positive1(i64 %a) { -; ALL-LABEL: @positive1( -; ALL: switch i32 -; ALL-NEXT: i32 10, label %return -; ALL-NEXT: i32 100, label %sw.bb1 -; ALL-NEXT: i32 1001, label %sw.bb2 +; ALL-LABEL: define i32 @positive1( +; ALL-SAME: i64 [[A:%.*]]) { +; ALL-NEXT: [[ENTRY:.*]]: +; ALL-NEXT: [[TRUNC:%.*]] = trunc i64 [[A]] to i32 +; ALL-NEXT: switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; ALL-NEXT: i32 10, label %[[RETURN:.*]] +; ALL-NEXT: i32 100, label %[[SW_BB1:.*]] +; ALL-NEXT: i32 1001, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB1]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_DEFAULT]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ] +; ALL-NEXT: ret i32 [[RETVAL_0]] ; entry: %and = and i64 %a, 4294967295 @@ -34,12 +46,24 @@ return: } define i32 @negative1(i64 %a) { -; ALL-LABEL: @negative1( -; ALL: switch i32 -; ALL-NEXT: i32 -10, label %return -; ALL-NEXT: i32 -100, label %sw.bb1 -; ALL-NEXT: 
i32 -1001, label %sw.bb2 +; ALL-LABEL: define i32 @negative1( +; ALL-SAME: i64 [[A:%.*]]) { +; ALL-NEXT: [[ENTRY:.*]]: +; ALL-NEXT: [[TRUNC:%.*]] = trunc i64 [[A]] to i32 +; ALL-NEXT: switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; ALL-NEXT: i32 -10, label %[[RETURN:.*]] +; ALL-NEXT: i32 -100, label %[[SW_BB1:.*]] +; ALL-NEXT: i32 -1001, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB1]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_DEFAULT]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ] +; ALL-NEXT: ret i32 [[RETVAL_0]] ; entry: %or = or i64 %a, -4294967296 @@ -67,12 +91,24 @@ return: ; assertion. define i32 @trunc72to68(i72 %a) { -; ALL-LABEL: @trunc72to68( -; ALL: switch i68 -; ALL-NEXT: i68 10, label %return -; ALL-NEXT: i68 100, label %sw.bb1 -; ALL-NEXT: i68 1001, label %sw.bb2 +; ALL-LABEL: define i32 @trunc72to68( +; ALL-SAME: i72 [[A:%.*]]) { +; ALL-NEXT: [[ENTRY:.*]]: +; ALL-NEXT: [[TRUNC:%.*]] = trunc i72 [[A]] to i68 +; ALL-NEXT: switch i68 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; ALL-NEXT: i68 10, label %[[RETURN:.*]] +; ALL-NEXT: i68 100, label %[[SW_BB1:.*]] +; ALL-NEXT: i68 1001, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB1]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_DEFAULT]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ] +; ALL-NEXT: ret i32 [[RETVAL_0]] ; entry: %and = and i72 %a, 295147905179352825855 @@ -103,15 +139,38 @@ return: ; because both are illegal. 
define void @trunc64to58(i64 %a) { -; ALL-LABEL: @trunc64to58( -; CHECK32: switch i58 -; CHECK32-NEXT: i58 0, label %sw.bb1 -; CHECK32-NEXT: i58 18717182647723699, label %sw.bb2 +; CHECK32-LABEL: define void @trunc64to58( +; CHECK32-SAME: i64 [[A:%.*]]) { +; CHECK32-NEXT: [[ENTRY:.*:]] +; CHECK32-NEXT: [[TMP0:%.*]] = trunc i64 [[A]] to i58 +; CHECK32-NEXT: [[TMP1:%.*]] = and i58 [[TMP0]], 15 +; CHECK32-NEXT: [[TRUNC:%.*]] = mul nuw i58 [[TMP1]], 18717182647723699 +; CHECK32-NEXT: switch i58 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; CHECK32-NEXT: i58 0, label %[[SW_BB1:.*]] +; CHECK32-NEXT: i58 18717182647723699, label %[[SW_BB2:.*]] ; CHECK32-NEXT: ] -; CHECK64: switch i64 -; CHECK64-NEXT: i64 0, label %sw.bb1 -; CHECK64-NEXT: i64 18717182647723699, label %sw.bb2 +; CHECK32: [[SW_BB1]]: +; CHECK32-NEXT: br label %[[SW_DEFAULT]] +; CHECK32: [[SW_BB2]]: +; CHECK32-NEXT: br label %[[SW_DEFAULT]] +; CHECK32: [[SW_DEFAULT]]: +; CHECK32-NEXT: ret void +; +; CHECK64-LABEL: define void @trunc64to58( +; CHECK64-SAME: i64 [[A:%.*]]) { +; CHECK64-NEXT: [[ENTRY:.*:]] +; CHECK64-NEXT: [[_TMP0:%.*]] = and i64 [[A]], 15 +; CHECK64-NEXT: [[TMP0:%.*]] = mul nuw nsw i64 [[_TMP0]], 18717182647723699 +; CHECK64-NEXT: switch i64 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +; CHECK64-NEXT: i64 0, label %[[SW_BB1:.*]] +; CHECK64-NEXT: i64 18717182647723699, label %[[SW_BB2:.*]] ; CHECK64-NEXT: ] +; CHECK64: [[SW_BB1]]: +; CHECK64-NEXT: br label %[[SW_DEFAULT]] +; CHECK64: [[SW_BB2]]: +; CHECK64-NEXT: br label %[[SW_DEFAULT]] +; CHECK64: [[SW_DEFAULT]]: +; CHECK64-NEXT: ret void ; entry: %tmp0 = and i64 %a, 15 @@ -136,18 +195,19 @@ sw.default: ; https://llvm.org/bugs/show_bug.cgi?id=31260 define i8 @PR31260(i8 %x) { -; ALL-LABEL: @PR31260( -; ALL-NEXT: entry: -; ALL-NEXT: [[T4:%.*]] = and i8 [[X:%.*]], 2 -; ALL-NEXT: switch i8 [[T4]], label [[EXIT:%.*]] [ -; ALL-NEXT: i8 0, label [[CASE126:%.*]] -; ALL-NEXT: i8 2, label [[CASE124:%.*]] +; ALL-LABEL: define i8 @PR31260( +; ALL-SAME: i8 [[X:%.*]]) 
{ +; ALL-NEXT: [[ENTRY:.*:]] +; ALL-NEXT: [[T4:%.*]] = and i8 [[X]], 2 +; ALL-NEXT: switch i8 [[T4]], label %[[EXIT:.*]] [ +; ALL-NEXT: i8 0, label %[[CASE126:.*]] +; ALL-NEXT: i8 2, label %[[CASE124:.*]] ; ALL-NEXT: ] -; ALL: exit: +; ALL: [[EXIT]]: ; ALL-NEXT: ret i8 1 -; ALL: case126: +; ALL: [[CASE126]]: ; ALL-NEXT: ret i8 3 -; ALL: case124: +; ALL: [[CASE124]]: ; ALL-NEXT: ret i8 5 ; entry: @@ -169,12 +229,33 @@ case124: ; Make sure the arithmetic evaluation of the switch ; condition is evaluated on the original type define i32 @trunc32to16(i32 %a0) #0 { -; ALL-LABEL: @trunc32to16( -; ALL: switch i16 -; ALL-NEXT: i16 63, label %sw.bb -; ALL-NEXT: i16 1, label %sw.bb1 -; ALL-NEXT: i16 100, label %sw.bb2 +; ALL-LABEL: define i32 @trunc32to16( +; ALL-SAME: i32 [[A0:%.*]]) { +; ALL-NEXT: [[ENTRY:.*:]] +; ALL-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +; ALL-NEXT: [[XOR:%.*]] = lshr i32 [[A0]], 16 +; ALL-NEXT: [[TMP0:%.*]] = trunc nuw i32 [[XOR]] to i16 +; ALL-NEXT: [[TRUNC:%.*]] = xor i16 [[TMP0]], 15784 +; ALL-NEXT: switch i16 [[TRUNC]], label %[[SW_EPILOG:.*]] [ +; ALL-NEXT: i16 63, label %[[SW_BB:.*]] +; ALL-NEXT: i16 1, label %[[SW_BB1:.*]] +; ALL-NEXT: i16 100, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB]]: +; ALL-NEXT: store i32 90, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN:.*]] +; ALL: [[SW_BB1]]: +; ALL-NEXT: store i32 91, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: store i32 92, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_EPILOG]]: +; ALL-NEXT: store i32 113, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RVAL:%.*]] = load i32, ptr [[RETVAL]], align 4 +; ALL-NEXT: ret i32 [[RVAL]] ; entry: %retval = alloca i32, align 4 @@ -182,9 +263,9 @@ entry: %shr = lshr i32 %xor, 16 %add = add i32 %shr, -917677090 switch i32 %add, label %sw.epilog [ - i32 -917677027, label %sw.bb - i32 -917677089, label %sw.bb1 - i32 -917676990, 
label %sw.bb2 + i32 -917677027, label %sw.bb + i32 -917677089, label %sw.bb1 + i32 -917676990, label %sw.bb2 ] sw.bb: ; preds = %entry @@ -219,11 +300,32 @@ declare i32 @goo() ; if original type is legal (i32 in this case) define void @PR29009() { -; ALL-LABEL: @PR29009( -; ALL: switch i32 -; ALL-NEXT: i32 0, label -; ALL-NEXT: i32 3, label +; ALL-LABEL: define void @PR29009() { +; ALL-NEXT: br label %[[BB1:.*]] +; ALL: [[BB1]]: +; ALL-NEXT: [[TMP2:%.*]] = load volatile i32, ptr @njob, align 4 +; ALL-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 0 +; ALL-NEXT: br i1 [[DOTNOT]], label %[[BB10:.*]], label %[[BB3:.*]] +; ALL: [[BB3]]: +; ALL-NEXT: [[TMP4:%.*]] = call i32 @goo() +; ALL-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 7 +; ALL-NEXT: switch i32 [[TMP5]], label %[[BB6:.*]] [ +; ALL-NEXT: i32 0, label %[[BB7:.*]] +; ALL-NEXT: i32 3, label %[[BB8:.*]] ; ALL-NEXT: ] +; ALL: [[BB6]]: +; ALL-NEXT: store i32 6, ptr @a, align 4 +; ALL-NEXT: br label %[[BB9:.*]] +; ALL: [[BB7]]: +; ALL-NEXT: store i32 1, ptr @a, align 4 +; ALL-NEXT: br label %[[BB9]] +; ALL: [[BB8]]: +; ALL-NEXT: store i32 2, ptr @a, align 4 +; ALL-NEXT: br label %[[BB9]] +; ALL: [[BB9]]: +; ALL-NEXT: br label %[[BB1]] +; ALL: [[BB10]]: +; ALL-NEXT: ret void ; br label %1 @@ -236,8 +338,8 @@ define void @PR29009() { %5 = call i32 @goo() %6 = and i32 %5, 7 switch i32 %6, label %7 [ - i32 0, label %8 - i32 3, label %9 + i32 0, label %8 + i32 3, label %9 ] ;