Files
clang-p2996/libcxx/include/__string/constexpr_c_functions.h
Nikolas Klauser d07fdf9779 [libc++] Optimize lexicographical_compare (#65279)
If the comparison operation is equivalent to < and that is a total
order, we know that we can use equality comparison on that type instead
to extract some information. Furthermore, if equality comparison on that
type is trivial, the user can't observe that we're calling it. So
instead of using the user-provided total order, we use std::mismatch,
which uses equality comparison (and is vertorized). Additionally, if the
type is trivially lexicographically comparable, we can go one step
further and use std::memcmp directly instead of calling std::mismatch.

Benchmarks:
```
-------------------------------------------------------------------------------------
Benchmark                                                         old             new
-------------------------------------------------------------------------------------
bm_lexicographical_compare<unsigned char>/1                   1.17 ns         2.34 ns
bm_lexicographical_compare<unsigned char>/2                   1.64 ns         2.57 ns
bm_lexicographical_compare<unsigned char>/3                   2.23 ns         2.58 ns
bm_lexicographical_compare<unsigned char>/4                   2.82 ns         2.57 ns
bm_lexicographical_compare<unsigned char>/5                   3.34 ns         2.11 ns
bm_lexicographical_compare<unsigned char>/6                   3.94 ns         2.21 ns
bm_lexicographical_compare<unsigned char>/7                   4.56 ns         2.11 ns
bm_lexicographical_compare<unsigned char>/8                   5.25 ns         2.11 ns
bm_lexicographical_compare<unsigned char>/16                  9.88 ns         2.11 ns
bm_lexicographical_compare<unsigned char>/64                  38.9 ns         2.36 ns
bm_lexicographical_compare<unsigned char>/512                  317 ns         6.54 ns
bm_lexicographical_compare<unsigned char>/4096                2517 ns         41.4 ns
bm_lexicographical_compare<unsigned char>/32768              20052 ns          488 ns
bm_lexicographical_compare<unsigned char>/262144            159579 ns         4409 ns
bm_lexicographical_compare<unsigned char>/1048576           640456 ns        20342 ns
bm_lexicographical_compare<signed char>/1                     1.18 ns         2.37 ns
bm_lexicographical_compare<signed char>/2                     1.65 ns         2.60 ns
bm_lexicographical_compare<signed char>/3                     2.23 ns         2.83 ns
bm_lexicographical_compare<signed char>/4                     2.81 ns         3.06 ns
bm_lexicographical_compare<signed char>/5                     3.35 ns         3.30 ns
bm_lexicographical_compare<signed char>/6                     3.90 ns         3.99 ns
bm_lexicographical_compare<signed char>/7                     4.56 ns         3.78 ns
bm_lexicographical_compare<signed char>/8                     5.20 ns         4.02 ns
bm_lexicographical_compare<signed char>/16                    9.80 ns         6.21 ns
bm_lexicographical_compare<signed char>/64                    39.0 ns         3.16 ns
bm_lexicographical_compare<signed char>/512                    318 ns         7.58 ns
bm_lexicographical_compare<signed char>/4096                  2514 ns         47.4 ns
bm_lexicographical_compare<signed char>/32768                20096 ns          504 ns
bm_lexicographical_compare<signed char>/262144              156617 ns         4146 ns
bm_lexicographical_compare<signed char>/1048576             624265 ns        19810 ns
bm_lexicographical_compare<int>/1                             1.15 ns         2.12 ns
bm_lexicographical_compare<int>/2                             1.60 ns         2.36 ns
bm_lexicographical_compare<int>/3                             2.21 ns         2.59 ns
bm_lexicographical_compare<int>/4                             2.74 ns         2.83 ns
bm_lexicographical_compare<int>/5                             3.26 ns         3.06 ns
bm_lexicographical_compare<int>/6                             3.81 ns         4.53 ns
bm_lexicographical_compare<int>/7                             4.41 ns         4.72 ns
bm_lexicographical_compare<int>/8                             5.08 ns         2.36 ns
bm_lexicographical_compare<int>/16                            9.54 ns         3.08 ns
bm_lexicographical_compare<int>/64                            37.8 ns         4.71 ns
bm_lexicographical_compare<int>/512                            309 ns         24.6 ns
bm_lexicographical_compare<int>/4096                          2422 ns          204 ns
bm_lexicographical_compare<int>/32768                        19362 ns         1947 ns
bm_lexicographical_compare<int>/262144                      155727 ns        19793 ns
bm_lexicographical_compare<int>/1048576                     623614 ns        80180 ns
bm_ranges_lexicographical_compare<unsigned char>/1            1.07 ns         2.35 ns
bm_ranges_lexicographical_compare<unsigned char>/2            1.72 ns         2.13 ns
bm_ranges_lexicographical_compare<unsigned char>/3            2.46 ns         2.12 ns
bm_ranges_lexicographical_compare<unsigned char>/4            3.17 ns         2.12 ns
bm_ranges_lexicographical_compare<unsigned char>/5            3.86 ns         2.12 ns
bm_ranges_lexicographical_compare<unsigned char>/6            4.55 ns         2.12 ns
bm_ranges_lexicographical_compare<unsigned char>/7            5.25 ns         2.12 ns
bm_ranges_lexicographical_compare<unsigned char>/8            5.95 ns         2.13 ns
bm_ranges_lexicographical_compare<unsigned char>/16           11.7 ns         2.13 ns
bm_ranges_lexicographical_compare<unsigned char>/64           45.5 ns         2.36 ns
bm_ranges_lexicographical_compare<unsigned char>/512           366 ns         6.35 ns
bm_ranges_lexicographical_compare<unsigned char>/4096         2886 ns         40.9 ns
bm_ranges_lexicographical_compare<unsigned char>/32768       23054 ns          489 ns
bm_ranges_lexicographical_compare<unsigned char>/262144     185302 ns         4339 ns
bm_ranges_lexicographical_compare<unsigned char>/1048576    741576 ns        19430 ns
bm_ranges_lexicographical_compare<signed char>/1              1.10 ns         2.12 ns
bm_ranges_lexicographical_compare<signed char>/2              1.66 ns         2.35 ns
bm_ranges_lexicographical_compare<signed char>/3              2.23 ns         2.58 ns
bm_ranges_lexicographical_compare<signed char>/4              2.82 ns         2.82 ns
bm_ranges_lexicographical_compare<signed char>/5              3.34 ns         3.06 ns
bm_ranges_lexicographical_compare<signed char>/6              3.92 ns         3.99 ns
bm_ranges_lexicographical_compare<signed char>/7              4.64 ns         4.10 ns
bm_ranges_lexicographical_compare<signed char>/8              5.21 ns         4.61 ns
bm_ranges_lexicographical_compare<signed char>/16             9.79 ns         7.42 ns
bm_ranges_lexicographical_compare<signed char>/64             38.9 ns         2.93 ns
bm_ranges_lexicographical_compare<signed char>/512             317 ns         7.31 ns
bm_ranges_lexicographical_compare<signed char>/4096           2500 ns         47.5 ns
bm_ranges_lexicographical_compare<signed char>/32768         19940 ns          496 ns
bm_ranges_lexicographical_compare<signed char>/262144       159166 ns         4393 ns
bm_ranges_lexicographical_compare<signed char>/1048576      638206 ns        19786 ns
bm_ranges_lexicographical_compare<int>/1                      1.10 ns         2.12 ns
bm_ranges_lexicographical_compare<int>/2                      1.64 ns         3.04 ns
bm_ranges_lexicographical_compare<int>/3                      2.23 ns         2.58 ns
bm_ranges_lexicographical_compare<int>/4                      2.81 ns         2.81 ns
bm_ranges_lexicographical_compare<int>/5                      3.35 ns         3.05 ns
bm_ranges_lexicographical_compare<int>/6                      3.94 ns         4.60 ns
bm_ranges_lexicographical_compare<int>/7                      4.60 ns         4.81 ns
bm_ranges_lexicographical_compare<int>/8                      5.19 ns         2.35 ns
bm_ranges_lexicographical_compare<int>/16                     9.85 ns         2.87 ns
bm_ranges_lexicographical_compare<int>/64                     38.9 ns         4.70 ns
bm_ranges_lexicographical_compare<int>/512                     318 ns         24.5 ns
bm_ranges_lexicographical_compare<int>/4096                   2494 ns          202 ns
bm_ranges_lexicographical_compare<int>/32768                 20000 ns         1939 ns
bm_ranges_lexicographical_compare<int>/262144               160433 ns        19730 ns
bm_ranges_lexicographical_compare<int>/1048576              642636 ns        80760 ns
```
2024-08-04 10:02:43 +02:00

235 lines
9.2 KiB
C++

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP___STRING_CONSTEXPR_C_FUNCTIONS_H
#define _LIBCPP___STRING_CONSTEXPR_C_FUNCTIONS_H
#include <__config>
#include <__memory/addressof.h>
#include <__memory/construct_at.h>
#include <__type_traits/datasizeof.h>
#include <__type_traits/is_always_bitcastable.h>
#include <__type_traits/is_assignable.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_constructible.h>
#include <__type_traits/is_equality_comparable.h>
#include <__type_traits/is_same.h>
#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/is_trivially_lexicographically_comparable.h>
#include <__type_traits/remove_cv.h>
#include <__utility/is_pointer_in_range.h>
#include <cstddef>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
// Type used to encode that a function takes an integer that represents a number
// of elements as opposed to a number of bytes.
enum class __element_count : size_t {};
template <class _Tp>
inline const bool __is_char_type = false;
template <>
inline const bool __is_char_type<char> = true;
#ifndef _LIBCPP_HAS_NO_CHAR8_T
template <>
inline const bool __is_char_type<char8_t> = true;
#endif
template <class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_strlen(const _Tp* __str) _NOEXCEPT {
static_assert(__is_char_type<_Tp>, "__constexpr_strlen only works with char and char8_t");
// GCC currently doesn't support __builtin_strlen for heap-allocated memory during constant evaluation.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70816
if (__libcpp_is_constant_evaluated()) {
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_COMPILER_CLANG_BASED)
if constexpr (is_same_v<_Tp, char>)
return __builtin_strlen(__str);
#endif
size_t __i = 0;
for (; __str[__i] != '\0'; ++__i)
;
return __i;
}
return __builtin_strlen(reinterpret_cast<const char*>(__str));
}
// Because of __is_trivially_lexicographically_comparable_v we know that comparing the object representations is
// equivalent to a std::memcmp. Since we have multiple objects contiguously in memory, we can call memcmp once instead
// of invoking it on every object individually.
template <class _Tp, class _Up>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int
__constexpr_memcmp(const _Tp* __lhs, const _Up* __rhs, __element_count __n) {
static_assert(__is_trivially_lexicographically_comparable_v<_Tp, _Up>,
"_Tp and _Up have to be trivially lexicographically comparable");
auto __count = static_cast<size_t>(__n);
if (__libcpp_is_constant_evaluated()) {
#ifdef _LIBCPP_COMPILER_CLANG_BASED
if (sizeof(_Tp) == 1 && !is_same<_Tp, bool>::value)
return __builtin_memcmp(__lhs, __rhs, __count * sizeof(_Tp));
#endif
while (__count != 0) {
if (*__lhs < *__rhs)
return -1;
if (*__rhs < *__lhs)
return 1;
--__count;
++__lhs;
++__rhs;
}
return 0;
} else {
return __builtin_memcmp(__lhs, __rhs, __count * sizeof(_Tp));
}
}
// Because of __libcpp_is_trivially_equality_comparable we know that comparing the object representations is equivalent
// to a std::memcmp(...) == 0. Since we have multiple objects contiguously in memory, we can call memcmp once instead
// of invoking it on every object individually.
template <class _Tp, class _Up>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool
__constexpr_memcmp_equal(const _Tp* __lhs, const _Up* __rhs, __element_count __n) {
static_assert(__libcpp_is_trivially_equality_comparable<_Tp, _Up>::value,
"_Tp and _Up have to be trivially equality comparable");
auto __count = static_cast<size_t>(__n);
if (__libcpp_is_constant_evaluated()) {
#ifdef _LIBCPP_COMPILER_CLANG_BASED
if (sizeof(_Tp) == 1 && is_integral<_Tp>::value && !is_same<_Tp, bool>::value)
return __builtin_memcmp(__lhs, __rhs, __count * sizeof(_Tp)) == 0;
#endif
while (__count != 0) {
if (*__lhs != *__rhs)
return false;
--__count;
++__lhs;
++__rhs;
}
return true;
} else {
return ::__builtin_memcmp(__lhs, __rhs, __count * sizeof(_Tp)) == 0;
}
}
template <class _Tp, class _Up>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_memchr(_Tp* __str, _Up __value, size_t __count) {
static_assert(sizeof(_Tp) == 1 && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value,
"Calling memchr on non-trivially equality comparable types is unsafe.");
if (__libcpp_is_constant_evaluated()) {
// use __builtin_char_memchr to optimize constexpr evaluation if we can
#if _LIBCPP_STD_VER >= 17 && __has_builtin(__builtin_char_memchr)
if constexpr (is_same_v<remove_cv_t<_Tp>, char> && is_same_v<remove_cv_t<_Up>, char>)
return __builtin_char_memchr(__str, __value, __count);
#endif
for (; __count; --__count) {
if (*__str == __value)
return __str;
++__str;
}
return nullptr;
} else {
char __value_buffer = 0;
__builtin_memcpy(&__value_buffer, &__value, sizeof(char));
return static_cast<_Tp*>(__builtin_memchr(__str, __value_buffer, __count));
}
}
// This function performs an assignment to an existing, already alive TriviallyCopyable object
// from another TriviallyCopyable object.
//
// It basically works around the fact that TriviallyCopyable objects are not required to be
// syntactically copy/move constructible or copy/move assignable. Technically, only one of the
// four operations is required to be syntactically valid -- but at least one definitely has to
// be valid.
//
// This is necessary in order to implement __constexpr_memmove below in a way that mirrors as
// closely as possible what the compiler's __builtin_memmove is able to do.
template <class _Tp, class _Up, __enable_if_t<is_assignable<_Tp&, _Up const&>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up const& __src) {
__dest = __src;
return __dest;
}
// clang-format off
template <class _Tp, class _Up, __enable_if_t<!is_assignable<_Tp&, _Up const&>::value &&
is_assignable<_Tp&, _Up&&>::value, int> = 0>
// clang-format on
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up& __src) {
__dest =
static_cast<_Up&&>(__src); // this is safe, we're not actually moving anything since the assignment is trivial
return __dest;
}
// clang-format off
template <class _Tp, class _Up, __enable_if_t<!is_assignable<_Tp&, _Up const&>::value &&
!is_assignable<_Tp&, _Up&&>::value &&
is_constructible<_Tp, _Up const&>::value, int> = 0>
// clang-format on
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up const& __src) {
// _Tp is trivially destructible, so we don't need to call its destructor to end the lifetime of the object
// that was there previously
std::__construct_at(std::addressof(__dest), __src);
return __dest;
}
// clang-format off
template <class _Tp, class _Up, __enable_if_t<!is_assignable<_Tp&, _Up const&>::value &&
!is_assignable<_Tp&, _Up&&>::value &&
!is_constructible<_Tp, _Up const&>::value &&
is_constructible<_Tp, _Up&&>::value, int> = 0>
// clang-format on
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp& __assign_trivially_copyable(_Tp& __dest, _Up& __src) {
// _Tp is trivially destructible, so we don't need to call its destructor to end the lifetime of the object
// that was there previously
std::__construct_at(
std::addressof(__dest),
static_cast<_Up&&>(__src)); // this is safe, we're not actually moving anything since the constructor is trivial
return __dest;
}
template <class _Tp, class _Up, __enable_if_t<__is_always_bitcastable<_Up, _Tp>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
__constexpr_memmove(_Tp* __dest, _Up* __src, __element_count __n) {
size_t __count = static_cast<size_t>(__n);
if (__libcpp_is_constant_evaluated()) {
#ifdef _LIBCPP_COMPILER_CLANG_BASED
if (is_same<__remove_cv_t<_Tp>, __remove_cv_t<_Up> >::value) {
::__builtin_memmove(__dest, __src, __count * sizeof(_Tp));
return __dest;
}
#endif
if (std::__is_pointer_in_range(__src, __src + __count, __dest)) {
for (; __count > 0; --__count)
std::__assign_trivially_copyable(__dest[__count - 1], __src[__count - 1]);
} else {
for (size_t __i = 0; __i != __count; ++__i)
std::__assign_trivially_copyable(__dest[__i], __src[__i]);
}
} else if (__count > 0) {
::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __datasizeof_v<_Tp>);
}
return __dest;
}
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___STRING_CONSTEXPR_C_FUNCTIONS_H