[libc++] Optimize std::for_each_n for segmented iterators (#135468)
This patch enhances the performance of `std::for_each_n` when used with segmented iterators, leading to significant performance improvements, summarized in the tables below. This addresses a subtask of https://github.com/llvm/llvm-project/issues/102817.
This commit is contained in:
@@ -70,6 +70,9 @@ Improvements and New Features
|
||||
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
|
||||
in C++23 and later.
|
||||
|
||||
- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
|
||||
up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
|
||||
|
||||
Deprecations and Removals
|
||||
-------------------------
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ set(files
|
||||
__algorithm/find_segment_if.h
|
||||
__algorithm/for_each.h
|
||||
__algorithm/for_each_n.h
|
||||
__algorithm/for_each_n_segment.h
|
||||
__algorithm/for_each_segment.h
|
||||
__algorithm/generate.h
|
||||
__algorithm/generate_n.h
|
||||
|
||||
@@ -10,20 +10,35 @@
|
||||
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
|
||||
#define _LIBCPP___ALGORITHM_FOR_EACH_N_H
|
||||
|
||||
#include <__algorithm/for_each.h>
|
||||
#include <__algorithm/for_each_n_segment.h>
|
||||
#include <__config>
|
||||
#include <__iterator/iterator_traits.h>
|
||||
#include <__iterator/segmented_iterator.h>
|
||||
#include <__type_traits/disjunction.h>
|
||||
#include <__type_traits/enable_if.h>
|
||||
#include <__type_traits/negation.h>
|
||||
#include <__utility/convert_to_integral.h>
|
||||
#include <__utility/move.h>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
# pragma GCC system_header
|
||||
#endif
|
||||
|
||||
_LIBCPP_PUSH_MACROS
|
||||
#include <__undef_macros>
|
||||
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
#if _LIBCPP_STD_VER >= 17
|
||||
|
||||
template <class _InputIterator, class _Size, class _Function>
|
||||
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
|
||||
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
|
||||
template <class _InputIterator,
|
||||
class _Size,
|
||||
class _Func,
|
||||
__enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
|
||||
_Or< _Not<__is_segmented_iterator<_InputIterator> >,
|
||||
_Not<__has_random_access_local_iterator<_InputIterator> > >::value,
|
||||
int> = 0>
|
||||
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
|
||||
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
|
||||
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
|
||||
_IntegralSize __n = __orig_n;
|
||||
while (__n > 0) {
|
||||
@@ -31,11 +46,51 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
|
||||
++__first;
|
||||
--__n;
|
||||
}
|
||||
return __first;
|
||||
return std::move(__first);
|
||||
}
|
||||
|
||||
#endif
|
||||
template <class _RandIter,
|
||||
class _Size,
|
||||
class _Func,
|
||||
__enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
|
||||
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
|
||||
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
|
||||
typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
|
||||
auto __last = __first + __n;
|
||||
std::__for_each(__first, __last, __f);
|
||||
return std::move(__last);
|
||||
}
|
||||
|
||||
#ifndef _LIBCPP_CXX03_LANG
|
||||
template <class _SegmentedIterator,
|
||||
class _Size,
|
||||
class _Func,
|
||||
__enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
|
||||
__is_segmented_iterator<_SegmentedIterator>::value &&
|
||||
__has_random_access_iterator_category<
|
||||
typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
|
||||
int> = 0>
|
||||
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
|
||||
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
|
||||
using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
|
||||
return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
|
||||
std::__for_each(__lfirst, __llast, __f);
|
||||
});
|
||||
}
|
||||
#endif // !_LIBCPP_CXX03_LANG
|
||||
|
||||
#if _LIBCPP_STD_VER >= 17
|
||||
|
||||
// Public std::for_each_n entry point. It forwards __first, __orig_n and __f to std::__for_each_n
// and returns the iterator that call produces. __f is received by value here and handed on as
// an lvalue.
template <class _InputIterator, class _Size, class _Function>
|
||||
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
|
||||
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
|
||||
return std::__for_each_n(__first, __orig_n, __f);
|
||||
}
|
||||
|
||||
#endif // _LIBCPP_STD_VER >= 17
|
||||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
_LIBCPP_POP_MACROS
|
||||
|
||||
#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_H
|
||||
|
||||
63
libcxx/include/__algorithm/for_each_n_segment.h
Normal file
63
libcxx/include/__algorithm/for_each_n_segment.h
Normal file
@@ -0,0 +1,63 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
|
||||
#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
|
||||
|
||||
#include <__config>
|
||||
#include <__iterator/iterator_traits.h>
|
||||
#include <__iterator/segmented_iterator.h>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
# pragma GCC system_header
|
||||
#endif
|
||||
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
// __for_each_n_segment optimizes linear iteration over segmented iterators. It processes a segmented
|
||||
// input range [__first, __first + __orig_n) by invoking the functor __func once per segment, passing the
|
||||
// local iterator pair that bounds the part of that segment lying inside the input range. The return value
|
||||
// of __func is ignored, and the function returns an iterator one past the last processed element.
|
||||
|
||||
template <class _SegmentedIterator, class _Size, class _Functor>
|
||||
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
|
||||
__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
|
||||
static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
|
||||
__has_random_access_iterator_category<
|
||||
typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
|
||||
"__for_each_n_segment only works with segmented iterators with random-access local iterators");
|
||||
if (__orig_n <= 0)
|
||||
return __first;
|
||||
|
||||
using _Traits = __segmented_iterator_traits<_SegmentedIterator>;
|
||||
using __local_iter_t = typename _Traits::__local_iterator;
|
||||
using __difference_t = typename std::iterator_traits<__local_iter_t>::difference_type;
|
||||
__difference_t __n = __orig_n;
|
||||
auto __seg = _Traits::__segment(__first);
|
||||
auto __local_first = _Traits::__local(__first);
|
||||
__local_iter_t __local_last;
|
||||
|
||||
while (__n > 0) {
|
||||
__local_last = _Traits::__end(__seg);
|
||||
auto __seg_size = __local_last - __local_first;
|
||||
if (__n <= __seg_size) {
|
||||
__local_last = __local_first + __n;
|
||||
__func(__local_first, __local_last);
|
||||
break;
|
||||
}
|
||||
__func(__local_first, __local_last);
|
||||
__n -= __seg_size;
|
||||
__local_first = _Traits::__begin(++__seg);
|
||||
}
|
||||
|
||||
return _Traits::__compose(__seg, __local_last);
|
||||
}
|
||||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
|
||||
@@ -42,6 +42,7 @@
|
||||
|
||||
#include <__config>
|
||||
#include <__cstddef/size_t.h>
|
||||
#include <__iterator/iterator_traits.h>
|
||||
#include <__type_traits/integral_constant.h>
|
||||
|
||||
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
||||
@@ -74,6 +75,11 @@ struct __has_specialization<_Tp, sizeof(_Tp) * 0> : true_type {};
|
||||
template <class _Iterator>
|
||||
using __is_segmented_iterator _LIBCPP_NODEBUG = __has_specialization<__segmented_iterator_traits<_Iterator> >;
|
||||
|
||||
// Trait reporting whether the local (per-segment) iterator type of _SegmentedIterator has a
// random-access iterator category.
// NOTE(review): naming __segmented_iterator_traits<...>::__local_iterator presumably requires
// _SegmentedIterator to be a segmented iterator -- confirm before using this on arbitrary iterators.
template <class _SegmentedIterator>
|
||||
struct __has_random_access_local_iterator
|
||||
: __has_random_access_iterator_category<
|
||||
typename __segmented_iterator_traits< _SegmentedIterator >::__local_iterator > {};
|
||||
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
|
||||
#endif // _LIBCPP___SEGMENTED_ITERATOR_H
|
||||
|
||||
@@ -437,6 +437,7 @@ module std [system] {
|
||||
module find_segment_if { header "__algorithm/find_segment_if.h" }
|
||||
module find { header "__algorithm/find.h" }
|
||||
module for_each_n { header "__algorithm/for_each_n.h" }
|
||||
module for_each_n_segment { header "__algorithm/for_each_n_segment.h" }
|
||||
module for_each_segment { header "__algorithm/for_each_segment.h" }
|
||||
module for_each { header "__algorithm/for_each.h" }
|
||||
module generate_n { header "__algorithm/generate_n.h" }
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// UNSUPPORTED: c++03, c++11, c++14, c++17
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <deque>
|
||||
#include <list>
|
||||
#include <ranges>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
|
||||
|
||||
// std::for_each_n
|
||||
{
|
||||
auto bm = []<class Container>(std::string name, auto for_each_n) {
|
||||
using ElemType = typename Container::value_type;
|
||||
benchmark::RegisterBenchmark(
|
||||
name,
|
||||
[for_each_n](auto& st) {
|
||||
std::size_t const n = st.range(0);
|
||||
Container c(n, 1);
|
||||
auto first = c.begin();
|
||||
|
||||
for ([[maybe_unused]] auto _ : st) {
|
||||
benchmark::DoNotOptimize(c);
|
||||
auto result = for_each_n(first, n, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
|
||||
benchmark::DoNotOptimize(result);
|
||||
}
|
||||
})
|
||||
->Arg(8)
|
||||
->Arg(32)
|
||||
->Arg(50) // non power-of-two
|
||||
->Arg(1024)
|
||||
->Arg(4096)
|
||||
->Arg(8192)
|
||||
->Arg(1 << 14)
|
||||
->Arg(1 << 16)
|
||||
->Arg(1 << 18);
|
||||
};
|
||||
bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
|
||||
bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
|
||||
bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
|
||||
}
|
||||
|
||||
// std::for_each_n for join_view
|
||||
{
|
||||
auto bm = []<class Container>(std::string name, auto for_each_n) {
|
||||
using C1 = typename Container::value_type;
|
||||
using ElemType = typename C1::value_type;
|
||||
benchmark::RegisterBenchmark(
|
||||
name,
|
||||
[for_each_n](auto& st) {
|
||||
std::size_t const size = st.range(0);
|
||||
std::size_t const seg_size = 256;
|
||||
std::size_t const segments = (size + seg_size - 1) / seg_size;
|
||||
Container c(segments);
|
||||
for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
|
||||
c[i].resize(std::min(seg_size, n), ElemType(1));
|
||||
}
|
||||
|
||||
auto view = c | std::views::join;
|
||||
auto first = view.begin();
|
||||
|
||||
for ([[maybe_unused]] auto _ : st) {
|
||||
benchmark::DoNotOptimize(c);
|
||||
auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
|
||||
benchmark::DoNotOptimize(result);
|
||||
}
|
||||
})
|
||||
->Arg(8)
|
||||
->Arg(32)
|
||||
->Arg(50) // non power-of-two
|
||||
->Arg(1024)
|
||||
->Arg(4096)
|
||||
->Arg(8192)
|
||||
->Arg(1 << 14)
|
||||
->Arg(1 << 16)
|
||||
->Arg(1 << 18);
|
||||
};
|
||||
bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
|
||||
}
|
||||
|
||||
benchmark::Initialize(&argc, argv);
|
||||
benchmark::RunSpecifiedBenchmarks();
|
||||
benchmark::Shutdown();
|
||||
return 0;
|
||||
}
|
||||
@@ -13,69 +13,128 @@
|
||||
// constexpr InputIterator // constexpr after C++17
|
||||
// for_each_n(InputIterator first, Size n, Function f);
|
||||
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <deque>
|
||||
#include <functional>
|
||||
#include <iterator>
|
||||
#include <list>
|
||||
#include <ranges>
|
||||
#include <vector>
|
||||
|
||||
#include "test_macros.h"
|
||||
#include "test_iterators.h"
|
||||
|
||||
#if TEST_STD_VER > 17
|
||||
// Adds 2 to each of four ints through std::for_each_n and reports whether both the returned
// iterator and the resulting values are as expected.
TEST_CONSTEXPR bool test_constexpr() {
  int values[]            = {1, 3, 6, 7};
  int wanted[]            = {3, 5, 8, 9};
  const std::size_t count = 4;

  auto past_last = std::for_each_n(std::begin(values), count, [](int& a) { a += 2; });
  if (!(past_last == (std::begin(values) + count)))
    return false;
  return std::equal(std::begin(values), std::end(values), std::begin(wanted));
}
|
||||
#endif
|
||||
|
||||
struct for_each_test
|
||||
{
|
||||
for_each_test(int c) : count(c) {}
|
||||
int count;
|
||||
void operator()(int& i) {++i; ++count;}
|
||||
struct for_each_test {
|
||||
TEST_CONSTEXPR for_each_test(int c) : count(c) {}
|
||||
int count;
|
||||
TEST_CONSTEXPR_CXX14 void operator()(int& i) {
|
||||
++i;
|
||||
++count;
|
||||
}
|
||||
};
|
||||
|
||||
int main(int, char**)
|
||||
{
|
||||
// Callable that checks a deque is traversed in index order: each call asserts that the element
// it receives is the one stored at the current index, then advances that index.
struct deque_test {
  std::deque<int>* d_; // deque being traversed
  int* i_;             // index of the element expected on the next call

  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}

  void operator()(int& v) {
    int& next = *i_;
    assert(&v == &d_->operator[](next));
    next += 1;
  }
};
|
||||
|
||||
/*TEST_CONSTEXPR_CXX26*/
|
||||
void test_deque_and_join_view_iterators() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
|
||||
{ // Verify that segmented deque iterators work properly
|
||||
int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
|
||||
for (const int size : sizes) {
|
||||
std::deque<int> d(size);
|
||||
int index = 0;
|
||||
|
||||
std::for_each_n(d.begin(), d.size(), deque_test(d, index));
|
||||
}
|
||||
}
|
||||
#if TEST_STD_VER >= 20
|
||||
{ // Verify that join_view of lists work properly. Note that join_view of (non-random access) lists does
|
||||
// not produce segmented iterators.
|
||||
std::list<std::list<int>> lst = {{}, {0}, {1, 2}, {}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
|
||||
auto v = lst | std::views::join;
|
||||
std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_CONSTEXPR_CXX20 bool test() {
|
||||
{
|
||||
typedef cpp17_input_iterator<int*> Iter;
|
||||
int ia[] = {0, 1, 2, 3, 4, 5};
|
||||
const unsigned s = sizeof(ia)/sizeof(ia[0]);
|
||||
int ia[] = {0, 1, 2, 3, 4, 5};
|
||||
const unsigned s = sizeof(ia) / sizeof(ia[0]);
|
||||
|
||||
{
|
||||
auto f = for_each_test(0);
|
||||
Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
|
||||
assert(it == Iter(ia));
|
||||
assert(f.count == 0);
|
||||
unsigned count = 0;
|
||||
Iter it = std::for_each_n(Iter(ia), 0, [&count](int& i) {
|
||||
++i;
|
||||
++count;
|
||||
});
|
||||
assert(it == Iter(ia));
|
||||
assert(count == 0);
|
||||
}
|
||||
|
||||
{
|
||||
auto f = for_each_test(0);
|
||||
Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
|
||||
|
||||
assert(it == Iter(ia+s));
|
||||
assert(f.count == s);
|
||||
for (unsigned i = 0; i < s; ++i)
|
||||
assert(ia[i] == static_cast<int>(i+1));
|
||||
unsigned count = 0;
|
||||
Iter it = std::for_each_n(Iter(ia), s, [&count](int& i) {
|
||||
++i;
|
||||
++count;
|
||||
});
|
||||
assert(it == Iter(ia + s));
|
||||
assert(count == s);
|
||||
for (unsigned i = 0; i < s; ++i)
|
||||
assert(ia[i] == static_cast<int>(i + 1));
|
||||
}
|
||||
|
||||
{
|
||||
auto f = for_each_test(0);
|
||||
Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
|
||||
|
||||
assert(it == Iter(ia+1));
|
||||
assert(f.count == 1);
|
||||
for (unsigned i = 0; i < 1; ++i)
|
||||
assert(ia[i] == static_cast<int>(i+2));
|
||||
unsigned count = 0;
|
||||
Iter it = std::for_each_n(Iter(ia), 1, [&count](int& i) {
|
||||
++i;
|
||||
++count;
|
||||
});
|
||||
assert(it == Iter(ia + 1));
|
||||
assert(count == 1);
|
||||
for (unsigned i = 0; i < 1; ++i)
|
||||
assert(ia[i] == static_cast<int>(i + 2));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int ia[] = {1, 3, 6, 7};
|
||||
int expected[] = {3, 5, 8, 9};
|
||||
const std::size_t N = 4;
|
||||
|
||||
auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
|
||||
assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
|
||||
}
|
||||
|
||||
if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
|
||||
test_deque_and_join_view_iterators();
|
||||
|
||||
#if TEST_STD_VER >= 20
|
||||
{ // join_views of (random-access) vectors yield segmented iterators
|
||||
std::vector<std::vector<int>> vec = {{}, {0}, {1, 2}, {}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
|
||||
auto v = vec | std::views::join;
|
||||
std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
|
||||
}
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int, char**) {
|
||||
assert(test());
|
||||
#if TEST_STD_VER > 17
|
||||
static_assert(test_constexpr());
|
||||
static_assert(test());
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
||||
Reference in New Issue
Block a user