[libc++] Optimize std::for_each_n for segmented iterators (#135468)

This patch optimizes `std::for_each_n` for segmented iterators, yielding
significant performance improvements, summarized in the tables below.
This addresses a subtask of
https://github.com/llvm/llvm-project/issues/102817.
This commit is contained in:
Peng Liu
2025-05-21 12:10:50 -04:00
committed by GitHub
parent 5a3776af52
commit 09c266b75d
8 changed files with 335 additions and 49 deletions

View File

@@ -70,6 +70,9 @@ Improvements and New Features
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
in C++23 and later.
- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::ranges::join_view<std::vector<std::vector<short>>>`` iterators.
Deprecations and Removals
-------------------------

View File

@@ -25,6 +25,7 @@ set(files
__algorithm/find_segment_if.h
__algorithm/for_each.h
__algorithm/for_each_n.h
__algorithm/for_each_n_segment.h
__algorithm/for_each_segment.h
__algorithm/generate.h
__algorithm/generate_n.h

View File

@@ -10,20 +10,35 @@
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
#define _LIBCPP___ALGORITHM_FOR_EACH_N_H
#include <__algorithm/for_each.h>
#include <__algorithm/for_each_n_segment.h>
#include <__config>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__type_traits/disjunction.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/negation.h>
#include <__utility/convert_to_integral.h>
#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER >= 17
template <class _InputIterator, class _Size, class _Function>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
template <class _InputIterator,
class _Size,
class _Func,
__enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
_Or< _Not<__is_segmented_iterator<_InputIterator> >,
_Not<__has_random_access_local_iterator<_InputIterator> > >::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
_IntegralSize __n = __orig_n;
while (__n > 0) {
@@ -31,11 +46,51 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
++__first;
--__n;
}
return __first;
return std::move(__first);
}
#endif
// Random-access overload of __for_each_n.
//
// Selected when _RandIter has a random-access iterator category. The end of the range is computed
// up front as __first + __n and the work is delegated to std::__for_each on [__first, __last).
//
// __first   start of the range
// __orig_n  number of elements to visit; converted to the iterator's difference_type
// __f       callable, taken by reference and passed through to std::__for_each
// Returns the iterator one past the last visited element (__first + __n).
//
// NOTE(review): unlike the counting loop (`while (__n > 0)`) and __for_each_n_segment
// (`if (__orig_n <= 0) return`), this overload has no guard for a non-positive count: a negative
// __orig_n would place __last before __first. Callers are presumably required to pass n >= 0 — confirm.
template <class _RandIter,
          class _Size,
          class _Func,
          __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
  typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
  auto __last                                                   = __first + __n;
  std::__for_each(__first, __last, __f);
  // Return the local by name: wrapping it in std::move would inhibit copy elision (NRVO) and
  // gains nothing, since a returned local is already treated as an rvalue.
  return __last;
}
#ifndef _LIBCPP_CXX03_LANG
// Segmented overload of __for_each_n.
//
// Selected for iterators that are not random-access themselves but are segmented iterators whose
// local (per-segment) iterator is random-access. The traversal over segments is delegated to
// std::__for_each_n_segment, which hands each local sub-range to the callback below; the callback
// applies __f to that sub-range through std::__for_each.
//
// Returns whatever std::__for_each_n_segment returns for (__first, __orig_n).
template <class _SegmentedIterator,
          class _Size,
          class _Func,
          __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
                            __is_segmented_iterator<_SegmentedIterator>::value &&
                            __has_random_access_iterator_category<
                                typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
                        int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
  typedef __segmented_iterator_traits<_SegmentedIterator> _Traits;
  typedef typename _Traits::__local_iterator _LocalIter;

  // Per-segment worker: __f is captured by reference so every segment uses the same callable object.
  auto __visit_chunk = [&__f](_LocalIter __chunk_first, _LocalIter __chunk_last) {
    std::__for_each(__chunk_first, __chunk_last, __f);
  };
  return std::__for_each_n_segment(__first, __orig_n, __visit_chunk);
}
#endif // !_LIBCPP_CXX03_LANG
#if _LIBCPP_STD_VER >= 17
// std::for_each_n: public entry point (compiled only when _LIBCPP_STD_VER >= 17).
// Forwards to the internal std::__for_each_n overload set and returns its result.
//   __first   iterator to the first element to visit
//   __orig_n  number of elements to visit
//   __f       callable; received by value here and passed on as an lvalue, which the
//             __for_each_n overloads bind by reference (_Func&)
template <class _InputIterator, class _Size, class _Function>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
return std::__for_each_n(__first, __orig_n, __f);
}
#endif // _LIBCPP_STD_VER >= 17
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_H

View File

@@ -0,0 +1,63 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
#include <__config>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
// __for_each_n_segment optimizes linear iteration over segmented iterators. It walks the segmented
// input range [__first, __first + __n) one segment at a time and calls __func(__lfirst, __llast)
// once per segment with the local sub-range that falls inside the input range (the callback receives
// a pair of local iterators, not individual elements).
// The return value of __func is ignored, and the function returns an iterator pointing to one past the
// last processed element in the input range. If __orig_n is not positive, __first is returned unchanged
// and __func is never called.
template <class _SegmentedIterator, class _Size, class _Functor>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
  static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
                    __has_random_access_iterator_category<
                        typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
                "__for_each_n_segment only works with segmented iterators with random-access local iterators");
  if (__orig_n <= 0)
    return __first;

  using _Traits        = __segmented_iterator_traits<_SegmentedIterator>;
  using __local_iter_t = typename _Traits::__local_iterator;
  using __difference_t = typename std::iterator_traits<__local_iter_t>::difference_type;
  __difference_t __n   = __orig_n;
  auto __seg           = _Traits::__segment(__first);
  auto __local_first   = _Traits::__local(__first);
  // Start from a defined value instead of leaving the iterator uninitialized: a variable definition
  // without initialization is not permitted in a constexpr function before C++20, and this keeps the
  // final __compose well-defined (yielding __first's position) should the loop body never run.
  __local_iter_t __local_last = __local_first;

  while (__n > 0) {
    __local_last    = _Traits::__end(__seg);
    auto __seg_size = __local_last - __local_first;
    if (__n <= __seg_size) {
      // The remaining elements all live in this segment: process the partial segment and stop.
      __local_last = __local_first + __n;
      __func(__local_first, __local_last);
      break;
    }
    // Consume the rest of this segment, then continue at the beginning of the next one.
    __func(__local_first, __local_last);
    __n -= __seg_size;
    __local_first = _Traits::__begin(++__seg);
  }

  return _Traits::__compose(__seg, __local_last);
}
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H

View File

@@ -42,6 +42,7 @@
#include <__config>
#include <__cstddef/size_t.h>
#include <__iterator/iterator_traits.h>
#include <__type_traits/integral_constant.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -74,6 +75,11 @@ struct __has_specialization<_Tp, sizeof(_Tp) * 0> : true_type {};
template <class _Iterator>
using __is_segmented_iterator _LIBCPP_NODEBUG = __has_specialization<__segmented_iterator_traits<_Iterator> >;
// Trait: inherits from __has_random_access_iterator_category applied to the __local_iterator type
// named by __segmented_iterator_traits<_SegmentedIterator>, so ::value reports whether the
// per-segment (local) iterator has a random-access category.
// NOTE(review): presumably a struct (not an alias) so that the nested __local_iterator lookup only
// happens when the trait itself is instantiated, e.g. behind a short-circuiting _Or — confirm.
template <class _SegmentedIterator>
struct __has_random_access_local_iterator
: __has_random_access_iterator_category<
typename __segmented_iterator_traits< _SegmentedIterator >::__local_iterator > {};
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP___SEGMENTED_ITERATOR_H

View File

@@ -437,6 +437,7 @@ module std [system] {
module find_segment_if { header "__algorithm/find_segment_if.h" }
module find { header "__algorithm/find.h" }
module for_each_n { header "__algorithm/for_each_n.h" }
module for_each_n_segment { header "__algorithm/for_each_n_segment.h" }
module for_each_segment { header "__algorithm/for_each_segment.h" }
module for_each { header "__algorithm/for_each.h" }
module generate_n { header "__algorithm/generate_n.h" }

View File

@@ -0,0 +1,98 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14, c++17
#include <algorithm>
#include <cstddef>
#include <deque>
#include <list>
#include <ranges>
#include <string>
#include <vector>
#include <benchmark/benchmark.h>
// Benchmark driver: registers std::for_each_n benchmarks over vector, deque, list and a join_view
// of vectors, each at the same set of input sizes, then runs whatever the command line selects.
int main(int argc, char** argv) {
  // Wrapper lambda so the algorithm can be handed to the registration helpers as a value.
  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };

  // Attaches the common list of input sizes (in this order) to a freshly registered benchmark.
  auto add_sizes = [](auto* registered) {
    for (int size : {8, 32, 50 /* non power-of-two */, 1024, 4096, 8192, 1 << 14, 1 << 16, 1 << 18})
      registered->Arg(size);
  };

  // std::for_each_n
  {
    auto register_bm = [&add_sizes]<class Container>(std::string name, auto for_each_n) {
      using ElemType = typename Container::value_type;
      add_sizes(benchmark::RegisterBenchmark(name, [for_each_n](auto& st) {
        std::size_t const n = st.range(0);
        Container c(n, 1);
        auto first = c.begin();

        // Element operation applied on every iteration of the timed loop.
        auto clamp_in_place = [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); };

        for ([[maybe_unused]] auto _ : st) {
          benchmark::DoNotOptimize(c);
          auto result = for_each_n(first, n, clamp_in_place);
          benchmark::DoNotOptimize(result);
        }
      }));
    };
    register_bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
    register_bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
    register_bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
  }

  // std::for_each_n for join_view
  {
    auto register_bm = [&add_sizes]<class Container>(std::string name, auto for_each_n) {
      using C1       = typename Container::value_type;
      using ElemType = typename C1::value_type;
      add_sizes(benchmark::RegisterBenchmark(name, [for_each_n](auto& st) {
        std::size_t const size     = st.range(0);
        std::size_t const seg_size = 256;
        std::size_t const segments = (size + seg_size - 1) / seg_size;

        // Split `size` elements into inner containers of at most seg_size elements each;
        // only the last one may be shorter.
        Container c(segments);
        std::size_t remaining = size;
        for (auto& segment : c) {
          std::size_t const len = std::min(seg_size, remaining);
          segment.resize(len, ElemType(1));
          remaining -= len;
        }

        auto view  = c | std::views::join;
        auto first = view.begin();

        auto clamp_in_place = [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); };

        for ([[maybe_unused]] auto _ : st) {
          benchmark::DoNotOptimize(c);
          auto result = for_each_n(first, size, clamp_in_place);
          benchmark::DoNotOptimize(result);
        }
      }));
    };
    register_bm.operator()<std::vector<std::vector<int>>>(
        "std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
  }

  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}

View File

@@ -13,69 +13,128 @@
// constexpr InputIterator // constexpr after C++17
// for_each_n(InputIterator first, Size n, Function f);
#include <algorithm>
#include <cassert>
#include <deque>
#include <functional>
#include <iterator>
#include <list>
#include <ranges>
#include <vector>
#include "test_macros.h"
#include "test_iterators.h"
#if TEST_STD_VER > 17
// Constant-evaluation check: adds 2 to each of four ints through std::for_each_n and reports whether
// the returned iterator is one past the last element and the array matches the expected values.
TEST_CONSTEXPR bool test_constexpr() {
  const std::size_t N = 4;
  int ia[]            = {1, 3, 6, 7};
  int expected[]      = {3, 5, 8, 9};

  auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
  if (!(it == (std::begin(ia) + N)))
    return false;
  return std::equal(std::begin(ia), std::end(ia), std::begin(expected));
}
#endif
struct for_each_test
{
for_each_test(int c) : count(c) {}
int count;
void operator()(int& i) {++i; ++count;}
// Function object that increments every element it is applied to and counts its invocations.
struct for_each_test {
  int count; // number of calls so far, starting from the value given to the constructor

  TEST_CONSTEXPR for_each_test(int c) : count(c) {}

  TEST_CONSTEXPR_CXX14 void operator()(int& i) {
    i += 1;
    count += 1;
  }
};
int main(int, char**)
{
// Function object that checks a deque is visited in index order: each element it receives must be
// the very object stored at the running index, which is then advanced by one.
struct deque_test {
  std::deque<int>* d_; // deque being traversed
  int* i_;             // running index, owned by the caller

  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}

  void operator()(int& v) {
    int& position       = *i_;
    const int* expected = &(*d_)[position];
    assert(expected == &v);
    ++position;
  }
};
/*TEST_CONSTEXPR_CXX26*/
void test_deque_and_join_view_iterators() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
{ // Verify that segmented deque iterators work properly
int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
for (const int size : sizes) {
std::deque<int> d(size);
int index = 0;
std::for_each_n(d.begin(), d.size(), deque_test(d, index));
}
}
#if TEST_STD_VER >= 20
{ // Verify that join_view of lists work properly. Note that join_view of (non-random access) lists does
// not produce segmented iterators.
std::list<std::list<int>> lst = {{}, {0}, {1, 2}, {}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
auto v = lst | std::views::join;
std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
}
#endif
}
TEST_CONSTEXPR_CXX20 bool test() {
{
typedef cpp17_input_iterator<int*> Iter;
int ia[] = {0, 1, 2, 3, 4, 5};
const unsigned s = sizeof(ia)/sizeof(ia[0]);
int ia[] = {0, 1, 2, 3, 4, 5};
const unsigned s = sizeof(ia) / sizeof(ia[0]);
{
auto f = for_each_test(0);
Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
assert(it == Iter(ia));
assert(f.count == 0);
unsigned count = 0;
Iter it = std::for_each_n(Iter(ia), 0, [&count](int& i) {
++i;
++count;
});
assert(it == Iter(ia));
assert(count == 0);
}
{
auto f = for_each_test(0);
Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
assert(it == Iter(ia+s));
assert(f.count == s);
for (unsigned i = 0; i < s; ++i)
assert(ia[i] == static_cast<int>(i+1));
unsigned count = 0;
Iter it = std::for_each_n(Iter(ia), s, [&count](int& i) {
++i;
++count;
});
assert(it == Iter(ia + s));
assert(count == s);
for (unsigned i = 0; i < s; ++i)
assert(ia[i] == static_cast<int>(i + 1));
}
{
auto f = for_each_test(0);
Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
assert(it == Iter(ia+1));
assert(f.count == 1);
for (unsigned i = 0; i < 1; ++i)
assert(ia[i] == static_cast<int>(i+2));
unsigned count = 0;
Iter it = std::for_each_n(Iter(ia), 1, [&count](int& i) {
++i;
++count;
});
assert(it == Iter(ia + 1));
assert(count == 1);
for (unsigned i = 0; i < 1; ++i)
assert(ia[i] == static_cast<int>(i + 2));
}
}
{
int ia[] = {1, 3, 6, 7};
int expected[] = {3, 5, 8, 9};
const std::size_t N = 4;
auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
}
if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
test_deque_and_join_view_iterators();
#if TEST_STD_VER >= 20
{ // join_views of (random-access) vectors yield segmented iterators
std::vector<std::vector<int>> vec = {{}, {0}, {1, 2}, {}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
auto v = vec | std::views::join;
std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
}
#endif
return true;
}
int main(int, char**) {
assert(test());
#if TEST_STD_VER > 17
static_assert(test_constexpr());
static_assert(test());
#endif
return 0;