From 6fe4e033f07d332980e1997c19fe705cff9d07a4 Mon Sep 17 00:00:00 2001 From: Martijn Vels Date: Wed, 13 Sep 2023 18:16:35 -0400 Subject: [PATCH] [libc++] Optimize vector push_back to avoid continuous load and store of end pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Credits: this change is based on analysis and a proof of concept by gerbens@google.com. Before, the compiler loses track of end as 'this' and other references possibly escape beyond the compiler's scope. This can be see in the generated assembly: 16.28 │200c80: mov %r15d,(%rax) 60.87 │200c83: add $0x4,%rax │200c87: mov %rax,-0x38(%rbp) 0.03 │200c8b: → jmpq 200d4e ... ... 1.69 │200d4e: cmp %r15d,%r12d │200d51: → je 200c40 16.34 │200d57: inc %r15d 0.05 │200d5a: mov -0x38(%rbp),%rax 3.27 │200d5e: mov -0x30(%rbp),%r13 1.47 │200d62: cmp %r13,%rax │200d65: → jne 200c80 We fix this by always explicitly storing the loaded local and pointer back at the end of push back. This generates some slight source 'noise', but creates nice and compact fast path code, i.e.: 32.64 │200760: mov %r14d,(%r12) 9.97 │200764: add $0x4,%r12 6.97 │200768: mov %r12,-0x38(%rbp) 32.17 │20076c: add $0x1,%r14d 2.36 │200770: cmp %r14d,%ebx │200773: → je 200730 8.98 │200775: mov -0x30(%rbp),%r13 6.75 │200779: cmp %r13,%r12 │20077c: → jne 200760 Now there is a single store for the push_back value (as before), and a single store for the end without a reload (dependency). For fully local vectors, (i.e., not referenced elsewhere), the capacity load and store inside the loop could also be removed, but this requires more substantial refactoring inside vector. Differential Revision: https://reviews.llvm.org/D80588 --- libcxx/benchmarks/ContainerBenchmarks.h | 13 ++++++ libcxx/benchmarks/vector_operations.bench.cpp | 2 + libcxx/include/vector | 42 +++++++++++-------- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/libcxx/benchmarks/ContainerBenchmarks.h b/libcxx/benchmarks/ContainerBenchmarks.h index 071e46c2a1c6..9a9abfd3b0d0 100644 --- a/libcxx/benchmarks/ContainerBenchmarks.h +++ b/libcxx/benchmarks/ContainerBenchmarks.h @@ -79,6 +79,19 @@ void BM_ConstructFromRange(benchmark::State& st, Container, GenInputs gen) { } } +template +void BM_Pushback(benchmark::State& state, Container c) { + int count = state.range(0); + c.reserve(count); + while (state.KeepRunningBatch(count)) { + c.clear(); + for (int i = 0; i != count; ++i) { + c.push_back(i); + } + benchmark::DoNotOptimize(c.data()); + } +} + template void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) { auto in = gen(st.range(0)); diff --git a/libcxx/benchmarks/vector_operations.bench.cpp b/libcxx/benchmarks/vector_operations.bench.cpp index be0bee6b6456..38b14c56756f 100644 --- a/libcxx/benchmarks/vector_operations.bench.cpp +++ b/libcxx/benchmarks/vector_operations.bench.cpp @@ -39,4 +39,6 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_size_t, std::vector{}, g BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector{}, getRandomStringInputs) ->Arg(TestNumInputs); +BENCHMARK_CAPTURE(BM_Pushback, vector_int, std::vector{})->Arg(TestNumInputs); + BENCHMARK_MAIN(); diff --git a/libcxx/include/vector b/libcxx/include/vector index 6d26ce169b60..4ec6b602371e 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -833,11 +833,11 @@ private: template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - inline void __push_back_slow_path(_Up&& __x); + inline pointer __push_back_slow_path(_Up&& __x); template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI - inline void __emplace_back_slow_path(_Args&&... __args); + inline pointer __emplace_back_slow_path(_Args&&... __args); // The following functions are no-ops outside of AddressSanitizer mode. // We call annotations for every allocator, unless explicitly disabled. @@ -1609,7 +1609,7 @@ vector<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT template template _LIBCPP_CONSTEXPR_SINCE_CXX20 -void +typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x) { allocator_type& __a = this->__alloc(); @@ -1618,6 +1618,7 @@ vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x) __alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Up>(__x)); __v.__end_++; __swap_out_circular_buffer(__v); + return this->__end_; } template @@ -1626,12 +1627,14 @@ inline _LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::push_back(const_reference __x) { - if (this->__end_ != this->__end_cap()) - { + pointer __end = this->__end_; + if (__end < this->__end_cap()) { __construct_one_at_end(__x); + ++__end; + } else { + __end = __push_back_slow_path(__x); } - else - __push_back_slow_path(__x); + this->__end_ = __end; } template @@ -1640,18 +1643,20 @@ inline _LIBCPP_HIDE_FROM_ABI void vector<_Tp, _Allocator>::push_back(value_type&& __x) { - if (this->__end_ < this->__end_cap()) - { + pointer __end = this->__end_; + if (__end < this->__end_cap()) { __construct_one_at_end(std::move(__x)); + ++__end; + } else { + __end = __push_back_slow_path(std::move(__x)); } - else - __push_back_slow_path(std::move(__x)); + this->__end_ = __end; } template template _LIBCPP_CONSTEXPR_SINCE_CXX20 -void +typename vector<_Tp, _Allocator>::pointer vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) { allocator_type& __a = this->__alloc(); @@ -1660,6 +1665,7 @@ vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args) __alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Args>(__args)...); __v.__end_++; __swap_out_circular_buffer(__v); + return this->__end_; } template @@ -1673,14 +1679,16 @@ void #endif vector<_Tp, _Allocator>::emplace_back(_Args&&... __args) { - if (this->__end_ < this->__end_cap()) - { + pointer __end = this->__end_; + if (__end < this->__end_cap()) { __construct_one_at_end(std::forward<_Args>(__args)...); + ++__end; + } else { + __end = __emplace_back_slow_path(std::forward<_Args>(__args)...); } - else - __emplace_back_slow_path(std::forward<_Args>(__args)...); + this->__end_ = __end; #if _LIBCPP_STD_VER >= 17 - return this->back(); + return *(__end - 1); #endif }