The per-PSB packet decoding logic was wrong because it was assuming that pt_insn_get_sync_offset was being udpated after every PSB. Silly me, that is not true. It returns the offset of the PSB packet after invoking pt_insn_sync_forward regardless of how many PSBs are visited later. Instead, I'm now following the approach described in https://github.com/intel/libipt/blob/master/doc/howto_libipt.md#parallel-decode for parallel decoding, which is basically what we need. A nasty error that happened because of this is that when we had two PSBs (A and B), the following was happening 1. PSB A was processed all the way up to the end of the trace, which includes PSB B. 2. PSB B was then processed until the end of the trace. The instructions emitted by step 2. were also emitted as part of step 1. so our trace had duplicated chunks. This problem becomes worse when you many PSBs. As part of making sure this diff is correct, I added some other features that are very useful. - Added a "synchronization point" event to the TraceCursor, so we can inspect when PSBs are emitted. - Removed the single-thread decoder. Now the per-cpu decoder and single-thread decoder use the same code paths. - Use the query decoder to fetch PSBs and timestamps. It turns out that the pt_insn_sync_forward of the instruction decoder can move past several PSBs (this means that we could skip some TSCs). On the other hand, the pt_query_sync_forward method doesn't skip PSBs, so we can get more accurate sync events and timing information. - Turned LibiptDecoder into PSBBlockDecoder, which decodes single PSB blocks. It is the fundamental processing unit for decoding. - Added many comments, asserts and improved error handling for clarity. - Improved DecodeSystemWideTraceForThread so that a TSC is emitted always before a cpu change event. This was a bug that was annoying me before. - SplitTraceInContinuousExecutions and FindLowestTSCInTrace are now using the query decoder, which can identify precisely each PSB along with their TSCs. - Added an "only-events" option to the trace dumper to inspect only events. I did extensive testing and I think we should have an in-house testing CI. The LLVM buildbots are not capable of supporting testing post-mortem traces of hundreds of megabytes. I'll leave that for later, but at least for now the current tests were able to catch most of the issues I encountered when doing this task. A sample output of a program that I was single stepping is the following. You can see that only one PSB is emitted even though stepping happened! ``` thread #1: tid = 3578223 0: (event) trace synchronization point [offset = 0x0xef0] a.out`main + 20 at main.cpp:29:20 1: 0x0000000000402479 leaq -0x1210(%rbp), %rax 2: (event) software disabled tracing 3: 0x0000000000402480 movq %rax, %rdi 4: (event) software disabled tracing 5: (event) software disabled tracing 6: 0x0000000000402483 callq 0x403bd4 ; std::vector<int, std::allocator<int>>::vector at stl_vector.h:391:7 7: (event) software disabled tracing a.out`std::vector<int, std::allocator<int>>::vector() at stl_vector.h:391:7 8: 0x0000000000403bd4 pushq %rbp 9: (event) software disabled tracing 10: 0x0000000000403bd5 movq %rsp, %rbp 11: (event) software disabled tracing ``` This is another trace of a long program with a few PSBs. ``` (lldb) thread trace dump instructions -E -f thread #1: tid = 3603082 0: (event) trace synchronization point [offset = 0x0x80] 47417: (event) software disabled tracing 129231: (event) trace synchronization point [offset = 0x0x800] 146747: (event) software disabled tracing 246076: (event) software disabled tracing 259068: (event) trace synchronization point [offset = 0x0xf78] 259276: (event) software disabled tracing 259278: (event) software disabled tracing no more data ``` Differential Revision: https://reviews.llvm.org/D131630
238 lines
8.2 KiB
C++
238 lines
8.2 KiB
C++
//===-- TraceIntelPTMultiCpuDecoder.cpp -----------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "TraceIntelPTMultiCpuDecoder.h"
|
|
#include "TraceIntelPT.h"
|
|
#include "llvm/Support/Error.h"
|
|
|
|
using namespace lldb;
|
|
using namespace lldb_private;
|
|
using namespace lldb_private::trace_intel_pt;
|
|
using namespace llvm;
|
|
|
|
TraceIntelPTMultiCpuDecoder::TraceIntelPTMultiCpuDecoder(
|
|
TraceIntelPTSP trace_sp)
|
|
: m_trace_wp(trace_sp) {
|
|
for (Process *proc : trace_sp->GetAllProcesses()) {
|
|
for (ThreadSP thread_sp : proc->GetThreadList().Threads()) {
|
|
m_tids.insert(thread_sp->GetID());
|
|
}
|
|
}
|
|
}
|
|
|
|
TraceIntelPTSP TraceIntelPTMultiCpuDecoder::GetTrace() {
|
|
return m_trace_wp.lock();
|
|
}
|
|
|
|
bool TraceIntelPTMultiCpuDecoder::TracesThread(lldb::tid_t tid) const {
|
|
return m_tids.count(tid);
|
|
}
|
|
|
|
Expected<Optional<uint64_t>> TraceIntelPTMultiCpuDecoder::FindLowestTSC() {
|
|
Optional<uint64_t> lowest_tsc;
|
|
TraceIntelPTSP trace_sp = GetTrace();
|
|
|
|
Error err = GetTrace()->OnAllCpusBinaryDataRead(
|
|
IntelPTDataKinds::kIptTrace,
|
|
[&](const DenseMap<cpu_id_t, ArrayRef<uint8_t>> &buffers) -> Error {
|
|
for (auto &cpu_id_to_buffer : buffers) {
|
|
Expected<Optional<uint64_t>> tsc =
|
|
FindLowestTSCInTrace(*trace_sp, cpu_id_to_buffer.second);
|
|
if (!tsc)
|
|
return tsc.takeError();
|
|
if (*tsc && (!lowest_tsc || *lowest_tsc > **tsc))
|
|
lowest_tsc = **tsc;
|
|
}
|
|
return Error::success();
|
|
});
|
|
if (err)
|
|
return std::move(err);
|
|
return lowest_tsc;
|
|
}
|
|
|
|
Expected<DecodedThreadSP> TraceIntelPTMultiCpuDecoder::Decode(Thread &thread) {
|
|
if (Error err = CorrelateContextSwitchesAndIntelPtTraces())
|
|
return std::move(err);
|
|
|
|
TraceIntelPTSP trace_sp = GetTrace();
|
|
|
|
return trace_sp->GetThreadTimer(thread.GetID())
|
|
.TimeTask("Decoding instructions", [&]() -> Expected<DecodedThreadSP> {
|
|
auto it = m_decoded_threads.find(thread.GetID());
|
|
if (it != m_decoded_threads.end())
|
|
return it->second;
|
|
|
|
DecodedThreadSP decoded_thread_sp = std::make_shared<DecodedThread>(
|
|
thread.shared_from_this(), trace_sp->GetPerfZeroTscConversion());
|
|
|
|
Error err = trace_sp->OnAllCpusBinaryDataRead(
|
|
IntelPTDataKinds::kIptTrace,
|
|
[&](const DenseMap<cpu_id_t, ArrayRef<uint8_t>> &buffers) -> Error {
|
|
auto it =
|
|
m_continuous_executions_per_thread->find(thread.GetID());
|
|
if (it != m_continuous_executions_per_thread->end())
|
|
return DecodeSystemWideTraceForThread(
|
|
*decoded_thread_sp, *trace_sp, buffers, it->second);
|
|
|
|
return Error::success();
|
|
});
|
|
if (err)
|
|
return std::move(err);
|
|
|
|
m_decoded_threads.try_emplace(thread.GetID(), decoded_thread_sp);
|
|
return decoded_thread_sp;
|
|
});
|
|
}
|
|
|
|
static Expected<std::vector<PSBBlock>> GetPSBBlocksForCPU(TraceIntelPT &trace,
|
|
cpu_id_t cpu_id) {
|
|
std::vector<PSBBlock> psb_blocks;
|
|
Error err = trace.OnCpuBinaryDataRead(
|
|
cpu_id, IntelPTDataKinds::kIptTrace,
|
|
[&](ArrayRef<uint8_t> data) -> Error {
|
|
Expected<std::vector<PSBBlock>> split_trace =
|
|
SplitTraceIntoPSBBlock(trace, data, /*expect_tscs=*/true);
|
|
if (!split_trace)
|
|
return split_trace.takeError();
|
|
|
|
psb_blocks = std::move(*split_trace);
|
|
return Error::success();
|
|
});
|
|
if (err)
|
|
return std::move(err);
|
|
return psb_blocks;
|
|
}
|
|
|
|
Expected<DenseMap<lldb::tid_t, std::vector<IntelPTThreadContinousExecution>>>
|
|
TraceIntelPTMultiCpuDecoder::DoCorrelateContextSwitchesAndIntelPtTraces() {
|
|
DenseMap<lldb::tid_t, std::vector<IntelPTThreadContinousExecution>>
|
|
continuous_executions_per_thread;
|
|
TraceIntelPTSP trace_sp = GetTrace();
|
|
|
|
Optional<LinuxPerfZeroTscConversion> conv_opt =
|
|
trace_sp->GetPerfZeroTscConversion();
|
|
if (!conv_opt)
|
|
return createStringError(
|
|
inconvertibleErrorCode(),
|
|
"TSC to nanoseconds conversion values were not found");
|
|
|
|
LinuxPerfZeroTscConversion tsc_conversion = *conv_opt;
|
|
|
|
for (cpu_id_t cpu_id : trace_sp->GetTracedCpus()) {
|
|
Expected<std::vector<PSBBlock>> psb_blocks =
|
|
GetPSBBlocksForCPU(*trace_sp, cpu_id);
|
|
if (!psb_blocks)
|
|
return psb_blocks.takeError();
|
|
|
|
m_total_psb_blocks += psb_blocks->size();
|
|
// We'll be iterating through the thread continuous executions and the intel
|
|
// pt subtraces sorted by time.
|
|
auto it = psb_blocks->begin();
|
|
auto on_new_thread_execution =
|
|
[&](const ThreadContinuousExecution &thread_execution) {
|
|
IntelPTThreadContinousExecution execution(thread_execution);
|
|
|
|
for (; it != psb_blocks->end() &&
|
|
*it->tsc < thread_execution.GetEndTSC();
|
|
it++) {
|
|
if (*it->tsc > thread_execution.GetStartTSC()) {
|
|
execution.psb_blocks.push_back(*it);
|
|
} else {
|
|
m_unattributed_psb_blocks++;
|
|
}
|
|
}
|
|
continuous_executions_per_thread[thread_execution.tid].push_back(
|
|
execution);
|
|
};
|
|
Error err = trace_sp->OnCpuBinaryDataRead(
|
|
cpu_id, IntelPTDataKinds::kPerfContextSwitchTrace,
|
|
[&](ArrayRef<uint8_t> data) -> Error {
|
|
Expected<std::vector<ThreadContinuousExecution>> executions =
|
|
DecodePerfContextSwitchTrace(data, cpu_id, tsc_conversion);
|
|
if (!executions)
|
|
return executions.takeError();
|
|
for (const ThreadContinuousExecution &exec : *executions)
|
|
on_new_thread_execution(exec);
|
|
return Error::success();
|
|
});
|
|
if (err)
|
|
return std::move(err);
|
|
|
|
m_unattributed_psb_blocks += psb_blocks->end() - it;
|
|
}
|
|
// We now sort the executions of each thread to have them ready for
|
|
// instruction decoding
|
|
for (auto &tid_executions : continuous_executions_per_thread)
|
|
std::sort(tid_executions.second.begin(), tid_executions.second.end());
|
|
|
|
return continuous_executions_per_thread;
|
|
}
|
|
|
|
Error TraceIntelPTMultiCpuDecoder::CorrelateContextSwitchesAndIntelPtTraces() {
|
|
if (m_setup_error)
|
|
return createStringError(inconvertibleErrorCode(), m_setup_error->c_str());
|
|
|
|
if (m_continuous_executions_per_thread)
|
|
return Error::success();
|
|
|
|
Error err = GetTrace()->GetGlobalTimer().TimeTask(
|
|
"Context switch and Intel PT traces correlation", [&]() -> Error {
|
|
if (auto correlation = DoCorrelateContextSwitchesAndIntelPtTraces()) {
|
|
m_continuous_executions_per_thread.emplace(std::move(*correlation));
|
|
return Error::success();
|
|
} else {
|
|
return correlation.takeError();
|
|
}
|
|
});
|
|
if (err) {
|
|
m_setup_error = toString(std::move(err));
|
|
return createStringError(inconvertibleErrorCode(), m_setup_error->c_str());
|
|
}
|
|
return Error::success();
|
|
}
|
|
|
|
size_t TraceIntelPTMultiCpuDecoder::GetNumContinuousExecutionsForThread(
|
|
lldb::tid_t tid) const {
|
|
if (!m_continuous_executions_per_thread)
|
|
return 0;
|
|
auto it = m_continuous_executions_per_thread->find(tid);
|
|
if (it == m_continuous_executions_per_thread->end())
|
|
return 0;
|
|
return it->second.size();
|
|
}
|
|
|
|
size_t TraceIntelPTMultiCpuDecoder::GetTotalContinuousExecutionsCount() const {
|
|
if (!m_continuous_executions_per_thread)
|
|
return 0;
|
|
size_t count = 0;
|
|
for (const auto &kv : *m_continuous_executions_per_thread)
|
|
count += kv.second.size();
|
|
return count;
|
|
}
|
|
|
|
size_t
|
|
TraceIntelPTMultiCpuDecoder::GePSBBlocksCountForThread(lldb::tid_t tid) const {
|
|
if (!m_continuous_executions_per_thread)
|
|
return 0;
|
|
size_t count = 0;
|
|
auto it = m_continuous_executions_per_thread->find(tid);
|
|
if (it == m_continuous_executions_per_thread->end())
|
|
return 0;
|
|
for (const IntelPTThreadContinousExecution &execution : it->second)
|
|
count += execution.psb_blocks.size();
|
|
return count;
|
|
}
|
|
|
|
size_t TraceIntelPTMultiCpuDecoder::GetUnattributedPSBBlocksCount() const {
|
|
return m_unattributed_psb_blocks;
|
|
}
|
|
|
|
size_t TraceIntelPTMultiCpuDecoder::GetTotalPSBBlocksCount() const {
|
|
return m_total_psb_blocks;
|
|
}
|