## Summary Three pre-existing bugs cause worker processes to crash with SEGV or SIGABRT. On the main branch these crashes are silent (workers die, requests fail fast with "transport closed", tests still pass because null responses are accepted). However when combined with #432's worker respawn mechanism, the crash-respawn-crash cycle on low-core CI machines causes request timeouts and smoke test hangs. ### Fixes - **compilation.cpp**: `ProxyAction::CreateASTConsumer` now checks for null before passing to `MultiplexConsumer`. When the wrapped action's `CreateASTConsumer` fails (e.g. missing system headers during PCH generation), this previously caused a null pointer dereference, SEGV, ASAN kills the stateless worker. - **compilation_unit.cpp**: `file_path()` returns empty `StringRef` on invalid `FileID` instead of asserting. The assert fired when `IncludeGraph::from()` called `file_path(interested_file())` on an AST compiled with synthesized default commands (no compile_commands.json, clang++ -std=c++20 fallback, no system headers, invalid main file ID), SIGABRT, stateful worker crash. - **compiler.cpp**: `ensure_pch` now creates the PCH cache directory before sending the build request. Previously, when `load_workspace()` exited early (no compile_commands.json), the cache subdirectories were never created, causing every PCH write to fail with "No such file or directory". - **master_server.cpp/h**: `load_workspace()` changed from `kota::task<>` to plain `void` -- it contains only synchronous filesystem operations and no co_await, so the coroutine wrapper was unnecessary. Called directly instead of via `loop.schedule()`. ## Test plan - [x] Verified zero SEGV/SIGABRT/assertion crashes in worker stderr after fix - [x] rapid_edit.jsonl smoke test passes 3/3 runs consistently (34s each) - [x] Behavior matches main branch (both return 134 responses, 0 pending) - [x] Debug build with ASAN (detect_leaks=0) -- clean run, no sanitizer reports <!-- codesmith:footer --> --- <a href="https://app.blacksmith.sh/clice-io/codesmith/clice/pr/435"><picture><source media="(prefers-color-scheme: dark)" srcset="https://pr-comments-assets.blacksmith.sh/codesmith/view-in-codesmith-dark.svg"><source media="(prefers-color-scheme: light)" srcset="https://pr-comments-assets.blacksmith.sh/codesmith/view-in-codesmith-light.svg"><img alt="View in Codesmith" src="https://pr-comments-assets.blacksmith.sh/codesmith/view-in-codesmith-dark.svg"></picture></a> <sup>Codesmith can help with this PR — just tag <code>@codesmith</code> or enable autofix.</sup> - [ ] Autofix CI and bot reviews <!-- /codesmith:footer --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Improved error handling for AST consumer creation with null checks and a clear failure path. * Safer file-path access that returns empty for invalid identifiers instead of asserting. * PCH cache handling now validates cache configuration, attempts directory creation, logs warnings, and aborts PCH builds on failure. * **Refactor** * Workspace loading changed from asynchronous to synchronous execution. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
412 lines
15 KiB
C++
412 lines
15 KiB
C++
/// Benchmark for scan_dependency_graph on a real compilation database.
|
|
///
|
|
/// Usage:
|
|
/// scan_benchmark [OPTIONS] <compile_commands.json>
|
|
///
|
|
/// Example:
|
|
/// ./build/RelWithDebInfo/bin/scan_benchmark \
|
|
/// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json
|
|
///
|
|
/// ./build/RelWithDebInfo/bin/scan_benchmark --log-level info --export graph.json \
|
|
/// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json
|
|
|
|
#include <algorithm>
|
|
#include <chrono>
|
|
#include <cstdlib>
|
|
#include <fstream>
|
|
#include <map>
|
|
#include <numeric>
|
|
#include <print>
|
|
#include <set>
|
|
#include <thread>
|
|
|
|
#include "command/command.h"
|
|
#include "support/filesystem.h"
|
|
#include "support/logging.h"
|
|
#include "support/path_pool.h"
|
|
#include "syntax/dependency_graph.h"
|
|
|
|
#include "kota/codec/json/json.h"
|
|
#include "kota/deco/deco.h"
|
|
#include "llvm/Support/FileSystem.h"
|
|
|
|
using namespace clice;
|
|
|
|
struct BenchmarkOptions {
|
|
DecoKV(names = {"--log-level"}; help = "Log level: trace, debug, info, warn, error, off";
|
|
required = false;)
|
|
<std::string> log_level = "off";
|
|
|
|
DecoKV(names = {"--export"}; help = "Export dependency graph as JSON to this path";
|
|
required = false;)
|
|
<std::string> export_path;
|
|
|
|
DecoKV(names = {"--runs"}; help = "Number of cold start iterations"; required = false;)
|
|
<int> runs = 20;
|
|
|
|
DecoFlag(names = {"-h", "--help"}; help = "Show help message"; required = false;)
|
|
help;
|
|
|
|
DecoInput(meta_var = "CDB"; help = "Path to compile_commands.json"; required = false;)
|
|
<std::string> cdb_path;
|
|
};
|
|
|
|
struct FileNode {
|
|
std::string path;
|
|
std::string module_name;
|
|
std::vector<std::string> includes;
|
|
};
|
|
|
|
struct GraphExport {
|
|
std::vector<FileNode> files;
|
|
};
|
|
|
|
void export_graph_json(const PathPool& path_pool,
|
|
const DependencyGraph& graph,
|
|
llvm::StringRef output_path) {
|
|
// Build reverse module map: path_id -> module_name.
|
|
llvm::DenseMap<std::uint32_t, llvm::StringRef> path_to_module;
|
|
for(auto& [name, path_ids]: graph.modules()) {
|
|
for(auto path_id: path_ids) {
|
|
path_to_module[path_id] = name;
|
|
}
|
|
}
|
|
|
|
GraphExport export_data;
|
|
for(std::uint32_t id = 0; id < path_pool.paths.size(); id++) {
|
|
auto inc_ids = graph.get_all_includes(id);
|
|
if(inc_ids.empty()) {
|
|
continue;
|
|
}
|
|
|
|
FileNode node;
|
|
node.path = path_pool.paths[id].str();
|
|
|
|
auto mod_it = path_to_module.find(id);
|
|
if(mod_it != path_to_module.end()) {
|
|
node.module_name = mod_it->second.str();
|
|
}
|
|
|
|
for(auto flagged_id: inc_ids) {
|
|
auto raw_id = flagged_id & DependencyGraph::PATH_ID_MASK;
|
|
node.includes.push_back(path_pool.paths[raw_id].str());
|
|
}
|
|
|
|
export_data.files.push_back(std::move(node));
|
|
}
|
|
|
|
auto json = kota::codec::json::to_json(export_data);
|
|
if(!json) {
|
|
std::println(stderr, "Failed to serialize dependency graph");
|
|
return;
|
|
}
|
|
|
|
std::ofstream out(output_path.str());
|
|
if(!out) {
|
|
std::println(stderr, "Failed to open output file: {}", output_path);
|
|
return;
|
|
}
|
|
out << *json;
|
|
std::println("Graph exported to {} ({} files)", output_path, export_data.files.size());
|
|
}
|
|
|
|
void print_report(const ScanReport& report) {
|
|
std::println("===============================================================");
|
|
std::println(" Dependency Scan Report");
|
|
std::println("===============================================================");
|
|
|
|
// Timing.
|
|
std::println("");
|
|
std::println(" Time: {}ms", report.elapsed_ms);
|
|
std::println(" Waves: {}", report.waves);
|
|
|
|
// File counts.
|
|
std::println("");
|
|
std::println(" Files");
|
|
std::println(" Source files (from CDB): {}", report.source_files);
|
|
std::println(" Header files (discovered): {}", report.header_files);
|
|
std::println(" Total: {}", report.total_files);
|
|
std::println(" Modules: {}", report.modules);
|
|
|
|
// Include edges.
|
|
std::println("");
|
|
std::println(" Include Edges");
|
|
std::println(" Total: {}", report.total_edges);
|
|
std::println(" Unconditional: {}", report.unconditional_edges);
|
|
std::println(" Conditional: {} (inside #if/#ifdef)", report.conditional_edges);
|
|
|
|
// Resolution accuracy.
|
|
std::println("");
|
|
std::println(" Resolution");
|
|
std::println(" #include directives: {}", report.includes_found);
|
|
std::println(" Resolved: {}", report.includes_resolved);
|
|
auto unresolved_count = report.includes_found - report.includes_resolved;
|
|
std::println(" Unresolved: {}", unresolved_count);
|
|
if(report.includes_found > 0) {
|
|
double rate = 100.0 * static_cast<double>(report.includes_resolved) /
|
|
static_cast<double>(report.includes_found);
|
|
std::println(" Accuracy: {:.1f}%", rate);
|
|
}
|
|
|
|
// Wall-clock phase breakdown.
|
|
std::println("");
|
|
std::println(" Phase Breakdown (wall-clock)");
|
|
std::println(" Config extraction: {}ms (prewarm={}ms, loop={}ms)",
|
|
report.config_ms,
|
|
report.prewarm_ms,
|
|
report.config_loop_ms);
|
|
std::println(" Dir cache pre-pop: {}ms (overlapped with Phase 1)", report.dir_cache_ms);
|
|
std::println(" Phase 1 (read+scan, parallel): {}ms", report.phase1_ms);
|
|
std::println(" Phase 2 (include resolve): {}ms", report.phase2_ms);
|
|
std::println(" Phase 3 (graph build): {}ms", report.phase3_ms);
|
|
|
|
// Per-wave breakdown.
|
|
if(!report.wave_stats.empty()) {
|
|
std::println("");
|
|
std::println(" Per-Wave Breakdown");
|
|
std::println(" {:>5s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>10s} {:>10s}",
|
|
"Wave",
|
|
"Files",
|
|
"P1(ms)",
|
|
"P2(ms)",
|
|
"Next",
|
|
"Prefetch",
|
|
"DirList",
|
|
"DirHits");
|
|
for(std::size_t i = 0; i < report.wave_stats.size(); i++) {
|
|
auto& ws = report.wave_stats[i];
|
|
std::println(" {:>5} {:>8} {:>8} {:>8} {:>8} {:>8} {:>10} {:>10}",
|
|
i,
|
|
ws.files,
|
|
ws.phase1_ms,
|
|
ws.phase2_ms,
|
|
ws.next_files,
|
|
ws.prefetch_count,
|
|
ws.dir_listings,
|
|
ws.dir_hits);
|
|
}
|
|
}
|
|
|
|
// Phase 2 breakdown.
|
|
if(report.p2_resolve_us > 0) {
|
|
auto other_us = report.phase2_ms * 1000 - report.p2_resolve_us;
|
|
std::println("");
|
|
std::println(" Phase 2 Breakdown (single-threaded)");
|
|
std::println(" resolve_include: {:.1f}ms", report.p2_resolve_us / 1000.0);
|
|
std::println(" Other (cache lookup, intern, graph): {:.1f}ms", other_us / 1000.0);
|
|
}
|
|
|
|
// Cumulative I/O statistics.
|
|
std::println("");
|
|
std::println(" I/O Statistics (cumulative across threads)");
|
|
std::println(" File read: {:.1f}ms (sum of all threads)", report.read_us / 1000.0);
|
|
std::println(" Lexer scan: {:.1f}ms (sum of all threads)", report.scan_us / 1000.0);
|
|
std::println(" Filesystem: {:.1f}ms ({} readdir calls, {} dir cache hits)",
|
|
report.fs_us / 1000.0,
|
|
report.dir_listings,
|
|
report.dir_hits);
|
|
std::println(" File lookups: {}", report.fs_lookups);
|
|
std::println(" Include cache hits: {}", report.include_cache_hits);
|
|
std::println(" Scan result cache hits: {}", report.scan_cache_hits);
|
|
if(report.dir_listings + report.dir_hits > 0) {
|
|
double hit_rate = 100.0 * static_cast<double>(report.dir_hits) /
|
|
static_cast<double>(report.dir_listings + report.dir_hits);
|
|
std::println(" Dir cache hit rate: {:.1f}%", hit_rate);
|
|
}
|
|
|
|
std::println("");
|
|
std::println("===============================================================");
|
|
}
|
|
|
|
int main(int argc, const char** argv) {
|
|
auto args = kota::deco::util::argvify(argc, argv);
|
|
auto result = kota::deco::cli::parse<BenchmarkOptions>(args);
|
|
|
|
if(!result.has_value()) {
|
|
std::println(stderr, "Error: {}", result.error().message);
|
|
return 1;
|
|
}
|
|
|
|
auto& opts = result->options;
|
|
|
|
if(opts.help.value_or(false) || !opts.cdb_path.has_value()) {
|
|
std::ostringstream oss;
|
|
kota::deco::cli::write_usage_for<BenchmarkOptions>(oss, "scan_benchmark [OPTIONS] <cdb>");
|
|
std::print("{}", oss.str());
|
|
return opts.help.value_or(false) ? 0 : 1;
|
|
}
|
|
|
|
// Configure logging.
|
|
auto level = spdlog::level::from_str(*opts.log_level);
|
|
clice::logging::options.level = level;
|
|
clice::logging::stderr_logger("scan_benchmark", clice::logging::options);
|
|
|
|
// resource_dir() is self-initializing (lazy static) — no setup needed.
|
|
|
|
auto& cdb_path = *opts.cdb_path;
|
|
auto hw_threads = std::thread::hardware_concurrency();
|
|
auto runs = *opts.runs;
|
|
if(runs <= 0) {
|
|
std::println(stderr, "Error: --runs must be positive (got {})", runs);
|
|
return 1;
|
|
}
|
|
|
|
// Set UV_THREADPOOL_SIZE if not already set.
|
|
// Use at least libuv's default (4) so low-core CI runners don't regress.
|
|
if(!std::getenv("UV_THREADPOOL_SIZE")) {
|
|
auto pool_size = std::max(hw_threads, 4u);
|
|
static std::string env = "UV_THREADPOOL_SIZE=" + std::to_string(pool_size);
|
|
putenv(env.data());
|
|
}
|
|
|
|
std::println("Hardware threads: {}", hw_threads);
|
|
std::println("UV_THREADPOOL_SIZE: {}", std::getenv("UV_THREADPOOL_SIZE"));
|
|
std::println("Log level: {}", *opts.log_level);
|
|
std::println("CDB: {}", cdb_path);
|
|
std::println("");
|
|
|
|
// Load compilation database.
|
|
auto t0 = std::chrono::steady_clock::now();
|
|
|
|
CompilationDatabase cdb;
|
|
auto count = cdb.load(cdb_path);
|
|
|
|
auto t1 = std::chrono::steady_clock::now();
|
|
auto load_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
|
|
|
std::println("CDB loaded: {} entries in {}ms", count, load_ms);
|
|
|
|
{
|
|
std::set<const CompilationInfo*> unique_contexts;
|
|
std::set<const CanonicalCommand*> unique_canonicals;
|
|
std::map<const CanonicalCommand*, int> canonical_hist;
|
|
for(auto& entry: cdb.get_entries()) {
|
|
unique_contexts.insert(entry.info.ptr);
|
|
unique_canonicals.insert(entry.info->canonical.ptr);
|
|
canonical_hist[entry.info->canonical.ptr]++;
|
|
}
|
|
double dedup_ratio =
|
|
unique_contexts.empty() ? 0.0 : static_cast<double>(count) / unique_contexts.size();
|
|
std::println(
|
|
"Context dedup: {} files -> {} unique contexts ({:.1f}x), {} unique canonicals",
|
|
count,
|
|
unique_contexts.size(),
|
|
dedup_ratio,
|
|
unique_canonicals.size());
|
|
|
|
// If canonical dedup is poor, dump diagnostics.
|
|
if(unique_canonicals.size() > 200) {
|
|
// Sort canonicals by frequency (descending).
|
|
std::vector<std::pair<int, const CanonicalCommand*>> sorted;
|
|
for(auto& [ptr, cnt]: canonical_hist)
|
|
sorted.push_back({cnt, ptr});
|
|
std::ranges::sort(sorted,
|
|
std::greater{},
|
|
&std::pair<int, const CanonicalCommand*>::first);
|
|
|
|
// Show top-5 canonical commands.
|
|
for(int i = 0; i < std::min(5, (int)sorted.size()); i++) {
|
|
auto [cnt, cmd] = sorted[i];
|
|
std::println(" canonical[{}] ({} files, {} args):", i, cnt, cmd->arguments.size());
|
|
for(auto arg: cmd->arguments)
|
|
std::println(" {}", arg);
|
|
}
|
|
|
|
// Show a singleton canonical (count==1) to see what per-file arg leaks in.
|
|
for(auto& [cnt, cmd]: sorted) {
|
|
if(cnt == 1) {
|
|
std::println(" singleton canonical ({} args):", cmd->arguments.size());
|
|
for(auto arg: cmd->arguments)
|
|
std::println(" {}", arg);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Find two canonicals that differ by only a few args.
|
|
if(sorted.size() >= 2) {
|
|
auto* a = sorted[0].second;
|
|
auto* b = sorted[1].second;
|
|
std::println(" --- Canonical diff (top-1 vs top-2) ---");
|
|
auto max_len = std::max(a->arguments.size(), b->arguments.size());
|
|
for(std::size_t i = 0; i < max_len; i++) {
|
|
llvm::StringRef av = i < a->arguments.size() ? a->arguments[i] : "<missing>";
|
|
llvm::StringRef bv = i < b->arguments.size() ? b->arguments[i] : "<missing>";
|
|
if(av != bv)
|
|
std::println(" DIFF[{}]: '{}' vs '{}'", i, av, bv);
|
|
else
|
|
std::println(" SAME[{}]: '{}'", i, av);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
std::println("\nRunning {} cold start scan(s)...\n", runs);
|
|
|
|
PathPool path_pool;
|
|
DependencyGraph graph;
|
|
std::vector<std::int64_t> elapsed_times;
|
|
std::vector<std::int64_t> config_times;
|
|
std::vector<std::int64_t> phase1_times;
|
|
std::vector<std::int64_t> phase2_times;
|
|
elapsed_times.reserve(runs);
|
|
config_times.reserve(runs);
|
|
phase1_times.reserve(runs);
|
|
phase2_times.reserve(runs);
|
|
|
|
for(int i = 0; i < runs; i++) {
|
|
// True cold start: rebuild CDB (clears toolchain & config caches),
|
|
// reset PathPool and DependencyGraph.
|
|
cdb = CompilationDatabase{};
|
|
cdb.load(cdb_path);
|
|
path_pool = PathPool{};
|
|
graph = DependencyGraph{};
|
|
|
|
auto report = scan_dependency_graph(cdb, path_pool, graph);
|
|
|
|
elapsed_times.push_back(report.elapsed_ms);
|
|
config_times.push_back(report.config_ms);
|
|
phase1_times.push_back(report.phase1_ms);
|
|
phase2_times.push_back(report.phase2_ms);
|
|
|
|
std::println("[run {:2}] {}ms | config={}ms phase1={}ms phase2={}ms | files={}",
|
|
i + 1,
|
|
report.elapsed_ms,
|
|
report.config_ms,
|
|
report.phase1_ms,
|
|
report.phase2_ms,
|
|
report.total_files);
|
|
|
|
// Print detailed report for the first run only.
|
|
if(i == 0) {
|
|
std::println("");
|
|
print_report(report);
|
|
}
|
|
}
|
|
|
|
// Summary statistics.
|
|
if(runs > 1) {
|
|
auto stats = [](std::vector<std::int64_t>& v) {
|
|
std::ranges::sort(v);
|
|
auto sum = std::accumulate(v.begin(), v.end(), std::int64_t{0});
|
|
return std::tuple{v.front(), sum / static_cast<std::int64_t>(v.size()), v.back()};
|
|
};
|
|
auto [e_min, e_avg, e_max] = stats(elapsed_times);
|
|
auto [c_min, c_avg, c_max] = stats(config_times);
|
|
auto [p1_min, p1_avg, p1_max] = stats(phase1_times);
|
|
auto [p2_min, p2_avg, p2_max] = stats(phase2_times);
|
|
|
|
std::println("\n Summary ({} runs) min avg max", runs);
|
|
std::println(" Total: {:>7} {:>6} {:>6}", e_min, e_avg, e_max);
|
|
std::println(" Config extraction: {:>7} {:>6} {:>6}", c_min, c_avg, c_max);
|
|
std::println(" Phase 1 (read+scan):{:>7} {:>6} {:>6}", p1_min, p1_avg, p1_max);
|
|
std::println(" Phase 2 (resolve): {:>7} {:>6} {:>6}", p2_min, p2_avg, p2_max);
|
|
}
|
|
|
|
// Export dependency graph as JSON if requested.
|
|
if(opts.export_path.has_value()) {
|
|
export_graph_json(path_pool, graph, *opts.export_path);
|
|
}
|
|
|
|
return 0;
|
|
}
|