/// Benchmark for scan_dependency_graph on a real compilation database. /// /// Usage: /// scan_benchmark [OPTIONS] /// /// Example: /// ./build/RelWithDebInfo/bin/scan_benchmark \ /// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json /// /// ./build/RelWithDebInfo/bin/scan_benchmark --log-level info --export graph.json \ /// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json #include #include #include #include #include #include #include #include #include #include "command/command.h" #include "eventide/deco/deco.h" #include "eventide/serde/json/serializer.h" #include "support/filesystem.h" #include "support/logging.h" #include "support/path_pool.h" #include "syntax/dependency_graph.h" #include "llvm/Support/FileSystem.h" namespace et = eventide; using namespace clice; struct BenchmarkOptions { DecoKV(names = {"--log-level"}; help = "Log level: trace, debug, info, warn, error, off"; required = false;) log_level = "off"; DecoKV(names = {"--export"}; help = "Export dependency graph as JSON to this path"; required = false;) export_path; DecoKV(names = {"--runs"}; help = "Number of cold start iterations"; required = false;) runs = 20; DecoFlag(names = {"-h", "--help"}; help = "Show help message"; required = false;) help; DecoInput(meta_var = "CDB"; help = "Path to compile_commands.json"; required = false;) cdb_path; }; struct FileNode { std::string path; std::string module_name; std::vector includes; }; struct GraphExport { std::vector files; }; void export_graph_json(const PathPool& path_pool, const DependencyGraph& graph, llvm::StringRef output_path) { // Build reverse module map: path_id -> module_name. llvm::DenseMap path_to_module; for(auto& [name, path_ids]: graph.modules()) { for(auto path_id: path_ids) { path_to_module[path_id] = name; } } GraphExport export_data; for(std::uint32_t id = 0; id < path_pool.paths.size(); id++) { auto inc_ids = graph.get_all_includes(id); if(inc_ids.empty()) { continue; } FileNode node; node.path = path_pool.paths[id].str(); auto mod_it = path_to_module.find(id); if(mod_it != path_to_module.end()) { node.module_name = mod_it->second.str(); } for(auto flagged_id: inc_ids) { auto raw_id = flagged_id & DependencyGraph::PATH_ID_MASK; node.includes.push_back(path_pool.paths[raw_id].str()); } export_data.files.push_back(std::move(node)); } auto json = et::serde::json::to_json(export_data); if(!json) { std::println(stderr, "Failed to serialize dependency graph"); return; } std::ofstream out(output_path.str()); if(!out) { std::println(stderr, "Failed to open output file: {}", output_path); return; } out << *json; std::println("Graph exported to {} ({} files)", output_path, export_data.files.size()); } void print_report(const ScanReport& report) { std::println("==============================================================="); std::println(" Dependency Scan Report"); std::println("==============================================================="); // Timing. std::println(""); std::println(" Time: {}ms", report.elapsed_ms); std::println(" Waves: {}", report.waves); // File counts. std::println(""); std::println(" Files"); std::println(" Source files (from CDB): {}", report.source_files); std::println(" Header files (discovered): {}", report.header_files); std::println(" Total: {}", report.total_files); std::println(" Modules: {}", report.modules); // Include edges. std::println(""); std::println(" Include Edges"); std::println(" Total: {}", report.total_edges); std::println(" Unconditional: {}", report.unconditional_edges); std::println(" Conditional: {} (inside #if/#ifdef)", report.conditional_edges); // Resolution accuracy. std::println(""); std::println(" Resolution"); std::println(" #include directives: {}", report.includes_found); std::println(" Resolved: {}", report.includes_resolved); auto unresolved_count = report.includes_found - report.includes_resolved; std::println(" Unresolved: {}", unresolved_count); if(report.includes_found > 0) { double rate = 100.0 * static_cast(report.includes_resolved) / static_cast(report.includes_found); std::println(" Accuracy: {:.1f}%", rate); } // Wall-clock phase breakdown. std::println(""); std::println(" Phase Breakdown (wall-clock)"); std::println(" Config extraction: {}ms (prewarm={}ms, loop={}ms)", report.config_ms, report.prewarm_ms, report.config_loop_ms); std::println(" Dir cache pre-pop: {}ms (overlapped with Phase 1)", report.dir_cache_ms); std::println(" Phase 1 (read+scan, parallel): {}ms", report.phase1_ms); std::println(" Phase 2 (include resolve): {}ms", report.phase2_ms); std::println(" Phase 3 (graph build): {}ms", report.phase3_ms); // Per-wave breakdown. if(!report.wave_stats.empty()) { std::println(""); std::println(" Per-Wave Breakdown"); std::println(" {:>5s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>10s} {:>10s}", "Wave", "Files", "P1(ms)", "P2(ms)", "Next", "Prefetch", "DirList", "DirHits"); for(std::size_t i = 0; i < report.wave_stats.size(); i++) { auto& ws = report.wave_stats[i]; std::println(" {:>5} {:>8} {:>8} {:>8} {:>8} {:>8} {:>10} {:>10}", i, ws.files, ws.phase1_ms, ws.phase2_ms, ws.next_files, ws.prefetch_count, ws.dir_listings, ws.dir_hits); } } // Phase 2 breakdown. if(report.p2_resolve_us > 0) { auto other_us = report.phase2_ms * 1000 - report.p2_resolve_us; std::println(""); std::println(" Phase 2 Breakdown (single-threaded)"); std::println(" resolve_include: {:.1f}ms", report.p2_resolve_us / 1000.0); std::println(" Other (cache lookup, intern, graph): {:.1f}ms", other_us / 1000.0); } // Cumulative I/O statistics. std::println(""); std::println(" I/O Statistics (cumulative across threads)"); std::println(" File read: {:.1f}ms (sum of all threads)", report.read_us / 1000.0); std::println(" Lexer scan: {:.1f}ms (sum of all threads)", report.scan_us / 1000.0); std::println(" Filesystem: {:.1f}ms ({} readdir calls, {} dir cache hits)", report.fs_us / 1000.0, report.dir_listings, report.dir_hits); std::println(" File lookups: {}", report.fs_lookups); std::println(" Include cache hits: {}", report.include_cache_hits); std::println(" Scan result cache hits: {}", report.scan_cache_hits); if(report.dir_listings + report.dir_hits > 0) { double hit_rate = 100.0 * static_cast(report.dir_hits) / static_cast(report.dir_listings + report.dir_hits); std::println(" Dir cache hit rate: {:.1f}%", hit_rate); } std::println(""); std::println("==============================================================="); } int main(int argc, const char** argv) { auto args = deco::util::argvify(argc, argv); auto result = deco::cli::parse(args); if(!result.has_value()) { std::println(stderr, "Error: {}", result.error().message); return 1; } auto& opts = result->options; if(opts.help.value_or(false) || !opts.cdb_path.has_value()) { std::ostringstream oss; deco::cli::write_usage_for(oss, "scan_benchmark [OPTIONS] "); std::print("{}", oss.str()); return opts.help.value_or(false) ? 0 : 1; } // Configure logging. auto level = spdlog::level::from_str(*opts.log_level); clice::logging::options.level = level; clice::logging::stderr_logger("scan_benchmark", clice::logging::options); // resource_dir() is self-initializing (lazy static) — no setup needed. auto& cdb_path = *opts.cdb_path; auto hw_threads = std::thread::hardware_concurrency(); auto runs = *opts.runs; if(runs <= 0) { std::println(stderr, "Error: --runs must be positive (got {})", runs); return 1; } // Set UV_THREADPOOL_SIZE if not already set. // Use at least libuv's default (4) so low-core CI runners don't regress. if(!std::getenv("UV_THREADPOOL_SIZE")) { auto pool_size = std::max(hw_threads, 4u); static std::string env = "UV_THREADPOOL_SIZE=" + std::to_string(pool_size); putenv(env.data()); } std::println("Hardware threads: {}", hw_threads); std::println("UV_THREADPOOL_SIZE: {}", std::getenv("UV_THREADPOOL_SIZE")); std::println("Log level: {}", *opts.log_level); std::println("CDB: {}", cdb_path); std::println(""); // Load compilation database. auto t0 = std::chrono::steady_clock::now(); CompilationDatabase cdb; auto count = cdb.load(cdb_path); auto t1 = std::chrono::steady_clock::now(); auto load_ms = std::chrono::duration_cast(t1 - t0).count(); std::println("CDB loaded: {} entries in {}ms", count, load_ms); { std::set unique_contexts; std::set unique_canonicals; std::map canonical_hist; for(auto& entry: cdb.get_entries()) { unique_contexts.insert(entry.info.ptr); unique_canonicals.insert(entry.info->canonical.ptr); canonical_hist[entry.info->canonical.ptr]++; } double dedup_ratio = unique_contexts.empty() ? 0.0 : static_cast(count) / unique_contexts.size(); std::println( "Context dedup: {} files -> {} unique contexts ({:.1f}x), {} unique canonicals", count, unique_contexts.size(), dedup_ratio, unique_canonicals.size()); // If canonical dedup is poor, dump diagnostics. if(unique_canonicals.size() > 200) { // Sort canonicals by frequency (descending). std::vector> sorted; for(auto& [ptr, cnt]: canonical_hist) sorted.push_back({cnt, ptr}); std::ranges::sort(sorted, std::greater{}, &std::pair::first); // Show top-5 canonical commands. for(int i = 0; i < std::min(5, (int)sorted.size()); i++) { auto [cnt, cmd] = sorted[i]; std::println(" canonical[{}] ({} files, {} args):", i, cnt, cmd->arguments.size()); for(auto arg: cmd->arguments) std::println(" {}", arg); } // Show a singleton canonical (count==1) to see what per-file arg leaks in. for(auto& [cnt, cmd]: sorted) { if(cnt == 1) { std::println(" singleton canonical ({} args):", cmd->arguments.size()); for(auto arg: cmd->arguments) std::println(" {}", arg); break; } } // Find two canonicals that differ by only a few args. if(sorted.size() >= 2) { auto* a = sorted[0].second; auto* b = sorted[1].second; std::println(" --- Canonical diff (top-1 vs top-2) ---"); auto max_len = std::max(a->arguments.size(), b->arguments.size()); for(std::size_t i = 0; i < max_len; i++) { llvm::StringRef av = i < a->arguments.size() ? a->arguments[i] : ""; llvm::StringRef bv = i < b->arguments.size() ? b->arguments[i] : ""; if(av != bv) std::println(" DIFF[{}]: '{}' vs '{}'", i, av, bv); else std::println(" SAME[{}]: '{}'", i, av); } } } } std::println("\nRunning {} cold start scan(s)...\n", runs); PathPool path_pool; DependencyGraph graph; std::vector elapsed_times; std::vector config_times; std::vector phase1_times; std::vector phase2_times; elapsed_times.reserve(runs); config_times.reserve(runs); phase1_times.reserve(runs); phase2_times.reserve(runs); for(int i = 0; i < runs; i++) { // True cold start: rebuild CDB (clears toolchain & config caches), // reset PathPool and DependencyGraph. cdb = CompilationDatabase{}; cdb.load(cdb_path); path_pool = PathPool{}; graph = DependencyGraph{}; auto report = scan_dependency_graph(cdb, path_pool, graph); elapsed_times.push_back(report.elapsed_ms); config_times.push_back(report.config_ms); phase1_times.push_back(report.phase1_ms); phase2_times.push_back(report.phase2_ms); std::println("[run {:2}] {}ms | config={}ms phase1={}ms phase2={}ms | files={}", i + 1, report.elapsed_ms, report.config_ms, report.phase1_ms, report.phase2_ms, report.total_files); // Print detailed report for the first run only. if(i == 0) { std::println(""); print_report(report); } } // Summary statistics. if(runs > 1) { auto stats = [](std::vector& v) { std::ranges::sort(v); auto sum = std::accumulate(v.begin(), v.end(), std::int64_t{0}); return std::tuple{v.front(), sum / static_cast(v.size()), v.back()}; }; auto [e_min, e_avg, e_max] = stats(elapsed_times); auto [c_min, c_avg, c_max] = stats(config_times); auto [p1_min, p1_avg, p1_max] = stats(phase1_times); auto [p2_min, p2_avg, p2_max] = stats(phase2_times); std::println("\n Summary ({} runs) min avg max", runs); std::println(" Total: {:>7} {:>6} {:>6}", e_min, e_avg, e_max); std::println(" Config extraction: {:>7} {:>6} {:>6}", c_min, c_avg, c_max); std::println(" Phase 1 (read+scan):{:>7} {:>6} {:>6}", p1_min, p1_avg, p1_max); std::println(" Phase 2 (resolve): {:>7} {:>6} {:>6}", p2_min, p2_avg, p2_max); } // Export dependency graph as JSON if requested. if(opts.export_path.has_value()) { export_graph_json(path_pool, graph, *opts.export_path); } return 0; }