diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 00000000..afa01819 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,41 @@ +name: benchmark + +on: + pull_request: + branches: [main] + +jobs: + benchmark: + strategy: + fail-fast: false + matrix: + os: [ubuntu-24.04, macos-15, windows-2025] + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - uses: ./.github/actions/setup-pixi + + - name: Build scan_benchmark + run: | + pixi run cmake-config RelWithDebInfo ON + cmake --build build/RelWithDebInfo --target scan_benchmark + + - name: Clone LLVM + run: git clone --depth 1 https://github.com/llvm/llvm-project.git + + - name: Generate CDB + run: | + cmake -B llvm-build -G Ninja \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DCMAKE_TOOLCHAIN_FILE="$(pwd)/cmake/toolchain.cmake" \ + -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;mlir;polly;flang;bolt" \ + -DLLVM_ENABLE_RUNTIMES="compiler-rt;libcxx;libcxxabi;libunwind" \ + llvm-project/llvm + + - name: Run benchmark + run: ./build/RelWithDebInfo/bin/scan_benchmark --runs 20 llvm-build/compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 0905ecbd..7798cfe9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,6 +147,8 @@ add_library(clice-core STATIC "${PROJECT_SOURCE_DIR}/src/support/logging.cpp" "${PROJECT_SOURCE_DIR}/src/syntax/lexer.cpp" "${PROJECT_SOURCE_DIR}/src/syntax/scan.cpp" + "${PROJECT_SOURCE_DIR}/src/syntax/include_resolver.cpp" + "${PROJECT_SOURCE_DIR}/src/syntax/dependency_graph.cpp" "${PROJECT_SOURCE_DIR}/src/feature/semantic_tokens.cpp" "${PROJECT_SOURCE_DIR}/src/feature/document_links.cpp" "${PROJECT_SOURCE_DIR}/src/feature/document_symbols.cpp" @@ -224,3 +226,11 @@ if(CLICE_ENABLE_TEST) ) target_link_libraries(unit_tests PRIVATE clice::core eventide::zest eventide::deco)
endif() + +add_executable(scan_benchmark + "${PROJECT_SOURCE_DIR}/benchmarks/scan_benchmark.cpp" +) +target_include_directories(scan_benchmark PRIVATE + "${PROJECT_SOURCE_DIR}/src" +) +target_link_libraries(scan_benchmark PRIVATE clice::core eventide::deco) diff --git a/benchmarks/scan_benchmark.cpp b/benchmarks/scan_benchmark.cpp new file mode 100644 index 00000000..34f2e6f7 --- /dev/null +++ b/benchmarks/scan_benchmark.cpp @@ -0,0 +1,413 @@ +/// Benchmark for scan_dependency_graph on a real compilation database. +/// +/// Usage: +/// scan_benchmark [OPTIONS] +/// +/// Example: +/// ./build/RelWithDebInfo/bin/scan_benchmark \ +/// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json +/// +/// ./build/RelWithDebInfo/bin/scan_benchmark --log-level info --export graph.json \ +/// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "command/command.h" +#include "eventide/deco/deco.h" +#include "eventide/serde/json/serializer.h" +#include "support/filesystem.h" +#include "support/logging.h" +#include "support/path_pool.h" +#include "syntax/dependency_graph.h" + +#include "llvm/Support/FileSystem.h" + +namespace et = eventide; + +using namespace clice; + +struct BenchmarkOptions { + DecoKV(names = {"--log-level"}; help = "Log level: trace, debug, info, warn, error, off"; + required = false;) + log_level = "off"; + + DecoKV(names = {"--export"}; help = "Export dependency graph as JSON to this path"; + required = false;) + export_path; + + DecoKV(names = {"--runs"}; help = "Number of cold start iterations"; required = false;) + runs = 20; + + DecoFlag(names = {"-h", "--help"}; help = "Show help message"; required = false;) + help; + + DecoInput(meta_var = "CDB"; help = "Path to compile_commands.json"; required = false;) + cdb_path; +}; + +struct FileNode { + std::string path; + std::string module_name; + std::vector includes; +}; + +struct GraphExport 
{ + std::vector files; +}; + +void export_graph_json(const PathPool& path_pool, + const DependencyGraph& graph, + llvm::StringRef output_path) { + // Build reverse module map: path_id -> module_name. + llvm::DenseMap path_to_module; + for(auto& [name, path_ids]: graph.modules()) { + for(auto path_id: path_ids) { + path_to_module[path_id] = name; + } + } + + GraphExport export_data; + for(std::uint32_t id = 0; id < path_pool.paths.size(); id++) { + auto inc_ids = graph.get_all_includes(id); + if(inc_ids.empty()) { + continue; + } + + FileNode node; + node.path = path_pool.paths[id].str(); + + auto mod_it = path_to_module.find(id); + if(mod_it != path_to_module.end()) { + node.module_name = mod_it->second.str(); + } + + for(auto flagged_id: inc_ids) { + auto raw_id = flagged_id & DependencyGraph::PATH_ID_MASK; + node.includes.push_back(path_pool.paths[raw_id].str()); + } + + export_data.files.push_back(std::move(node)); + } + + auto json = et::serde::json::to_json(export_data); + if(!json) { + std::println(stderr, "Failed to serialize dependency graph"); + return; + } + + std::ofstream out(output_path.str()); + if(!out) { + std::println(stderr, "Failed to open output file: {}", output_path); + return; + } + out << *json; + std::println("Graph exported to {} ({} files)", output_path, export_data.files.size()); +} + +void print_report(const ScanReport& report) { + std::println("==============================================================="); + std::println(" Dependency Scan Report"); + std::println("==============================================================="); + + // Timing. + std::println(""); + std::println(" Time: {}ms", report.elapsed_ms); + std::println(" Waves: {}", report.waves); + + // File counts. 
+ std::println(""); + std::println(" Files"); + std::println(" Source files (from CDB): {}", report.source_files); + std::println(" Header files (discovered): {}", report.header_files); + std::println(" Total: {}", report.total_files); + std::println(" Modules: {}", report.modules); + + // Include edges. + std::println(""); + std::println(" Include Edges"); + std::println(" Total: {}", report.total_edges); + std::println(" Unconditional: {}", report.unconditional_edges); + std::println(" Conditional: {} (inside #if/#ifdef)", report.conditional_edges); + + // Resolution accuracy. + std::println(""); + std::println(" Resolution"); + std::println(" #include directives: {}", report.includes_found); + std::println(" Resolved: {}", report.includes_resolved); + auto unresolved_count = report.includes_found - report.includes_resolved; + std::println(" Unresolved: {}", unresolved_count); + if(report.includes_found > 0) { + double rate = 100.0 * static_cast(report.includes_resolved) / + static_cast(report.includes_found); + std::println(" Accuracy: {:.1f}%", rate); + } + + // Wall-clock phase breakdown. + std::println(""); + std::println(" Phase Breakdown (wall-clock)"); + std::println(" Config extraction: {}ms (prewarm={}ms, loop={}ms)", + report.config_ms, + report.prewarm_ms, + report.config_loop_ms); + std::println(" Dir cache pre-pop: {}ms (overlapped with Phase 1)", report.dir_cache_ms); + std::println(" Phase 1 (read+scan, parallel): {}ms", report.phase1_ms); + std::println(" Phase 2 (include resolve): {}ms", report.phase2_ms); + std::println(" Phase 3 (graph build): {}ms", report.phase3_ms); + + // Per-wave breakdown. 
+ if(!report.wave_stats.empty()) { + std::println(""); + std::println(" Per-Wave Breakdown"); + std::println(" {:>5s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>10s} {:>10s}", + "Wave", + "Files", + "P1(ms)", + "P2(ms)", + "Next", + "Prefetch", + "DirList", + "DirHits"); + for(std::size_t i = 0; i < report.wave_stats.size(); i++) { + auto& ws = report.wave_stats[i]; + std::println(" {:>5} {:>8} {:>8} {:>8} {:>8} {:>8} {:>10} {:>10}", + i, + ws.files, + ws.phase1_ms, + ws.phase2_ms, + ws.next_files, + ws.prefetch_count, + ws.dir_listings, + ws.dir_hits); + } + } + + // Phase 2 breakdown. + if(report.p2_resolve_us > 0) { + auto other_us = report.phase2_ms * 1000 - report.p2_resolve_us; + std::println(""); + std::println(" Phase 2 Breakdown (single-threaded)"); + std::println(" resolve_include: {:.1f}ms", report.p2_resolve_us / 1000.0); + std::println(" Other (cache lookup, intern, graph): {:.1f}ms", other_us / 1000.0); + } + + // Cumulative I/O statistics. + std::println(""); + std::println(" I/O Statistics (cumulative across threads)"); + std::println(" File read: {:.1f}ms (sum of all threads)", report.read_us / 1000.0); + std::println(" Lexer scan: {:.1f}ms (sum of all threads)", report.scan_us / 1000.0); + std::println(" Filesystem: {:.1f}ms ({} readdir calls, {} dir cache hits)", + report.fs_us / 1000.0, + report.dir_listings, + report.dir_hits); + std::println(" File lookups: {}", report.fs_lookups); + std::println(" Include cache hits: {}", report.include_cache_hits); + std::println(" Scan result cache hits: {}", report.scan_cache_hits); + if(report.dir_listings + report.dir_hits > 0) { + double hit_rate = 100.0 * static_cast(report.dir_hits) / + static_cast(report.dir_listings + report.dir_hits); + std::println(" Dir cache hit rate: {:.1f}%", hit_rate); + } + + std::println(""); + std::println("==============================================================="); +} + +int main(int argc, const char** argv) { + auto args = deco::util::argvify(argc, argv); + auto result 
= deco::cli::parse(args); + + if(!result.has_value()) { + std::println(stderr, "Error: {}", result.error().message); + return 1; + } + + auto& opts = result->options; + + if(opts.help.value_or(false) || !opts.cdb_path.has_value()) { + std::ostringstream oss; + deco::cli::write_usage_for(oss, "scan_benchmark [OPTIONS] "); + std::print("{}", oss.str()); + return opts.help.value_or(false) ? 0 : 1; + } + + // Configure logging. + auto level = spdlog::level::from_str(*opts.log_level); + clice::logging::options.level = level; + clice::logging::stderr_logger("scan_benchmark", clice::logging::options); + + // resource_dir() is self-initializing (lazy static) — no setup needed. + + auto& cdb_path = *opts.cdb_path; + auto hw_threads = std::thread::hardware_concurrency(); + auto runs = *opts.runs; + if(runs <= 0) { + std::println(stderr, "Error: --runs must be positive (got {})", runs); + return 1; + } + + // Set UV_THREADPOOL_SIZE if not already set. + // Use at least libuv's default (4) so low-core CI runners don't regress. + if(!std::getenv("UV_THREADPOOL_SIZE")) { + auto pool_size = std::max(hw_threads, 4u); + static std::string env = "UV_THREADPOOL_SIZE=" + std::to_string(pool_size); + putenv(env.data()); + } + + std::println("Hardware threads: {}", hw_threads); + std::println("UV_THREADPOOL_SIZE: {}", std::getenv("UV_THREADPOOL_SIZE")); + std::println("Log level: {}", *opts.log_level); + std::println("CDB: {}", cdb_path); + std::println(""); + + // Load compilation database. 
+ auto t0 = std::chrono::steady_clock::now(); + + CompilationDatabase cdb; + auto count = cdb.load(cdb_path); + + auto t1 = std::chrono::steady_clock::now(); + auto load_ms = std::chrono::duration_cast(t1 - t0).count(); + + std::println("CDB loaded: {} entries in {}ms", count, load_ms); + + { + std::set unique_contexts; + std::set unique_canonicals; + std::map canonical_hist; + for(auto& entry: cdb.get_entries()) { + unique_contexts.insert(entry.info.ptr); + unique_canonicals.insert(entry.info->canonical.ptr); + canonical_hist[entry.info->canonical.ptr]++; + } + double dedup_ratio = + unique_contexts.empty() ? 0.0 : static_cast(count) / unique_contexts.size(); + std::println( + "Context dedup: {} files -> {} unique contexts ({:.1f}x), {} unique canonicals", + count, + unique_contexts.size(), + dedup_ratio, + unique_canonicals.size()); + + // If canonical dedup is poor, dump diagnostics. + if(unique_canonicals.size() > 200) { + // Sort canonicals by frequency (descending). + std::vector> sorted; + for(auto& [ptr, cnt]: canonical_hist) + sorted.push_back({cnt, ptr}); + std::ranges::sort(sorted, + std::greater{}, + &std::pair::first); + + // Show top-5 canonical commands. + for(int i = 0; i < std::min(5, (int)sorted.size()); i++) { + auto [cnt, cmd] = sorted[i]; + std::println(" canonical[{}] ({} files, {} args):", i, cnt, cmd->arguments.size()); + for(auto arg: cmd->arguments) + std::println(" {}", arg); + } + + // Show a singleton canonical (count==1) to see what per-file arg leaks in. + for(auto& [cnt, cmd]: sorted) { + if(cnt == 1) { + std::println(" singleton canonical ({} args):", cmd->arguments.size()); + for(auto arg: cmd->arguments) + std::println(" {}", arg); + break; + } + } + + // Find two canonicals that differ by only a few args. 
+ if(sorted.size() >= 2) { + auto* a = sorted[0].second; + auto* b = sorted[1].second; + std::println(" --- Canonical diff (top-1 vs top-2) ---"); + auto max_len = std::max(a->arguments.size(), b->arguments.size()); + for(std::size_t i = 0; i < max_len; i++) { + llvm::StringRef av = i < a->arguments.size() ? a->arguments[i] : ""; + llvm::StringRef bv = i < b->arguments.size() ? b->arguments[i] : ""; + if(av != bv) + std::println(" DIFF[{}]: '{}' vs '{}'", i, av, bv); + else + std::println(" SAME[{}]: '{}'", i, av); + } + } + } + } + + std::println("\nRunning {} cold start scan(s)...\n", runs); + + PathPool path_pool; + DependencyGraph graph; + std::vector elapsed_times; + std::vector config_times; + std::vector phase1_times; + std::vector phase2_times; + elapsed_times.reserve(runs); + config_times.reserve(runs); + phase1_times.reserve(runs); + phase2_times.reserve(runs); + + for(int i = 0; i < runs; i++) { + // True cold start: rebuild CDB (clears toolchain & config caches), + // reset PathPool and DependencyGraph. + cdb = CompilationDatabase{}; + cdb.load(cdb_path); + path_pool = PathPool{}; + graph = DependencyGraph{}; + + auto report = scan_dependency_graph(cdb, path_pool, graph); + + elapsed_times.push_back(report.elapsed_ms); + config_times.push_back(report.config_ms); + phase1_times.push_back(report.phase1_ms); + phase2_times.push_back(report.phase2_ms); + + std::println("[run {:2}] {}ms | config={}ms phase1={}ms phase2={}ms | files={}", + i + 1, + report.elapsed_ms, + report.config_ms, + report.phase1_ms, + report.phase2_ms, + report.total_files); + + // Print detailed report for the first run only. + if(i == 0) { + std::println(""); + print_report(report); + } + } + + // Summary statistics. 
+ if(runs > 1) { + auto stats = [](std::vector& v) { + std::ranges::sort(v); + auto sum = std::accumulate(v.begin(), v.end(), std::int64_t{0}); + return std::tuple{v.front(), sum / static_cast(v.size()), v.back()}; + }; + auto [e_min, e_avg, e_max] = stats(elapsed_times); + auto [c_min, c_avg, c_max] = stats(config_times); + auto [p1_min, p1_avg, p1_max] = stats(phase1_times); + auto [p2_min, p2_avg, p2_max] = stats(phase2_times); + + std::println("\n Summary ({} runs) min avg max", runs); + std::println(" Total: {:>7} {:>6} {:>6}", e_min, e_avg, e_max); + std::println(" Config extraction: {:>7} {:>6} {:>6}", c_min, c_avg, c_max); + std::println(" Phase 1 (read+scan):{:>7} {:>6} {:>6}", p1_min, p1_avg, p1_max); + std::println(" Phase 2 (resolve): {:>7} {:>6} {:>6}", p2_min, p2_avg, p2_max); + } + + // Export dependency graph as JSON if requested. + if(opts.export_path.has_value()) { + export_graph_json(path_pool, graph, *opts.export_path); + } + + return 0; +} diff --git a/src/command/argument_parser.cpp b/src/command/argument_parser.cpp index 3fa3fc3f..55543b9f 100644 --- a/src/command/argument_parser.cpp +++ b/src/command/argument_parser.cpp @@ -5,9 +5,11 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Options.h" +#include "clang/Driver/Types.h" namespace clice { @@ -40,19 +42,23 @@ auto& option_table = driver::getDriverOptTable(); std::unique_ptr ArgumentParser::parse_one(unsigned& index) { assert(!enable_dash_dash_parsing(option_table)); assert(!enable_grouped_short_options(option_table)); - return option_table.ParseOneArg(*this, index); + return option_table.ParseOneArg(*this, index, opt::Visibility(visibility_mask)); } using ID = clang::driver::options::ID; bool is_discarded_option(unsigned id) { switch(id) { - /// Input file and output — we manage these ourselves. 
+ /// Input file, unknown args, and output — we manage these ourselves. case ID::OPT_INPUT: + case ID::OPT_UNKNOWN: + case ID::OPT__DASH_DASH: case ID::OPT_c: case ID::OPT_o: case ID::OPT_dxc_Fc: case ID::OPT_dxc_Fo: + case ID::OPT__SLASH_Fo: + case ID::OPT__SLASH_Fd: /// PCH building. case ID::OPT_emit_pch: @@ -115,6 +121,24 @@ bool is_xclang_option(unsigned id) { return id == ID::OPT_Xclang; } +bool is_toolchain_option(unsigned id) { + switch(id) { + case ID::OPT_target: + case ID::OPT_target_legacy_spelling: + case ID::OPT_isysroot: + case ID::OPT__sysroot_EQ: + case ID::OPT__sysroot: + case ID::OPT_stdlib_EQ: + case ID::OPT_gcc_toolchain: + case ID::OPT_gcc_install_dir_EQ: + case ID::OPT_nostdinc: + case ID::OPT_nostdincxx: + case ID::OPT_std_EQ: + case ID::OPT_x: return true; + default: return false; + } +} + std::optional get_option_id(llvm::StringRef argument) { llvm::SmallString<64> buffer = argument; @@ -221,4 +245,33 @@ std::string print_argv(llvm::ArrayRef args) { return std::move(os.str()); } +unsigned default_visibility(llvm::StringRef driver) { + namespace options = clang::driver::options; + auto name = llvm::sys::path::filename(driver); + name.consume_back_insensitive(".exe"); // suffix may be spelled .EXE; keep in step with the case-insensitive is_cl match below + + auto is_cl = [](llvm::StringRef s) { + return s.equals_insensitive("cl") || s.equals_insensitive("clang-cl"); + }; + + /// cl.exe and clang-cl.exe both need MSVC-style /options. + /// Also handle versioned names like clang-cl-17, clang-cl-17.0.1. + if(is_cl(name) || is_cl(name.rtrim("0123456789.-"))) { + return ~0u; + } + /// Exclude CLOption to prevent /U, /D, /I from matching Unix paths. + return ~static_cast(options::CLOption); +} + +bool is_c_family_file(llvm::StringRef filename) { + namespace types = clang::driver::types; + auto ext = llvm::sys::path::extension(filename); + if(ext.empty()) { + return false; + } + /// Drop the leading dot: ".cpp" → "cpp".
+ auto type = types::lookupTypeForExtension(ext.drop_front()); + return type != types::TY_INVALID && types::isAcceptedByClang(type); +} + } // namespace clice diff --git a/src/command/argument_parser.h b/src/command/argument_parser.h index 0d1f5174..deb45289 100644 --- a/src/command/argument_parser.h +++ b/src/command/argument_parser.h @@ -38,6 +38,14 @@ public: return p; } + /// Set visibility mask for option parsing. The default (~0u) accepts all + /// options. Pass a narrower mask to exclude option groups — e.g. exclude + /// MSVC cl.exe-style /U, /D, /I options that would otherwise misparse + /// Unix absolute paths like /Users/... on macOS. + void set_visibility(unsigned mask) { + visibility_mask = mask; + } + /// Parse a single argument at the given index. Defined out-of-line in /// argument_parser.cpp to isolate the heavy clang driver option table include. std::unique_ptr parse_one(unsigned& index); @@ -72,6 +80,7 @@ public: private: llvm::BumpPtrAllocator* allocator; + unsigned visibility_mask = ~0u; llvm::ArrayRef arguments; }; @@ -101,6 +110,10 @@ bool is_include_path_option(unsigned id); /// Check if this is the -Xclang pass-through option. bool is_xclang_option(unsigned id); +/// Options that affect system path discovery and should be included in the +/// toolchain cache key. Only these flags are passed to the toolchain query. +bool is_toolchain_option(unsigned id); + /// Get the option ID for a specific argument string. std::optional get_option_id(llvm::StringRef argument); @@ -111,4 +124,15 @@ llvm::StringRef resource_dir(); /// Format an argument list as a human-readable string: "[arg1 arg2 ...]". std::string print_argv(llvm::ArrayRef args); +/// Return the visibility mask to exclude MSVC cl.exe-style options (/U, /D, +/// /I, etc.) unless the driver is cl.exe. This prevents Unix absolute paths +/// like /Users/... from being misparsed as /U sers/... on macOS/Linux. +/// Defined out-of-line in argument_parser.cpp (needs ClangVisibility enum). 
+unsigned default_visibility(llvm::StringRef driver); + +/// Check if a filename has a C/C++/ObjC/CUDA/etc. extension accepted by clang. +/// Returns false for .rc (Windows resource), .asm, .def, and other non-C-family files. +/// Defined out-of-line in argument_parser.cpp (needs clang::driver::types). +bool is_c_family_file(llvm::StringRef filename); + } // namespace clice diff --git a/src/command/command.cpp b/src/command/command.cpp index 3e91a8a6..210fa8fe 100644 --- a/src/command/command.cpp +++ b/src/command/command.cpp @@ -104,6 +104,8 @@ object_ptr bool remove_pch = false; + parser->set_visibility(default_visibility(arguments[0])); + auto on_error = [&](int index, int count) { LOG_WARN("missing argument index: {}, count: {} when parse: {}", index, count, file); }; @@ -262,6 +264,13 @@ std::size_t CompilationDatabase::load(llvm::StringRef path) { llvm::StringRef dir_ref(dir_sv.data(), dir_sv.size()); llvm::StringRef file_ref(file_sv.data(), file_sv.size()); + // Skip non-C-family files (e.g. .rc, .asm, .def) that some build + // systems emit into compile_commands.json. + if(!is_c_family_file(file_ref)) { + ++index; + continue; + } + // Resolve relative file paths against the directory so that entries // from different directories don't collide in the PathPool. std::string file_abs; @@ -353,6 +362,9 @@ llvm::SmallVector CompilationDatabase::lookup(llvm::StringRe append_args(info->patch); } else { arguments.assign(cached.begin(), cached.end()); + // TODO: add an assertion that the last arg is the temp source + // file (e.g., contains "query-toolchain") to guard against + // future changes in clang cc1 argument ordering. arguments.pop_back(); // remove temp source file // Replace resource dir if needed. 
@@ -530,13 +542,13 @@ CompilationDatabase::ToolchainExtract result.query_args.push_back(arguments[0]); + parser->set_visibility(default_visibility(arguments[0])); + parser->parse( llvm::ArrayRef(arguments).drop_front(), [&](std::unique_ptr arg) { - auto& opt = arg->getOption(); - auto id = opt.getID(); - if(is_discarded_option(id) || is_user_content_option(id) || - is_codegen_option(id, opt)) { + auto id = arg->getOption().getID(); + if(!is_toolchain_option(id)) { return; } @@ -634,6 +646,14 @@ llvm::StringRef CompilationDatabase::resolve_path(std::uint32_t path_id) { return paths.resolve(path_id); } +std::uint32_t CompilationDatabase::intern_path(llvm::StringRef path) { + return paths.intern(path); +} + +llvm::ArrayRef CompilationDatabase::get_entries() const { + return entries; +} + #ifdef CLICE_ENABLE_TEST void CompilationDatabase::add_command(llvm::StringRef directory, diff --git a/src/command/command.h b/src/command/command.h index 75f26571..cfa27954 100644 --- a/src/command/command.h +++ b/src/command/command.h @@ -188,6 +188,12 @@ public: /// Resolve a path_id back to the file path string. llvm::StringRef resolve_path(std::uint32_t path_id); + /// Intern a file path and return its path_id. + std::uint32_t intern_path(llvm::StringRef path); + + /// All compilation entries (sorted by path_id). + llvm::ArrayRef get_entries() const; + /// Entry for batch pre-warming: file + directory + raw compilation arguments. 
struct PendingEntry { llvm::StringRef file; diff --git a/src/command/search_config.cpp b/src/command/search_config.cpp index 497d2bf7..34ea2140 100644 --- a/src/command/search_config.cpp +++ b/src/command/search_config.cpp @@ -37,6 +37,7 @@ SearchConfig extract_search_config(llvm::ArrayRef arguments, llvm::BumpPtrAllocator allocator; ArgumentParser parser{&allocator}; + parser.set_visibility(default_visibility(arguments[0])); parser.parse( llvm::ArrayRef(arguments).drop_front(), diff --git a/src/server/master_server.cpp b/src/server/master_server.cpp index 63fec8d9..7b4beaf0 100644 --- a/src/server/master_server.cpp +++ b/src/server/master_server.cpp @@ -14,6 +14,7 @@ #include "server/protocol.h" #include "support/filesystem.h" #include "support/logging.h" +#include "syntax/dependency_graph.h" namespace clice { @@ -194,6 +195,28 @@ et::task<> MasterServer::load_workspace() { auto count = cdb.load(cdb_path); LOG_INFO("Loaded CDB from {} with {} entries", cdb_path, count); + + auto report = scan_dependency_graph(cdb, path_pool, dependency_graph); + + auto unresolved = report.includes_found - report.includes_resolved; + double accuracy = + report.includes_found > 0 + ? 
100.0 * static_cast(report.includes_resolved) / report.includes_found + : 100.0; + LOG_INFO( + "Dependency scan: {}ms, {} files ({} source + {} header), " "{} edges, {}/{} resolved ({:.1f}%), {} waves", + report.elapsed_ms, + report.total_files, + report.source_files, + report.header_files, + report.total_edges, + report.includes_resolved, + report.includes_found, + accuracy, + report.waves); + if(unresolved > 0) { + LOG_WARN("{} unresolved includes", unresolved); + } } void MasterServer::fill_compile_args(llvm::StringRef path, diff --git a/src/server/master_server.h b/src/server/master_server.h index dc440dd6..f87d582d 100644 --- a/src/server/master_server.h +++ b/src/server/master_server.h @@ -11,38 +11,19 @@ #include "eventide/serde/serde/raw_value.h" #include "server/config.h" #include "server/worker_pool.h" +#include "support/path_pool.h" +#include "syntax/dependency_graph.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Allocator.h" namespace clice { namespace et = eventide; namespace protocol = et::ipc::protocol; -/// Global path interning pool. Maps file paths to uint32_t IDs. 
-struct ServerPathPool { - llvm::BumpPtrAllocator allocator; - llvm::SmallVector paths; - llvm::StringMap cache; - - std::uint32_t intern(llvm::StringRef path) { - auto [it, inserted] = cache.try_emplace(path, paths.size()); - if(inserted) { - auto saved = path.copy(allocator); - paths.push_back(saved); - } - return it->second; - } - - llvm::StringRef resolve(std::uint32_t id) const { - return paths[id]; - } -}; - struct DocumentState { int version = 0; std::string text; @@ -70,7 +51,7 @@ private: et::event_loop& loop; et::ipc::JsonPeer& peer; WorkerPool pool; - ServerPathPool path_pool; + PathPool path_pool; ServerLifecycle lifecycle = ServerLifecycle::Uninitialized; std::string self_path; @@ -78,6 +59,7 @@ private: CliceConfig config; CompilationDatabase cdb; + DependencyGraph dependency_graph; // Document state: path_id -> DocumentState llvm::DenseMap documents; diff --git a/src/syntax/dependency_graph.cpp b/src/syntax/dependency_graph.cpp new file mode 100644 index 00000000..9f9c8d37 --- /dev/null +++ b/src/syntax/dependency_graph.cpp @@ -0,0 +1,708 @@ +#include "syntax/dependency_graph.h" + +#include + +#include "command/toolchain.h" +#include "eventide/async/async.h" +#include "support/logging.h" +#include "syntax/include_resolver.h" +#include "syntax/scan.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/StringSaver.h" + +namespace clice { + +namespace et = eventide; + +// ============================================================================ +// DependencyGraph implementation +// ============================================================================ + +void DependencyGraph::add_module(llvm::StringRef module_name, std::uint32_t path_id) { + auto& ids = module_to_path[module_name]; + if(llvm::find(ids, path_id) == ids.end()) { + ids.push_back(path_id); + } +} + +llvm::ArrayRef 
DependencyGraph::lookup_module(llvm::StringRef module_name) const { + auto it = module_to_path.find(module_name); + if(it != module_to_path.end()) { + return it->second; + } + return {}; +} + +void DependencyGraph::set_includes(std::uint32_t path_id, + std::uint32_t config_id, + llvm::SmallVector included_ids) { + IncludeKey key{path_id, config_id}; + includes[key] = std::move(included_ids); + auto& configs = file_configs[path_id]; + if(std::find(configs.begin(), configs.end(), config_id) == configs.end()) { + configs.push_back(config_id); + } +} + +llvm::ArrayRef DependencyGraph::get_includes(std::uint32_t path_id, + std::uint32_t config_id) const { + auto it = includes.find(IncludeKey{path_id, config_id}); + if(it != includes.end()) { + return it->second; + } + return {}; +} + +llvm::SmallVector DependencyGraph::get_all_includes(std::uint32_t path_id) const { + llvm::DenseMap seen; // raw_id -> index in result + llvm::SmallVector result; + + auto fc_it = file_configs.find(path_id); + if(fc_it == file_configs.end()) { + return result; + } + + for(auto config_id: fc_it->second) { + auto it = includes.find(IncludeKey{path_id, config_id}); + if(it != includes.end()) { + for(auto id: it->second) { + auto raw_id = id & PATH_ID_MASK; + auto [sit, inserted] = seen.try_emplace(raw_id, result.size()); + if(inserted) { + result.push_back(id); + } else if(!(id & CONDITIONAL_FLAG)) { + // Unconditional include wins over conditional. 
+ result[sit->second] = raw_id; + } + } + } + } + return result; +} + +std::size_t DependencyGraph::file_count() const { + return file_configs.size(); +} + +std::size_t DependencyGraph::module_count() const { + return module_to_path.size(); +} + +std::size_t DependencyGraph::edge_count() const { + std::size_t count = 0; + for(auto& [key, ids]: includes) { + count += ids.size(); + } + return count; +} + +// ============================================================================ +// Wavefront BFS scanner — async implementation +// ============================================================================ + +namespace { + +/// Result of scanning a single file (returned from worker thread). +struct FileScanResult { + const char* path; // Stable pointer from PathPool. + std::uint32_t path_id; + std::uint32_t config_id; + ScanResult scan_result; + bool read_failed = false; + std::int64_t read_us = 0; + std::int64_t scan_us = 0; +}; + +/// Scan a single file: read content + lexer scan. +/// Runs on libuv worker thread via queue(). +/// @param path Stable pointer from PathPool (must outlive the task). +FileScanResult scan_file_worker(const char* path, std::uint32_t path_id, std::uint32_t config_id) { + FileScanResult result; + result.path = path; + result.path_id = path_id; + result.config_id = config_id; + + auto t0 = std::chrono::steady_clock::now(); + // Force read() instead of mmap: RequiresNullTerminator=true makes LLVM + // fall back to read() for page-aligned files, and IsVolatile=true forces + // read() unconditionally — bypassing mmap entirely. This separates + // actual I/O cost from page-fault cost that was previously hidden inside + // the lexer timing. 
+ auto buf = llvm::MemoryBuffer::getFile(result.path, + /*FileSize=*/-1, + /*RequiresNullTerminator=*/true, + /*IsVolatile=*/true); + auto t1 = std::chrono::steady_clock::now(); + result.read_us = std::chrono::duration_cast(t1 - t0).count(); + + if(!buf) { + result.read_failed = true; + return result; + } + + result.scan_result = scan((*buf)->getBuffer()); + auto t2 = std::chrono::steady_clock::now(); + result.scan_us = std::chrono::duration_cast(t2 - t1).count(); + + return result; +} + +/// The async scan implementation that runs on a local event loop. +et::task<> scan_impl(CompilationDatabase& cdb, + PathPool& path_pool, + DependencyGraph& graph, + ScanReport& report, + ScanCache* ext_cache, + et::event_loop& loop) { + auto start_time = std::chrono::steady_clock::now(); + + // Reuse context groups and configs from cache when available (warm runs). + // On the first call (or when cache is null) we build everything from scratch. + const bool have_config_cache = + ext_cache && !ext_cache->context_groups.empty() && !ext_cache->configs.empty(); + + // Provide local storage when not using the persistent cache. + llvm::DenseMap> local_context_groups; + llvm::DenseMap local_context_to_config_id; + llvm::DenseMap local_configs; + + // When ext_cache is provided, write directly into it so that the data + // survives across calls (making have_config_cache true on run 2+). + llvm::DenseMap>& context_groups = + ext_cache ? ext_cache->context_groups : local_context_groups; + llvm::DenseMap& context_to_config_id = + ext_cache ? ext_cache->context_to_config_id : local_context_to_config_id; + llvm::DenseMap& configs = + ext_cache ? ext_cache->configs : local_configs; + + auto config_start = std::chrono::steady_clock::now(); + + if(!have_config_cache) { + // Group files by CompilationInfo pointer to identify unique compilation commands. + // Convert CDB path IDs to PathPool IDs. 
+ for(auto& entry: cdb.get_entries()) { + auto path = cdb.resolve_path(entry.file); + auto pool_id = path_pool.intern(path); + context_groups[entry.info.ptr].push_back(pool_id); + } + + // Pre-warm toolchain cache: extract unique queries, execute in parallel. + // Skip entirely when configs are already cached (warm runs), since the + // toolchain cache is necessarily also populated from the previous scan. + auto prewarm_start = std::chrono::steady_clock::now(); + if(!cdb.has_cached_configs()) { + std::vector pending_entries; + for(auto& [info_ptr, file_ids]: context_groups) { + auto representative_path = path_pool.resolve(file_ids[0]); + CompilationDatabase::PendingEntry pe; + pe.file = representative_path; + pe.directory = info_ptr->directory; + // Reconstruct arguments: canonical args + patch args. + for(auto arg: info_ptr->canonical->arguments) { + pe.arguments.push_back(arg); + } + for(auto arg: info_ptr->patch) { + pe.arguments.push_back(arg); + } + pending_entries.push_back(std::move(pe)); + } + + auto pending = cdb.get_pending_queries(pending_entries); + if(!pending.empty()) { + LOG_INFO("Warming toolchain cache: {} unique queries", pending.size()); + + std::vector> tasks; + tasks.reserve(pending.size()); + for(auto& query: pending) { + tasks.push_back(et::queue( + [q = std::move(query)]() -> ToolchainResult { + ToolchainResult result; + result.key = q.key; + llvm::BumpPtrAllocator alloc; + llvm::StringSaver saver(alloc); + toolchain::query_toolchain({q.file, + q.directory, + q.query_args, + [&](const char* s) -> const char* { + result.cc1_args.push_back(s); + return saver.save(s).data(); + }}); + return result; + }, + loop)); + } + + auto outcome = co_await et::when_all(std::move(tasks)); + if(outcome.has_value()) { + cdb.inject_results(*outcome); + } else { + LOG_ERROR("Parallel toolchain query failed: {}", outcome.error().message()); + } + } + } + auto prewarm_end = std::chrono::steady_clock::now(); + report.prewarm_ms = + 
std::chrono::duration_cast(prewarm_end - prewarm_start) + .count(); + + // Extract SearchConfig for each unique context. + std::uint32_t next_config_id = 0; + std::int64_t lookup_us = 0; + for(auto& [context, file_ids]: context_groups) { + std::uint32_t config_id = next_config_id++; + context_to_config_id[context] = config_id; + auto representative_path = path_pool.resolve(file_ids[0]); + auto t0 = std::chrono::steady_clock::now(); + configs[config_id] = + cdb.lookup_search_config(representative_path, {.query_toolchain = true}); + auto t1 = std::chrono::steady_clock::now(); + lookup_us += std::chrono::duration_cast(t1 - t0).count(); + } + report.config_loop_ms = lookup_us / 1000; + LOG_INFO("Config extracted: {} groups, {:.1f}ms", configs.size(), lookup_us / 1000.0); + } + + auto config_end = std::chrono::steady_clock::now(); + report.config_ms = + std::chrono::duration_cast(config_end - config_start).count(); + + // Use external persistent cache when provided, otherwise create a local one. + DirListingCache local_dir_cache; + DirListingCache& dir_cache = ext_cache ? ext_cache->dir_cache : local_dir_cache; + + llvm::StringMap local_include_cache; + llvm::StringMap& include_cache = + ext_cache ? ext_cache->include_cache : local_include_cache; + + // Collect all unique search dirs and launch readdir tasks on the + // thread pool. Tasks start executing immediately but are NOT awaited + // here — instead they run concurrently with Wave 0's file scanning + // (Optimization 1: overlap dir cache with Phase 1). We only await + // them before Phase 2 of Wave 0, which is the first consumer. + + struct DirEntry { + std::string dir_path; + llvm::StringSet<> entries; + }; + + std::vector> pending_dir_tasks; + + if(dir_cache.dirs.empty()) { + llvm::StringSet<> unique_dirs; + for(auto& [config_id, config]: configs) { + for(auto& dir: config.dirs) { + unique_dirs.insert(dir.path); + } + } + // Also prefetch parent directories of source files (for quoted include resolution). 
+ for(auto& [context, file_ids]: context_groups) { + for(auto path_id: file_ids) { + auto dir = llvm::sys::path::parent_path(path_pool.resolve(path_id)); + if(!dir.empty()) { + unique_dirs.insert(dir); + } + } + } + + pending_dir_tasks.reserve(unique_dirs.size()); + for(auto& entry: unique_dirs) { + auto dir_path = entry.getKey().str(); + pending_dir_tasks.push_back(et::queue( + [dir_path = std::move(dir_path)]() -> DirEntry { + DirEntry result; + result.dir_path = dir_path; + std::error_code ec; + llvm::sys::fs::directory_iterator di(result.dir_path, ec); + for(; !ec && di != llvm::sys::fs::directory_iterator(); di.increment(ec)) { + result.entries.insert(llvm::sys::path::filename(di->path())); + } + return result; + }, + loop)); + } + LOG_INFO("Launched {} dir cache tasks (running in background)", pending_dir_tasks.size()); + } + + // Track which files have been scanned (by path_id — cheaper than string hash). + // Value: found_dir_idx needed for #include_next. + llvm::DenseMap scanned_files; + + // Wave 0: all source files from CDB. + // Re-use the cached initial_wave when available to avoid re-iterating context_groups. + std::vector current_wave; + const bool have_initial_wave_cache = ext_cache && !ext_cache->initial_wave.empty(); + if(have_initial_wave_cache) { + current_wave = ext_cache->initial_wave; + for(auto& entry: current_wave) { + scanned_files.try_emplace(entry.path_id, entry.found_dir_idx); + } + } else { + current_wave.reserve(cdb.get_entries().size()); + for(auto& [context, file_ids]: context_groups) { + auto config_id = context_to_config_id[context]; + for(auto path_id: file_ids) { + scanned_files.try_emplace(path_id, 0u); + current_wave.push_back({path_id, config_id, /*found_dir_idx=*/0}); + } + } + if(ext_cache) { + ext_cache->initial_wave = current_wave; + } + } + + report.source_files = current_wave.size(); + std::size_t wave_num = 0; + + // Optimization 2: prefetch scan tasks. 
+ // During Phase 2 of wave N, newly discovered files are immediately + // queued for scanning on the thread pool. When wave N+1 starts, + // these tasks are already running (or finished), eliminating most + // of the Phase 1 wait time for subsequent waves. + std::vector> prefetch_tasks; + + // Pre-resolved search configs: built once after dir cache is populated, + // then reused for all waves. Eliminates StringMap lookups in Phase 2. + llvm::DenseMap resolved_configs; + + while(!current_wave.empty()) { + auto wave_start = std::chrono::steady_clock::now(); + + // Phase 1: Read + scan all files in parallel on the thread pool. + // Files with a cached ScanResult skip I/O and lexing entirely. + // For waves > 0, files discovered during the previous wave's Phase 2 + // already have running scan tasks in prefetch_tasks. + std::vector scan_results; + scan_results.reserve(current_wave.size()); + std::size_t wave_cache_hits = 0; + + // Collect cache hits first (applies to all waves). + for(auto& entry: current_wave) { + if(ext_cache) { + auto it = ext_cache->scan_results.find(entry.path_id); + if(it != ext_cache->scan_results.end()) { + scan_results.push_back({path_pool.resolve(entry.path_id).data(), + entry.path_id, + entry.config_id, + it->second, + false, + 0, + 0}); + report.scan_cache_hits++; + wave_cache_hits++; + } + } + } + + if(!prefetch_tasks.empty()) { + // Waves 1+: await prefetched scan tasks from previous Phase 2. + auto scan_outcome = co_await et::when_all(std::move(prefetch_tasks)); + prefetch_tasks.clear(); + if(scan_outcome.has_error()) { + LOG_ERROR("Prefetch scan failed: {}", scan_outcome.error().message()); + break; + } + for(auto& r: *scan_outcome) { + if(!r.read_failed && ext_cache) { + ext_cache->scan_results.try_emplace(r.path_id, r.scan_result); + } + scan_results.push_back(std::move(r)); + } + } else { + // Wave 0 (or warm run with all cache hits): create scan tasks now. 
+ std::vector> scan_tasks; + scan_tasks.reserve(current_wave.size()); + for(auto& entry: current_wave) { + auto pid = entry.path_id; + auto cid = entry.config_id; + // Skip files already served from cache above. + if(ext_cache && ext_cache->scan_results.count(pid)) { + continue; + } + auto path = path_pool.resolve(pid).data(); + scan_tasks.push_back( + et::queue([path, pid, cid]() { return scan_file_worker(path, pid, cid); }, + loop)); + } + + // Optimization 1: await dir cache tasks concurrently with scan tasks. + // Both sets of tasks run on the same thread pool. By awaiting dir + // tasks first (while scan tasks continue in the background), we pay + // max(dir_time, scan_time) instead of dir_time + scan_time. + if(!pending_dir_tasks.empty()) { + auto dir_t0 = std::chrono::steady_clock::now(); + auto dir_outcome = co_await et::when_all(std::move(pending_dir_tasks)); + pending_dir_tasks.clear(); + if(dir_outcome.has_value()) { + for(auto& entry: *dir_outcome) { + dir_cache.dirs.try_emplace(entry.dir_path, std::move(entry.entries)); + } + LOG_INFO("Pre-populated dir cache: {} directories", dir_outcome->size()); + } + auto dir_t1 = std::chrono::steady_clock::now(); + report.dir_cache_ms = + std::chrono::duration_cast(dir_t1 - dir_t0).count(); + } + + if(!scan_tasks.empty()) { + auto scan_outcome = co_await et::when_all(std::move(scan_tasks)); + if(scan_outcome.has_error()) { + LOG_ERROR("Parallel scan failed: {}", scan_outcome.error().message()); + break; + } + for(auto& r: *scan_outcome) { + if(!r.read_failed && ext_cache) { + ext_cache->scan_results.try_emplace(r.path_id, r.scan_result); + } + scan_results.push_back(std::move(r)); + } + } + } + + auto phase1_end = std::chrono::steady_clock::now(); + + // Accumulate per-file read/scan timing into report. + for(auto& sr: scan_results) { + report.read_us += sr.read_us; + report.scan_us += sr.scan_us; + } + + // Pre-resolve search configs once after dir cache is populated (wave 0). 
+ // Converts StringMap lookups into direct pointer dereferences for Phase 2. + if(resolved_configs.empty()) { + for(auto& [config_id, config]: configs) { + resolved_configs[config_id] = resolve_search_config(config, dir_cache); + } + } + + // Phase 2+3: Resolve includes, intern paths, build graph, collect next wave. + // Merged into a single pass to avoid intermediate string allocations. + // Optimization 2: newly discovered files are immediately queued for + // scanning (prefetch_tasks), overlapping Phase 1 of the next wave + // with Phase 2 of the current wave. + std::vector next_wave; + next_wave.reserve(current_wave.size()); // Heuristic: next wave ≤ current wave. + StatCounters wave_stat_counters; + + for(auto& scan_result: scan_results) { + report.total_files++; + + if(scan_result.read_failed) { + LOG_WARN("Failed to read file for scanning: {}", scan_result.path); + continue; + } + + auto rc_it = resolved_configs.find(scan_result.config_id); + if(rc_it == resolved_configs.end()) { + continue; + } + + auto& resolved_config = rc_it->second; + auto includer_dir = llvm::sys::path::parent_path(scan_result.path); + auto* includer_entries = resolve_dir(includer_dir, dir_cache, &wave_stat_counters); + + // Look up the found_dir_idx for this file (stored when it was discovered). + unsigned includer_found_dir_idx = 0; + auto sf_it = scanned_files.find(scan_result.path_id); + if(sf_it != scanned_files.end()) { + includer_found_dir_idx = sf_it->second; + } + + // Record module interface unit mapping. + if(scan_result.scan_result.is_interface_unit) { + graph.add_module(scan_result.scan_result.module_name, scan_result.path_id); + } + + report.includes_found += scan_result.scan_result.includes.size(); + + llvm::SmallVector include_ids; + include_ids.reserve(scan_result.scan_result.includes.size()); + + for(auto& inc: scan_result.scan_result.includes) { + // For angled includes, resolution depends only on config (not includer dir). 
+ // Cache these to skip redundant directory searches across files. + bool cache_eligible = inc.is_angled && !inc.is_include_next; + llvm::SmallString<80> cache_key; + if(cache_eligible) { + cache_key.append(reinterpret_cast(&scan_result.config_id), + reinterpret_cast(&scan_result.config_id) + + sizeof(std::uint32_t)); + cache_key += inc.path; + + auto cache_it = include_cache.find(cache_key); + if(cache_it != include_cache.end()) { + report.include_cache_hits++; + auto& cached = cache_it->second; + if(cached.path_id == UINT32_MAX) { + report.unresolved.push_back({ + std::move(inc.path), + std::string(path_pool.resolve(scan_result.path_id)), + inc.is_angled, + inc.conditional, + }); + continue; + } + report.includes_resolved++; + // Jump directly to edge building with cached path_id. + std::uint32_t flagged_id = cached.path_id; + if(inc.conditional) { + flagged_id |= DependencyGraph::CONDITIONAL_FLAG; + report.conditional_edges++; + } else { + report.unconditional_edges++; + } + report.total_edges++; + include_ids.push_back(flagged_id); + if(scanned_files.try_emplace(cached.path_id, cached.found_dir_idx).second) { + next_wave.push_back( + {cached.path_id, scan_result.config_id, cached.found_dir_idx}); + } + continue; + } + } + + auto r_t0 = std::chrono::steady_clock::now(); + auto resolved = resolve_include(inc.path, + inc.is_angled, + includer_entries, + includer_dir, + inc.is_include_next, + includer_found_dir_idx, + resolved_config, + dir_cache, + &wave_stat_counters); + auto r_t1 = std::chrono::steady_clock::now(); + report.p2_resolve_us += + std::chrono::duration_cast(r_t1 - r_t0).count(); + if(!resolved.has_value()) { + if(cache_eligible) { + include_cache.try_emplace(cache_key, + ScanCache::CachedInclude{UINT32_MAX, 0}); + } + report.unresolved.push_back({ + std::move(inc.path), + std::string(path_pool.resolve(scan_result.path_id)), + inc.is_angled, + inc.conditional, + }); + continue; + } + + auto inc_path_id = path_pool.intern(resolved->path); + 
report.includes_resolved++; + + if(cache_eligible) { + include_cache.try_emplace( + cache_key, + ScanCache::CachedInclude{inc_path_id, resolved->found_dir_idx}); + } + + std::uint32_t flagged_id = inc_path_id; + if(inc.conditional) { + flagged_id |= DependencyGraph::CONDITIONAL_FLAG; + report.conditional_edges++; + } else { + report.unconditional_edges++; + } + report.total_edges++; + include_ids.push_back(flagged_id); + + if(scanned_files.try_emplace(inc_path_id, resolved->found_dir_idx).second) { + next_wave.push_back( + {inc_path_id, scan_result.config_id, resolved->found_dir_idx}); + // Prefetch: start scanning this file immediately on the + // thread pool so it's ready when the next wave begins. + if(!ext_cache || + ext_cache->scan_results.find(inc_path_id) == ext_cache->scan_results.end()) { + auto inc_path = path_pool.resolve(inc_path_id).data(); + prefetch_tasks.push_back(et::queue( + [inc_path, inc_path_id, cid = scan_result.config_id]() { + return scan_file_worker(inc_path, inc_path_id, cid); + }, + loop)); + } + } + } + + graph.set_includes(scan_result.path_id, scan_result.config_id, std::move(include_ids)); + } + + report.dir_listings += wave_stat_counters.dir_listings; + report.dir_hits += wave_stat_counters.dir_hits; + report.fs_lookups += wave_stat_counters.lookups; + report.fs_us += wave_stat_counters.us; + + auto phase2_end = std::chrono::steady_clock::now(); + auto phase3_end = phase2_end; + + auto p1 = + std::chrono::duration_cast(phase1_end - wave_start).count(); + auto p2 = + std::chrono::duration_cast(phase2_end - phase1_end).count(); + auto p3 = + std::chrono::duration_cast(phase3_end - phase2_end).count(); + + report.phase1_ms += p1; + report.phase2_ms += p2; + report.phase3_ms += p3; + + // Record per-wave stats for cold start analysis. 
+ ScanReport::WaveStats ws; + ws.files = current_wave.size(); + ws.phase1_ms = p1; + ws.phase2_ms = p2; + ws.next_files = next_wave.size(); + ws.prefetch_count = prefetch_tasks.size(); + ws.dir_listings = wave_stat_counters.dir_listings; + ws.dir_hits = wave_stat_counters.dir_hits; + ws.cache_hits = wave_cache_hits; + report.wave_stats.push_back(ws); + + LOG_INFO( + "Wave {}: {} files | read+scan={}ms resolve={}ms graph={}ms | next={} " "prefetch={}", + wave_num, + current_wave.size(), + p1, + p2, + p3, + next_wave.size(), + prefetch_tasks.size()); + + current_wave = std::move(next_wave); + wave_num++; + } + + auto end_time = std::chrono::steady_clock::now(); + report.elapsed_ms = + std::chrono::duration_cast(end_time - start_time).count(); + report.header_files = report.total_files - report.source_files; + report.modules = graph.module_count(); + report.waves = wave_num; +} + +} // namespace + +// ============================================================================ +// Public sync entry point +// ============================================================================ + +ScanReport scan_dependency_graph(CompilationDatabase& cdb, + PathPool& path_pool, + DependencyGraph& graph, + ScanCache* cache) { + ScanReport report; + if(cdb.get_entries().empty()) { + return report; + } + + et::event_loop loop; + loop.schedule(scan_impl(cdb, path_pool, graph, report, cache, loop)); + loop.run(); + return report; +} + +} // namespace clice diff --git a/src/syntax/dependency_graph.h b/src/syntax/dependency_graph.h new file mode 100644 index 00000000..90b6f14f --- /dev/null +++ b/src/syntax/dependency_graph.h @@ -0,0 +1,247 @@ +#pragma once + +#include +#include +#include + +#include "command/command.h" +#include "support/path_pool.h" +#include "syntax/include_resolver.h" +#include "syntax/scan.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" + 
+namespace clice { + +class DependencyGraph { +public: + /// Conditional flag: bit 31 marks an include inside #ifdef/#if. + constexpr static std::uint32_t CONDITIONAL_FLAG = 0x80000000u; + + /// Mask to extract the actual PathID from a flagged value. + constexpr static std::uint32_t PATH_ID_MASK = 0x7FFFFFFFu; + + /// Key for per-(file, SearchConfig) include storage. + struct IncludeKey { + std::uint32_t path_id; + std::uint32_t config_id; + + bool operator==(const IncludeKey&) const = default; + }; + + struct IncludeKeyInfo { + static IncludeKey getEmptyKey() { + return {~0u, ~0u}; + } + + static IncludeKey getTombstoneKey() { + return {~0u - 1, ~0u - 1}; + } + + static unsigned getHashValue(const IncludeKey& key) { + return llvm::DenseMapInfo::getHashValue( + (std::uint64_t(key.path_id) << 32) | key.config_id); + } + + static bool isEqual(const IncludeKey& lhs, const IncludeKey& rhs) { + return lhs == rhs; + } + }; + + /// Register a module interface unit: module name -> PathID. + void add_module(llvm::StringRef module_name, std::uint32_t path_id); + + /// Look up all PathIDs that provide a given module (may have multiple candidates). + llvm::ArrayRef lookup_module(llvm::StringRef module_name) const; + + /// Set the direct include list for a (file, config) pair. + void set_includes(std::uint32_t path_id, + std::uint32_t config_id, + llvm::SmallVector included_ids); + + /// Get direct includes for a specific (file, config) pair. + llvm::ArrayRef get_includes(std::uint32_t path_id, + std::uint32_t config_id) const; + + /// Get the union of includes across all configs for a file. + llvm::SmallVector get_all_includes(std::uint32_t path_id) const; + + /// Number of files with include entries. + std::size_t file_count() const; + + /// Number of module mappings. + std::size_t module_count() const; + + /// Total number of include edges across all (file, config) pairs. + std::size_t edge_count() const; + + /// Access the module name -> PathID mapping. 
+ const llvm::StringMap>& modules() const { + return module_to_path; + } + +private: + /// Module name -> PathIDs (multiple candidates possible, e.g. different targets). + llvm::StringMap> module_to_path; + + /// (PathID, ConfigID) -> list of directly included PathIDs. + /// Each PathID may have bit 31 set to indicate conditional include. + llvm::DenseMap, IncludeKeyInfo> includes; + + /// Track which files have any include entries (for file_count). + llvm::DenseMap> file_configs; +}; + +/// A (file, search-config) pair used to track per-wave work items. +struct WaveEntry { + std::uint32_t path_id; + std::uint32_t config_id; + /// Search dir index where this file was found. Used for #include_next. + /// Source files (wave 0) use 0. + unsigned found_dir_idx = 0; +}; + +/// Detailed report from a dependency scan. +struct ScanReport { + /// Timing in milliseconds. + std::int64_t elapsed_ms = 0; + + /// File counts. + std::size_t source_files = 0; // Files from CDB (translation units). + std::size_t header_files = 0; // Files discovered via include scanning. + std::size_t total_files = 0; // source_files + header_files. + + /// Include edge counts. + std::size_t total_edges = 0; // Total include edges. + std::size_t conditional_edges = 0; // Edges inside #if/#ifdef. + std::size_t unconditional_edges = 0; // Edges not inside conditionals. + + /// Include resolution. + std::size_t includes_found = 0; // Total #include directives seen. + std::size_t includes_resolved = 0; // Successfully resolved to a file. + + /// Module info. + std::size_t modules = 0; + + /// BFS wave count. + std::size_t waves = 0; + + /// Wall-clock time per phase (milliseconds, summed across waves). + std::int64_t phase1_ms = 0; // Read + scan (parallel on thread pool). + std::int64_t phase2_ms = 0; // Include resolution (stat calls). + std::int64_t phase3_ms = 0; // Graph building (single-threaded). + std::int64_t config_ms = 0; // Config extraction (one-time, total). 
+ std::int64_t prewarm_ms = 0; // Toolchain pre-warm subset. + std::int64_t config_loop_ms = 0; // lookup + extract_search_config loop. + std::int64_t dir_cache_ms = 0; // Dir cache pre-population (overlapped with Phase 1). + + /// Cumulative I/O time across all threads/files (microseconds). + /// These are sums of per-file durations — will exceed wall-clock time + /// when work is parallelized across threads. + std::int64_t read_us = 0; // File read (cumulative across threads). + std::int64_t scan_us = 0; // Lexer scan (cumulative across threads). + std::int64_t fs_us = 0; // Filesystem ops (readdir calls). + + /// Phase 2 breakdown (microseconds, single-threaded). + std::int64_t p2_resolve_us = 0; // resolve_include() calls. + + /// Filesystem call counts. + std::size_t dir_listings = 0; // Actual readdir() calls (dir cache misses). + std::size_t dir_hits = 0; // Directory cache hits (no syscall). + std::size_t fs_lookups = 0; // Total file existence lookups. + std::size_t include_cache_hits = 0; // Include resolution cache hits (skipped resolve). + std::size_t scan_cache_hits = 0; // Scan result cache hits (skipped I/O + lexer). + + /// Per-wave timing breakdown for cold start analysis. + struct WaveStats { + std::size_t files = 0; // Files processed in this wave. + std::int64_t phase1_ms = 0; // Read + scan (parallel). + std::int64_t phase2_ms = 0; // Include resolution (serial). + std::size_t next_files = 0; // Files discovered for next wave. + std::size_t prefetch_count = 0; // Prefetch tasks launched during Phase 2. + std::size_t dir_listings = 0; // readdir() calls in this wave. + std::size_t dir_hits = 0; // Dir cache hits in this wave. + std::size_t cache_hits = 0; // Scan cache hits in this wave. + }; + + std::vector wave_stats; + + /// Unresolved includes: (header_name, includer_path). 
+ struct UnresolvedInclude { + std::string header; + std::string includer; + bool is_angled = false; + bool conditional = false; + }; + + std::vector unresolved; +}; + +/// Persistent cache that can be reused across successive scan calls. +/// Holding onto this between incremental re-scans eliminates repeated +/// readdir() calls, angled-include resolution, and file I/O on warm runs. +/// +/// Thread safety: not thread-safe; callers must serialise scan calls. +/// +/// Invalidation: callers must clear (or discard) this cache whenever the +/// compilation database or filesystem state changes. +/// +/// TODO: add a generation counter or single invalidate() method to prevent +/// partial clearing from causing inconsistency between inter-dependent fields. +struct ScanCache { + /// Directory listing cache: dir path → set of filenames. + DirListingCache dir_cache; + + /// Angled-include resolution cache: (config_id bytes + header) → {path_id, found_dir_idx}. + /// path_id values are valid only for the PathPool used during the scan + /// that populated this cache. If PathPool is reset between scans, clear + /// this cache too (or pass nullptr to scan_dependency_graph). + struct CachedInclude { + std::uint32_t path_id; + unsigned found_dir_idx; + }; + + llvm::StringMap include_cache; + + /// Lexer scan result cache: path_id → ScanResult. + /// Populated on the first scan of each file. On subsequent calls the + /// worker-thread file read and lexer scan are skipped entirely, making + /// warm-run Phase 1 effectively free. + /// Invalidate per-entry when a file changes on disk. + llvm::DenseMap scan_results; + + // Populated during the first scan and reused on all subsequent calls + // when the compilation database has not changed. + + /// Files grouped by unique CompilationInfo pointer. + /// path_ids are valid for the persistent PathPool. + llvm::DenseMap> context_groups; + + /// CompilationInfo pointer → dense config_id (index into configs). 
+ llvm::DenseMap context_to_config_id; + + /// Per-config search configuration (reused across scans). + llvm::DenseMap configs; + + /// Pre-built initial wave (wave 0): all source files with their config IDs. + std::vector initial_wave; +}; + +/// Run the wavefront BFS scan over all files in the compilation database. +/// Internally creates a local event loop for async I/O (file reads via worker +/// thread pool, stat calls via libuv). Blocks until the scan is complete. +/// +/// @param cache Optional persistent cache. When non-null and pre-populated, +/// avoids repeated readdir() and include-resolution work across +/// successive calls. PathPool must NOT be reset between calls +/// when a persistent cache is used (path_id values must remain stable). +ScanReport scan_dependency_graph(CompilationDatabase& cdb, + PathPool& path_pool, + DependencyGraph& graph, + ScanCache* cache = nullptr); + +} // namespace clice diff --git a/src/syntax/include_resolver.cpp b/src/syntax/include_resolver.cpp new file mode 100644 index 00000000..13bca45c --- /dev/null +++ b/src/syntax/include_resolver.cpp @@ -0,0 +1,212 @@ +#include "syntax/include_resolver.h" + +#include + +#include "support/logging.h" + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" + +namespace clice { + +const llvm::StringSet<>* resolve_dir(llvm::StringRef dir, + DirListingCache& cache, + StatCounters* counters) { + auto it = cache.dirs.find(dir); + if(it != cache.dirs.end()) { + if(counters) { + counters->dir_hits++; + } + return &it->second; + } + + if(counters) { + counters->dir_listings++; + } + + auto t0 = std::chrono::steady_clock::now(); + llvm::StringSet<> entries; + std::error_code ec; + llvm::sys::fs::directory_iterator di(dir, ec); + if(ec) { + LOG_DEBUG("readdir failed for '{}': {}", dir, ec.message()); + } + for(; !ec && di != llvm::sys::fs::directory_iterator(); di.increment(ec)) { + entries.insert(llvm::sys::path::filename(di->path())); + } + auto t1 = 
std::chrono::steady_clock::now(); + if(counters) { + counters->us += std::chrono::duration_cast(t1 - t0).count(); + } + + auto [new_it, _] = cache.dirs.try_emplace(dir, std::move(entries)); + return &new_it->second; +} + +ResolvedSearchConfig resolve_search_config(const SearchConfig& config, DirListingCache& cache) { + ResolvedSearchConfig resolved; + resolved.angled_start_idx = config.angled_start_idx; + resolved.system_start_idx = config.system_start_idx; + resolved.after_start_idx = config.after_start_idx; + resolved.dirs.reserve(config.dirs.size()); + for(auto& dir: config.dirs) { + resolved.dirs.push_back({dir.path, resolve_dir(dir.path, cache)}); + } + return resolved; +} + +namespace { + +/// Check if a file exists in a directory, handling multi-component include paths. +/// For simple filenames (no '/'), checks pre-resolved entries directly. +/// For multi-component paths like "llvm/Support/raw_ostream.h", constructs the +/// full path and resolves the actual parent subdirectory via DirListingCache. +bool check_in_dir(llvm::StringRef dir_path, + const llvm::StringSet<>* entries, + llvm::StringRef filename, + bool is_simple, + DirListingCache& dir_cache, + StatCounters* counters) { + if(counters) + counters->lookups++; + + if(is_simple) { + return entries->contains(filename); + } + + // Quick rejection: check if first path component exists in pre-resolved + // entries. For "llvm/Support/raw_ostream.h", check if "llvm" exists in + // the search dir listing. Most search dirs won't have it, so we skip + // the expensive full path construction + subdirectory resolution. + // Skip this for relative paths starting with "." or ".." (e.g. "../foo.h"). + auto first_sep = filename.find_first_of("/\\"); + auto first_component = filename.substr(0, first_sep); + if(first_component != "." && first_component != "..") { + if(!entries->contains(first_component)) { + return false; + } + } + + // First component matched — construct full path, resolve actual subdirectory. 
+ llvm::SmallString<256> full; + full = dir_path; + llvm::sys::path::append(full, filename); + auto parent = llvm::sys::path::parent_path(full); + auto name = llvm::sys::path::filename(full); + auto* sub_entries = resolve_dir(parent, dir_cache, counters); + return sub_entries->contains(name); +} + +} // namespace + +std::optional resolve_include(llvm::StringRef filename, + bool is_angled, + const llvm::StringSet<>* includer_entries, + llvm::StringRef includer_dir, + bool is_include_next, + unsigned found_dir_idx, + const ResolvedSearchConfig& config, + DirListingCache& dir_cache, + StatCounters* stat_counters) { + // 1. Absolute path: check directly via stat(). + if(llvm::sys::path::is_absolute(filename)) { + if(llvm::sys::fs::exists(filename)) { + return ResolveResult{llvm::SmallString<256>(filename), 0}; + } + return std::nullopt; + } + + // Check if filename has path separators (multi-component like "llvm/Support/foo.h"). + bool is_simple = + filename.find('/') == llvm::StringRef::npos && filename.find('\\') == llvm::StringRef::npos; + + // Check if filename contains "." or ".." components that need normalization. + // Only these produce non-canonical paths after path::append. + bool needs_normalize = !is_simple && (filename.find("..") != llvm::StringRef::npos || + filename.find("./") != llvm::StringRef::npos || + filename.find(".\\") != llvm::StringRef::npos || + filename.find("\\.") != llvm::StringRef::npos); + + llvm::SmallString<256> candidate; + + // Helper: build candidate path + normalize if needed. + auto make_candidate = [&](llvm::StringRef dir, llvm::StringRef fname) { + candidate = dir; + llvm::sys::path::append(candidate, fname); + if(needs_normalize) { + llvm::sys::path::remove_dots(candidate, /*remove_dot_dot=*/true); + } + }; + + // 2. For #include_next, start from found_dir_idx + 1. 
+ if(is_include_next) { + unsigned start = found_dir_idx + 1; + for(unsigned i = start; i < config.dirs.size(); ++i) { + if(check_in_dir(config.dirs[i].path, + config.dirs[i].entries, + filename, + is_simple, + dir_cache, + stat_counters)) { + make_candidate(config.dirs[i].path, filename); + return ResolveResult{candidate, i}; + } + } + return std::nullopt; + } + + // 3. Quoted include: try includer's directory first. + if(!is_angled && includer_entries) { + if(check_in_dir(includer_dir, + includer_entries, + filename, + is_simple, + dir_cache, + stat_counters)) { + make_candidate(includer_dir, filename); + return ResolveResult{candidate, 0}; + } + } + + // 4. Search directories from appropriate start index. + // TODO: macOS Framework search — for , try Foo.framework/Headers/Bar.h + // in dirs marked as framework dirs (-F, -iframework). + unsigned start = is_angled ? config.angled_start_idx : 0; + for(unsigned i = start; i < config.dirs.size(); ++i) { + if(check_in_dir(config.dirs[i].path, + config.dirs[i].entries, + filename, + is_simple, + dir_cache, + stat_counters)) { + make_candidate(config.dirs[i].path, filename); + return ResolveResult{candidate, i}; + } + } + + return std::nullopt; +} + +std::optional resolve_include(llvm::StringRef filename, + bool is_angled, + llvm::StringRef includer_dir, + bool is_include_next, + unsigned found_dir_idx, + const SearchConfig& config, + DirListingCache& dir_cache, + StatCounters* stat_counters) { + auto resolved_config = resolve_search_config(config, dir_cache); + const llvm::StringSet<>* includer_entries = + includer_dir.empty() ? 
nullptr : resolve_dir(includer_dir, dir_cache, stat_counters); + return resolve_include(filename, + is_angled, + includer_entries, + includer_dir, + is_include_next, + found_dir_idx, + resolved_config, + dir_cache, + stat_counters); +} + +} // namespace clice diff --git a/src/syntax/include_resolver.h b/src/syntax/include_resolver.h new file mode 100644 index 00000000..624b42ed --- /dev/null +++ b/src/syntax/include_resolver.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include + +#include "command/search_config.h" + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" + +namespace clice { + +struct ResolveResult { + /// The resolved absolute path (stack-allocated for paths < 256 chars). + llvm::SmallString<256> path; + + /// The index in SearchConfig::dirs where this file was found. + /// Used for #include_next to resume searching from found_dir_idx + 1. + unsigned found_dir_idx = 0; +}; + +/// Counters for filesystem call tracking during include resolution. +struct StatCounters { + std::size_t dir_listings = 0; // Actual readdir() calls (directory cache misses). + std::size_t dir_hits = 0; // Directory cache hits (no syscall). + std::size_t lookups = 0; // Total file existence lookups. + std::int64_t us = 0; // Microseconds spent in filesystem ops. +}; + +/// Cache of directory listings for fast file existence checks. +/// Instead of calling stat() for each candidate path, we list directory +/// contents once via readdir() and do in-memory set lookups thereafter. +/// This is dramatically faster on Windows where individual stat() calls +/// are very expensive (~10x slower than Linux). +/// +/// TODO: add per-directory invalidation for incremental updates (currently +/// the entire cache must be discarded when files change on disk). 
+/// TODO: on case-insensitive filesystems (macOS HFS+/APFS, Windows NTFS), +/// the readdir-based first-component optimization in resolve_include may +/// produce false negatives when the #include casing differs from disk. +struct DirListingCache { + llvm::StringMap> dirs; +}; + +/// A search directory with a pre-resolved pointer to its cached entries. +/// The pointer is stable because StringMap allocates entries on the heap. +struct ResolvedSearchDir { + llvm::StringRef path; + const llvm::StringSet<>* entries; // Never null after resolve_search_config(). +}; + +/// Pre-resolved version of SearchConfig — all directory lookups are resolved +/// to direct pointers, eliminating StringMap lookups during include resolution. +struct ResolvedSearchConfig { + llvm::SmallVector dirs; + unsigned angled_start_idx = 0; + unsigned system_start_idx = 0; + unsigned after_start_idx = 0; +}; + +/// Resolve a single directory to its cached StringSet. +/// Returns a stable pointer into the DirListingCache. +/// On cache miss, lazily populates via readdir(). +const llvm::StringSet<>* resolve_dir(llvm::StringRef dir, + DirListingCache& cache, + StatCounters* counters = nullptr); + +/// Pre-resolve a SearchConfig against a populated DirListingCache. +/// Call once per config after dir cache pre-population, then reuse +/// the result for all resolve_include() calls with that config. +ResolvedSearchConfig resolve_search_config(const SearchConfig& config, DirListingCache& cache); + +/// Resolve an include directive using pre-resolved config and includer entries. 
+/// +/// @param filename Raw include name (without delimiters) +/// @param is_angled Whether this is a <...> include +/// @param includer_entries Pre-resolved StringSet for the includer's directory (may be null) +/// @param includer_dir Directory of the file containing the #include +/// @param is_include_next Whether this is #include_next +/// @param found_dir_idx For #include_next: the search dir index of the includer +/// @param config Pre-resolved search configuration +/// @return Resolved path and the search dir index, or nullopt if not found +std::optional resolve_include(llvm::StringRef filename, + bool is_angled, + const llvm::StringSet<>* includer_entries, + llvm::StringRef includer_dir, + bool is_include_next, + unsigned found_dir_idx, + const ResolvedSearchConfig& config, + DirListingCache& dir_cache, + StatCounters* stat_counters = nullptr); + +/// Convenience overload: resolves config and includer_dir on the fly. +/// Use for tests and one-off calls where pre-resolution overhead doesn't matter. +std::optional resolve_include(llvm::StringRef filename, + bool is_angled, + llvm::StringRef includer_dir, + bool is_include_next, + unsigned found_dir_idx, + const SearchConfig& config, + DirListingCache& dir_cache, + StatCounters* stat_counters = nullptr); + +} // namespace clice diff --git a/src/syntax/scan.cpp b/src/syntax/scan.cpp index c0d6027a..16fa7283 100644 --- a/src/syntax/scan.cpp +++ b/src/syntax/scan.cpp @@ -4,6 +4,7 @@ #include "syntax/lexer.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Support/MemoryBuffer.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -30,6 +31,9 @@ ScanResult scan(llvm::StringRef content) { return result; } + // Most source files have 10-30 includes; pre-allocate to avoid reallocs. 
+ result.includes.reserve(std::min(directives.size(), 32)); + int conditional_depth = 0; for(auto& dir: directives) { diff --git a/tests/unit/syntax/dependency_graph_tests.cpp b/tests/unit/syntax/dependency_graph_tests.cpp new file mode 100644 index 00000000..9ab68b58 --- /dev/null +++ b/tests/unit/syntax/dependency_graph_tests.cpp @@ -0,0 +1,606 @@ +#include "test/temp_dir.h" +#include "test/test.h" +#include "command/command.h" +#include "support/path_pool.h" +#include "syntax/dependency_graph.h" + +namespace clice::testing { +namespace { + +TEST_SUITE(DependencyGraph) { + +// ============================================================================ +// Module mapping tests +// ============================================================================ + +TEST_CASE(LookupModuleEmpty) { + clice::DependencyGraph graph; + EXPECT_TRUE(graph.lookup_module("foo.bar").empty()); +} + +TEST_CASE(AddAndLookupModule) { + clice::DependencyGraph graph; + graph.add_module("foo.bar", 42); + + auto result = graph.lookup_module("foo.bar"); + ASSERT_EQ(result.size(), 1u); + EXPECT_EQ(result[0], 42u); +} + +TEST_CASE(DuplicateModuleDedup) { + clice::DependencyGraph graph; + // Same module name, same path_id — should dedup. + graph.add_module("foo", 10); + graph.add_module("foo", 10); + ASSERT_EQ(graph.lookup_module("foo").size(), 1u); + + // Same module name, different path_id — multiple candidates. 
+ graph.add_module("foo", 20); + auto result = graph.lookup_module("foo"); + ASSERT_EQ(result.size(), 2u); + EXPECT_EQ(result[0], 10u); + EXPECT_EQ(result[1], 20u); +} + +TEST_CASE(MultipleModules) { + clice::DependencyGraph graph; + graph.add_module("mod.a", 1); + graph.add_module("mod.b", 2); + graph.add_module("mod.c:part", 3); + + ASSERT_EQ(graph.lookup_module("mod.a").size(), 1u); + EXPECT_EQ(graph.lookup_module("mod.a")[0], 1u); + ASSERT_EQ(graph.lookup_module("mod.b").size(), 1u); + EXPECT_EQ(graph.lookup_module("mod.b")[0], 2u); + ASSERT_EQ(graph.lookup_module("mod.c:part").size(), 1u); + EXPECT_EQ(graph.lookup_module("mod.c:part")[0], 3u); + EXPECT_TRUE(graph.lookup_module("mod.d").empty()); +} + +TEST_CASE(ModuleCount) { + clice::DependencyGraph graph; + EXPECT_EQ(graph.module_count(), 0u); + + graph.add_module("a", 1); + EXPECT_EQ(graph.module_count(), 1u); + + graph.add_module("b", 2); + EXPECT_EQ(graph.module_count(), 2u); + + // Second candidate for "a" doesn't increase module name count. + graph.add_module("a", 3); + EXPECT_EQ(graph.module_count(), 2u); +} + +// ============================================================================ +// Include edge tests +// ============================================================================ + +TEST_CASE(EmptyGraphIncludes) { + clice::DependencyGraph graph; + auto includes = graph.get_includes(0, 0); + EXPECT_TRUE(includes.empty()); +} + +TEST_CASE(SetAndGetIncludes) { + clice::DependencyGraph graph; + llvm::SmallVector ids = {10, 20, 30}; + graph.set_includes(1, 0, ids); + + auto result = graph.get_includes(1, 0); + ASSERT_EQ(result.size(), 3u); + EXPECT_EQ(result[0], 10u); + EXPECT_EQ(result[1], 20u); + EXPECT_EQ(result[2], 30u); +} + +TEST_CASE(IncludesPerConfig) { + clice::DependencyGraph graph; + + // Same file, different configs. 
+ graph.set_includes(1, 0, {10, 20}); + graph.set_includes(1, 1, {20, 30}); + + auto config0 = graph.get_includes(1, 0); + ASSERT_EQ(config0.size(), 2u); + EXPECT_EQ(config0[0], 10u); + EXPECT_EQ(config0[1], 20u); + + auto config1 = graph.get_includes(1, 1); + ASSERT_EQ(config1.size(), 2u); + EXPECT_EQ(config1[0], 20u); + EXPECT_EQ(config1[1], 30u); +} + +TEST_CASE(GetAllIncludesUnion) { + clice::DependencyGraph graph; + + graph.set_includes(1, 0, {10, 20}); + graph.set_includes(1, 1, {20, 30}); + + auto all = graph.get_all_includes(1); + // Union of {10, 20} and {20, 30} = {10, 20, 30}. + ASSERT_EQ(all.size(), 3u); +} + +TEST_CASE(ConditionalFlag) { + clice::DependencyGraph graph; + + constexpr auto FLAG = clice::DependencyGraph::CONDITIONAL_FLAG; + constexpr auto MASK = clice::DependencyGraph::PATH_ID_MASK; + + // PathID 5 unconditional, PathID 7 conditional. + llvm::SmallVector ids = {5, 7 | FLAG}; + graph.set_includes(1, 0, ids); + + auto result = graph.get_includes(1, 0); + ASSERT_EQ(result.size(), 2u); + + // First: unconditional. + EXPECT_EQ(result[0] & MASK, 5u); + EXPECT_EQ(result[0] & FLAG, 0u); + + // Second: conditional. + EXPECT_EQ(result[1] & MASK, 7u); + EXPECT_NE(result[1] & FLAG, 0u); +} + +TEST_CASE(FileCount) { + clice::DependencyGraph graph; + EXPECT_EQ(graph.file_count(), 0u); + + graph.set_includes(1, 0, {10}); + EXPECT_EQ(graph.file_count(), 1u); + + // Same file, different config. + graph.set_includes(1, 1, {20}); + EXPECT_EQ(graph.file_count(), 1u); + + // Different file. 
+ graph.set_includes(2, 0, {30}); + EXPECT_EQ(graph.file_count(), 2u); +} + +TEST_CASE(EdgeCount) { + clice::DependencyGraph graph; + EXPECT_EQ(graph.edge_count(), 0u); + + graph.set_includes(1, 0, {10, 20}); + EXPECT_EQ(graph.edge_count(), 2u); + + graph.set_includes(2, 0, {30}); + EXPECT_EQ(graph.edge_count(), 3u); +} + +TEST_CASE(EmptyIncludes) { + clice::DependencyGraph graph; + graph.set_includes(1, 0, {}); + + auto result = graph.get_includes(1, 0); + EXPECT_TRUE(result.empty()); + EXPECT_EQ(graph.file_count(), 1u); + EXPECT_EQ(graph.edge_count(), 0u); +} + +}; // TEST_SUITE(DependencyGraph) + +// ============================================================================ +// scan_dependency_graph() integration tests +// ============================================================================ + +/// Write a compile_commands.json into the temp dir and load it into the given CDB. +void write_cdb(TempDir& tmp, CompilationDatabase& cdb, llvm::StringRef json_content) { + tmp.touch("compile_commands.json", json_content); + cdb.load(tmp.path("compile_commands.json")); +} + +/// Helper: build a compile_commands.json array from entries. +/// Uses "arguments" array form to avoid platform-specific tokenization issues +/// (e.g. TokenizeGNUCommandLine treating backslashes as escape characters). +struct CDBEntry { + llvm::StringRef dir; + std::string file; + std::vector extra_args; +}; + +/// Escape backslashes and quotes for JSON string values. 
+std::string json_escape(llvm::StringRef s) { + std::string result; + result.reserve(s.size()); + for(char c: s) { + if(c == '\\' || c == '"') { + result += '\\'; + } + result += c; + } + return result; +} + +std::string build_cdb_json(llvm::ArrayRef entries) { + std::string json = "[\n"; + for(std::size_t i = 0; i < entries.size(); ++i) { + auto& e = entries[i]; + if(i > 0) { + json += ",\n"; + } + json += R"( {"directory": ")"; + json += json_escape(e.dir); + json += R"(", "file": ")"; + json += json_escape(e.file); + json += R"(", "arguments": ["clang++", "-std=c++20")"; + for(auto& arg: e.extra_args) { + json += R"(, ")"; + json += json_escape(arg); + json += R"(")"; + } + json += R"(, ")"; + json += json_escape(e.file); + json += R"("]})"; + } + json += "\n]"; + return json; +} + +TEST_SUITE(ScanDependencyGraph) { + +TEST_CASE(EmptyCDB) { + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + scan_dependency_graph(cdb, pool, graph); + + EXPECT_EQ(graph.file_count(), 0u); + EXPECT_EQ(graph.module_count(), 0u); + EXPECT_EQ(graph.edge_count(), 0u); +} + +TEST_CASE(SingleFileNoIncludes) { + TempDir tmp; + tmp.touch("src/main.cpp", R"(int main() { return 0; })"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + EXPECT_EQ(graph.file_count(), 1u); + EXPECT_EQ(graph.edge_count(), 0u); + EXPECT_EQ(graph.module_count(), 0u); +} + +TEST_CASE(SingleFileWithInclude) { + TempDir tmp; + tmp.touch("include/header.h", R"(int x = 1;)"); + tmp.touch("src/main.cpp", R"( +#include "header.h" +int main() { return x; } +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {"-I", tmp.path("include")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + 
EXPECT_GE(graph.file_count(), 1u); + EXPECT_GE(graph.edge_count(), 1u); +} + +TEST_CASE(TransitiveIncludes) { + TempDir tmp; + tmp.touch("inc/a.h", R"(#include "b.h")"); + tmp.touch("inc/b.h", R"(#include "c.h")"); + tmp.touch("inc/c.h", R"(int c = 3;)"); + tmp.touch("src/main.cpp", R"( +#include "a.h" +int main() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {"-I", tmp.path("inc")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + // main->a, a->b, b->c across 4 waves. + EXPECT_GE(graph.file_count(), 3u); + EXPECT_GE(graph.edge_count(), 3u); +} + +TEST_CASE(MultipleSourceFiles) { + TempDir tmp; + tmp.touch("inc/shared.h", R"(int shared = 1;)"); + tmp.touch("src/a.cpp", R"( +#include "shared.h" +void a() {} +)"); + tmp.touch("src/b.cpp", R"( +#include "shared.h" +void b() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + std::vector inc = {"-I", tmp.path("inc")}; + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/a.cpp"), inc}, + {tmp.root, tmp.path("src/b.cpp"), inc}, + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + EXPECT_GE(graph.file_count(), 2u); + EXPECT_GE(graph.edge_count(), 2u); +} + +TEST_CASE(ConditionalIncludes) { + TempDir tmp; + tmp.touch("inc/always.h", R"(// always)"); + tmp.touch("inc/maybe.h", R"(// maybe)"); + tmp.touch("src/main.cpp", R"( +#include "always.h" +#ifdef FOO +#include "maybe.h" +#endif +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {"-I", tmp.path("inc")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + // Both headers discovered (over-approximate). + EXPECT_GE(graph.edge_count(), 2u); + + // Verify conditional flag. 
+ bool found_unconditional = false; + bool found_conditional = false; + auto includes = graph.get_includes(pool.cache[tmp.path("src/main.cpp")], 0); + for(auto id: includes) { + if(id & DependencyGraph::CONDITIONAL_FLAG) { + found_conditional = true; + } else { + found_unconditional = true; + } + } + EXPECT_TRUE(found_unconditional); + EXPECT_TRUE(found_conditional); +} + +TEST_CASE(ModuleExtraction) { + TempDir tmp; + tmp.touch("src/mymod.cpp", R"( +export module my.module; +export int foo() { return 42; } +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/mymod.cpp"), {}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + auto result = graph.lookup_module("my.module"); + ASSERT_EQ(result.size(), 1u); + + auto path = pool.resolve(result[0]); + EXPECT_TRUE(llvm::sys::fs::equivalent(path, tmp.path("src/mymod.cpp"))); +} + +TEST_CASE(ModulePartition) { + TempDir tmp; + tmp.touch("src/mod.cpp", R"( +export module my.mod:part; +void impl() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/mod.cpp"), {}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + ASSERT_EQ(graph.lookup_module("my.mod:part").size(), 1u); +} + +TEST_CASE(DiamondIncludes) { + TempDir tmp; + tmp.touch("inc/common.h", R"(int common = 1;)"); + tmp.touch("inc/a.h", R"( +#include "common.h" +int a = 1; +)"); + tmp.touch("inc/b.h", R"( +#include "common.h" +int b = 1; +)"); + tmp.touch("src/main.cpp", R"( +#include "a.h" +#include "b.h" +int main() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {"-I", tmp.path("inc")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + // main->a, main->b, a->common, b->common. 
+ EXPECT_GE(graph.edge_count(), 4u); + EXPECT_GE(graph.file_count(), 3u); +} + +TEST_CASE(AngledVsQuoted) { + TempDir tmp; + tmp.touch("quoted/header.h", R"(int q = 1;)"); + tmp.touch("angled/header.h", R"(int a = 1;)"); + tmp.touch("src/main.cpp", R"( +#include "header.h" +#include +int main() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, + tmp.path("src/main.cpp"), + {"-iquote", tmp.path("quoted"), "-I", tmp.path("angled")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + EXPECT_GE(graph.edge_count(), 2u); +} + +TEST_CASE(MissingInclude) { + TempDir tmp; + tmp.touch("src/main.cpp", R"( +#include "nonexistent.h" +int main() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + EXPECT_EQ(graph.file_count(), 1u); + EXPECT_EQ(graph.edge_count(), 0u); +} + +TEST_CASE(MultipleModules) { + TempDir tmp; + tmp.touch("src/mod_a.cpp", R"( +export module mod.a; +void a() {} +)"); + tmp.touch("src/mod_b.cpp", R"( +export module mod.b; +void b() {} +)"); + tmp.touch("src/impl.cpp", R"( +module mod.a; +void a_impl() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/mod_a.cpp"), {}}, + {tmp.root, tmp.path("src/mod_b.cpp"), {}}, + {tmp.root, tmp.path("src/impl.cpp"), {}}, + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + EXPECT_EQ(graph.module_count(), 2u); + ASSERT_FALSE(graph.lookup_module("mod.a").empty()); + ASSERT_FALSE(graph.lookup_module("mod.b").empty()); +} + +TEST_CASE(DeepIncludeChain) { + TempDir tmp; + tmp.touch("inc/h4.h", R"(int h4 = 4;)"); + tmp.touch("inc/h3.h", R"(#include "h4.h")"); + tmp.touch("inc/h2.h", R"(#include "h3.h")"); + 
tmp.touch("inc/h1.h", R"(#include "h2.h")"); + tmp.touch("inc/h0.h", R"(#include "h1.h")"); + tmp.touch("src/main.cpp", R"( +#include "h0.h" +int main() {} +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/main.cpp"), {"-I", tmp.path("inc")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + // main->h0->h1->h2->h3->h4 across 5 waves. + EXPECT_GE(graph.edge_count(), 5u); + EXPECT_GE(graph.file_count(), 5u); +} + +TEST_CASE(ModuleWithIncludes) { + TempDir tmp; + tmp.touch("inc/util.h", R"(int util = 1;)"); + tmp.touch("src/mymod.cpp", R"( +module; +#include "util.h" +export module my.lib; +export int value() { return util; } +)"); + + CompilationDatabase cdb; + PathPool pool; + DependencyGraph graph; + + auto json = build_cdb_json({ + {tmp.root, tmp.path("src/mymod.cpp"), {"-I", tmp.path("inc")}} + }); + write_cdb(tmp, cdb, json); + scan_dependency_graph(cdb, pool, graph); + + ASSERT_FALSE(graph.lookup_module("my.lib").empty()); + EXPECT_GE(graph.edge_count(), 1u); +} + +// TODO: add tests for: +// - Circular includes (A→B→A) to verify BFS terminates correctly +// - ScanCache warm runs (pass ScanCache* to scan_dependency_graph twice) +// - get_all_includes flag merge: same header conditional in one config, +// unconditional in another — unconditional should win +// - set_includes overwrite: calling twice with same (path_id, config_id) + +}; // TEST_SUITE(ScanDependencyGraph) + +} // namespace +} // namespace clice::testing diff --git a/tests/unit/syntax/include_resolver_tests.cpp b/tests/unit/syntax/include_resolver_tests.cpp new file mode 100644 index 00000000..b95f033d --- /dev/null +++ b/tests/unit/syntax/include_resolver_tests.cpp @@ -0,0 +1,356 @@ +#include "test/temp_dir.h" +#include "test/test.h" +#include "syntax/include_resolver.h" +#include "syntax/scan.h" + +namespace clice::testing { +namespace { + +// 
============================================================================ +// scan() — is_angled and is_include_next fields +// ============================================================================ + +TEST_SUITE(IncludeResolver) { + +TEST_CASE(ScanAngledVsQuoted) { + auto result = scan(R"( +#include +#include "local.h" +)"); + + ASSERT_EQ(result.includes.size(), 2u); + EXPECT_EQ(result.includes[0].path, "vector"); + EXPECT_TRUE(result.includes[0].is_angled); + EXPECT_FALSE(result.includes[0].is_include_next); + + EXPECT_EQ(result.includes[1].path, "local.h"); + EXPECT_FALSE(result.includes[1].is_angled); + EXPECT_FALSE(result.includes[1].is_include_next); +} + +TEST_CASE(ScanIncludeNext) { + auto result = scan(R"( +#include_next +)"); + + ASSERT_EQ(result.includes.size(), 1u); + EXPECT_EQ(result.includes[0].path, "stdlib.h"); + EXPECT_TRUE(result.includes[0].is_angled); + EXPECT_TRUE(result.includes[0].is_include_next); +} + +TEST_CASE(ScanMixedDirectives) { + auto result = scan(R"( +#include +#include "quoted.h" +#ifdef FOO +#include +#include "conditional_quoted.h" +#endif +#include_next "next_quoted.h" +)"); + + ASSERT_EQ(result.includes.size(), 5u); + + EXPECT_TRUE(result.includes[0].is_angled); + EXPECT_FALSE(result.includes[0].conditional); + + EXPECT_FALSE(result.includes[1].is_angled); + EXPECT_FALSE(result.includes[1].conditional); + + EXPECT_TRUE(result.includes[2].is_angled); + EXPECT_TRUE(result.includes[2].conditional); + + EXPECT_FALSE(result.includes[3].is_angled); + EXPECT_TRUE(result.includes[3].conditional); + + EXPECT_FALSE(result.includes[4].is_angled); + EXPECT_TRUE(result.includes[4].is_include_next); +} + +// ============================================================================ +// resolve_include() — tests with real filesystem +// ============================================================================ + +TEST_CASE(ResolveAbsolutePath) { + TempDir tmp; + tmp.touch("header.h"); + + auto abs_path = tmp.path("header.h"); + 
SearchConfig config; + DirListingCache dir_cache; + + auto result = resolve_include(abs_path, false, "", false, 0, config, dir_cache); + + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, abs_path)); +} + +TEST_CASE(ResolveQuotedIncludeFromIncluderDir) { + TempDir tmp; + tmp.touch("src/main.cpp"); + tmp.touch("src/local.h"); + + SearchConfig config; + config.dirs.push_back({tmp.path("include")}); + config.angled_start_idx = 0; + + DirListingCache dir_cache; + + auto result = resolve_include("local.h", false, tmp.path("src"), false, 0, config, dir_cache); + + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("src/local.h"))); +} + +TEST_CASE(ResolveAngledIncludeFromSearchDirs) { + TempDir tmp; + tmp.touch("include/sys/types.h"); + + SearchConfig config; + config.dirs.push_back({tmp.path("include")}); + config.angled_start_idx = 0; + + DirListingCache dir_cache; + + auto result = resolve_include("sys/types.h", true, "", false, 0, config, dir_cache); + + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("include/sys/types.h"))); +} + +TEST_CASE(ResolveAngledSkipsQuotedDirs) { + TempDir tmp; + tmp.touch("quoted/header.h", "// quoted"); + tmp.touch("angled/header.h", "// angled"); + + SearchConfig config; + config.dirs.push_back({tmp.path("quoted")}); // index 0 — quoted only + config.dirs.push_back({tmp.path("angled")}); // index 1 — angled starts + config.angled_start_idx = 1; + + DirListingCache dir_cache; + + auto result = resolve_include("header.h", true, "", false, 0, config, dir_cache); + + ASSERT_TRUE(result.has_value()); + // Angled include should skip quoted dir and find in angled dir. 
+ EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("angled/header.h"))); + EXPECT_EQ(result->found_dir_idx, 1u); +} + +TEST_CASE(ResolveIncludeNext) { + TempDir tmp; + tmp.touch("dir1/stdlib.h", "// first"); + tmp.touch("dir2/stdlib.h", "// second"); + + SearchConfig config; + config.dirs.push_back({tmp.path("dir1")}); // index 0 + config.dirs.push_back({tmp.path("dir2")}); // index 1 + config.angled_start_idx = 0; + + DirListingCache dir_cache; + + // Simulate #include_next from a file found at dir index 0. + auto result = resolve_include("stdlib.h", true, "", true, 0, config, dir_cache); + + ASSERT_TRUE(result.has_value()); + // Should skip dir1 (found_dir_idx=0) and find in dir2. + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("dir2/stdlib.h"))); + EXPECT_EQ(result->found_dir_idx, 1u); +} + +TEST_CASE(ResolveNotFound) { + TempDir tmp; + + SearchConfig config; + config.dirs.push_back({tmp.path("include")}); + config.angled_start_idx = 0; + + DirListingCache dir_cache; + + auto result = + resolve_include("nonexistent.h", false, tmp.path("src"), false, 0, config, dir_cache); + + EXPECT_FALSE(result.has_value()); +} + +TEST_CASE(ResolveStatCacheHits) { + TempDir tmp; + tmp.touch("include/cached.h"); + + SearchConfig config; + config.dirs.push_back({tmp.path("include")}); + config.angled_start_idx = 0; + + DirListingCache dir_cache; + + // First resolution — populates cache. + auto result1 = resolve_include("cached.h", true, "", false, 0, config, dir_cache); + + ASSERT_TRUE(result1.has_value()); + + // Second resolution — should use cache (no filesystem I/O needed). + auto result2 = resolve_include("cached.h", true, "", false, 0, config, dir_cache); + + ASSERT_TRUE(result2.has_value()); + EXPECT_EQ(result1->path, result2->path); +} + +TEST_CASE(ResolveQuotedFallsBackToSearchDirs) { + TempDir tmp; + // Header not in includer dir, but in search dir. 
+ tmp.touch("include/fallback.h"); + + SearchConfig config; + config.dirs.push_back({tmp.path("include")}); + config.angled_start_idx = 0; + + DirListingCache dir_cache; + + auto result = + resolve_include("fallback.h", false, tmp.path("src"), false, 0, config, dir_cache); + + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("include/fallback.h"))); +} + +// ============================================================================ +// Three-tier search directory tests +// ============================================================================ + +TEST_CASE(AngledSkipsQuotedDirs) { + TempDir tmp; + tmp.touch("iquote/header.h", "// iquote"); + tmp.touch("idir/header.h", "// I dir"); + tmp.touch("sys/header.h", "// system"); + + // Layout: [iquote | idir | sys] + SearchConfig config; + config.dirs.push_back({tmp.path("iquote")}); // 0: Quoted + config.dirs.push_back({tmp.path("idir")}); // 1: Angled + config.dirs.push_back({tmp.path("sys")}); // 2: System + config.angled_start_idx = 1; + config.system_start_idx = 2; + + DirListingCache dir_cache; + + // should skip iquote, find in idir (Angled before System). + auto result = resolve_include("header.h", true, "", false, 0, config, dir_cache); + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("idir/header.h"))); + EXPECT_EQ(result->found_dir_idx, 1u); +} + +TEST_CASE(AngledMissesQuotedOnly) { + TempDir tmp; + tmp.touch("iquote/only_here.h"); + + // Layout: [iquote | (no angled) | (no system)] + SearchConfig config; + config.dirs.push_back({tmp.path("iquote")}); + config.angled_start_idx = 1; + config.system_start_idx = 1; + + DirListingCache dir_cache; + + // should NOT find it — only in quoted dir. 
+ auto result = resolve_include("only_here.h", true, "", false, 0, config, dir_cache); + EXPECT_FALSE(result.has_value()); +} + +TEST_CASE(QuotedSearchesAllDirs) { + TempDir tmp; + tmp.touch("sys/deep.h", "// system"); + + // Layout: [iquote | idir | sys] + SearchConfig config; + config.dirs.push_back({tmp.path("iquote")}); + config.dirs.push_back({tmp.path("idir")}); + config.dirs.push_back({tmp.path("sys")}); + config.angled_start_idx = 1; + config.system_start_idx = 2; + + DirListingCache dir_cache; + + // "deep.h" is only in system dir, but quoted search goes through all. + auto result = resolve_include("deep.h", false, "", false, 0, config, dir_cache); + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("sys/deep.h"))); +} + +TEST_CASE(AngledBeforeSystem) { + TempDir tmp; + tmp.touch("idir/priority.h", "// angled"); + tmp.touch("sys/priority.h", "// system"); + + SearchConfig config; + config.dirs.push_back({tmp.path("idir")}); // 0: Angled + config.dirs.push_back({tmp.path("sys")}); // 1: System + config.angled_start_idx = 0; + config.system_start_idx = 1; + + DirListingCache dir_cache; + + // should find in Angled (index 0) before System (index 1). + auto result = resolve_include("priority.h", true, "", false, 0, config, dir_cache); + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("idir/priority.h"))); + EXPECT_EQ(result->found_dir_idx, 0u); +} + +TEST_CASE(AfterSearchedLast) { + TempDir tmp; + tmp.touch("after/fallback.h", "// after"); + + // Layout: [| /angled | /sys | /after] + SearchConfig config; + config.dirs.push_back({tmp.path("angled")}); + config.dirs.push_back({tmp.path("sys")}); + config.dirs.push_back({tmp.path("after")}); + config.angled_start_idx = 0; + config.system_start_idx = 1; + config.after_start_idx = 2; + + DirListingCache dir_cache; + + // not in angled or sys, found in after. 
+ auto result = resolve_include("fallback.h", true, "", false, 0, config, dir_cache); + ASSERT_TRUE(result.has_value()); + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("after/fallback.h"))); + EXPECT_EQ(result->found_dir_idx, 2u); +} + +TEST_CASE(IncludeNextPropagatesIdx) { + TempDir tmp; + tmp.touch("dir0/limits.h", "// local"); + tmp.touch("dir1/limits.h", "// system1"); + tmp.touch("dir2/limits.h", "// system2"); + + SearchConfig config; + config.dirs.push_back({tmp.path("dir0")}); + config.dirs.push_back({tmp.path("dir1")}); + config.dirs.push_back({tmp.path("dir2")}); + config.angled_start_idx = 0; + config.system_start_idx = 1; + + DirListingCache dir_cache; + + // File found at dir1 (index 1) does #include_next + auto result = resolve_include("limits.h", true, "", true, 1, config, dir_cache); + ASSERT_TRUE(result.has_value()); + // Should skip dirs 0-1, find in dir2. + EXPECT_TRUE(llvm::sys::fs::equivalent(result->path, tmp.path("dir2/limits.h"))); + EXPECT_EQ(result->found_dir_idx, 2u); +} + +// TODO: add tests for: +// - #include_next crossing segment boundaries (angled→system) +// - #include_next at last search dir (should return nullopt) +// - Relative paths with .. 
components ("../sibling/header.h") +// - ResolvedSearchConfig overload (the production hot path) + +}; // TEST_SUITE(IncludeResolver) + +} // namespace +} // namespace clice::testing diff --git a/tests/unit/syntax/scan_tests.cpp b/tests/unit/syntax/scan_tests.cpp index b14bcb91..ac10ec4c 100644 --- a/tests/unit/syntax/scan_tests.cpp +++ b/tests/unit/syntax/scan_tests.cpp @@ -6,6 +6,8 @@ namespace { TEST_SUITE(Scan) { +// === scan() tests === + TEST_CASE(BasicIncludes) { auto result = scan(R"( #include @@ -15,8 +17,10 @@ int x = 1; ASSERT_EQ(result.includes.size(), 2u); EXPECT_EQ(result.includes[0].path, "vector"); + EXPECT_TRUE(result.includes[0].is_angled); EXPECT_FALSE(result.includes[0].conditional); EXPECT_EQ(result.includes[1].path, "foo/bar.h"); + EXPECT_FALSE(result.includes[1].is_angled); EXPECT_FALSE(result.includes[1].conditional); EXPECT_TRUE(result.module_name.empty()); } @@ -71,6 +75,7 @@ export module my.module; EXPECT_FALSE(result.need_preprocess); ASSERT_EQ(result.includes.size(), 1u); EXPECT_EQ(result.includes[0].path, "header.h"); + EXPECT_TRUE(result.includes[0].is_angled); } TEST_CASE(ModulePartition) { @@ -128,6 +133,8 @@ int main() { EXPECT_TRUE(result.includes.empty()); EXPECT_TRUE(result.module_name.empty()); + EXPECT_FALSE(result.is_interface_unit); + EXPECT_FALSE(result.need_preprocess); } // === scan_precise() tests ===