From dffa884e5a7619037a3060e53269cf5b697dc767 Mon Sep 17 00:00:00 2001 From: ykiko Date: Sun, 19 Oct 2025 21:53:57 +0800 Subject: [PATCH] Store indices to disk (#279) --- include/Compiler/CompilationUnit.h | 6 + include/Index/IncludeGraph.h | 6 +- include/Index/MergedIndex.h | 177 +++------ include/Index/ProjectIndex.h | 2 +- include/Index/TUIndex.h | 9 + include/Index/schema.fbs | 28 +- include/Server/Indexer.h | 28 +- src/Compiler/Compilation.cpp | 16 +- src/Compiler/CompilationUnit.cpp | 73 ++-- src/Compiler/CompilationUnitImpl.h | 16 +- src/Index/IncludeGraph.cpp | 2 +- src/Index/MergedIndex.cpp | 588 +++++++++++++++++++++++++---- src/Index/ProjectIndex.cpp | 73 +++- src/Index/Serialization.cpp | 227 ----------- src/Index/Serialization.h | 67 ++++ src/Index/TUIndex.cpp | 39 ++ src/Server/Indexer.cpp | 115 +++++- src/Server/Lifecycle.cpp | 2 + src/Server/Server.cpp | 2 +- tests/unit/Index/MergedIndex.cpp | 8 +- tests/unit/Index/ProjectIndex.cpp | 0 tests/unit/Index/TUIndex.cpp | 9 +- 22 files changed, 979 insertions(+), 514 deletions(-) delete mode 100644 src/Index/Serialization.cpp create mode 100644 src/Index/Serialization.h create mode 100644 tests/unit/Index/ProjectIndex.cpp diff --git a/include/Compiler/CompilationUnit.h b/include/Compiler/CompilationUnit.h index f812281a..079cb66d 100644 --- a/include/Compiler/CompilationUnit.h +++ b/include/Compiler/CompilationUnit.h @@ -1,5 +1,7 @@ #pragma once +#include + #include "Directive.h" #include "Compiler/Diagnostic.h" #include "AST/SymbolID.h" @@ -166,6 +168,10 @@ public: auto top_level_decls() -> llvm::ArrayRef; + std::chrono::milliseconds build_at(); + + std::chrono::milliseconds build_duration(); + clang::LangOptions& lang_options(); clang::ASTContext& context(); diff --git a/include/Index/IncludeGraph.h b/include/Index/IncludeGraph.h index 8e5aee3d..5cb5f105 100644 --- a/include/Index/IncludeGraph.h +++ b/include/Index/IncludeGraph.h @@ -12,13 +12,15 @@ namespace clice::index { struct IncludeLocation { /// The file path of the include directive. - std::uint32_t path = -1; + std::uint32_t path_id = -1; /// The line number of the include directive, 1-based. std::uint32_t line = -1; /// The include location that introduces this file. std::uint32_t include = -1; + + friend bool operator== (const IncludeLocation&, const IncludeLocation&) = default; }; struct IncludeGraph { @@ -51,7 +53,7 @@ struct IncludeGraph { std::uint32_t path_id(clang::FileID fid) { auto include = include_location_id(fid); if(include != -1) { - return locations[include].path; + return locations[include].path_id; } else { return paths.size() - 1; } diff --git a/include/Index/MergedIndex.h b/include/Index/MergedIndex.h index 78f36de5..f1b28b98 100644 --- a/include/Index/MergedIndex.h +++ b/include/Index/MergedIndex.h @@ -1,156 +1,83 @@ #pragma once #include "TUIndex.h" -#include "Support/Bitmap.h" #include "llvm/Support/Allocator.h" - -namespace llvm { - -template -unsigned dense_hash(const Ts&... ts) { - return llvm::DenseMapInfo>::getHashValue(std::tuple{ts...}); -} - -template <> -struct DenseMapInfo { - using R = clice::LocalSourceRange; - using V = clice::index::Occurrence; - - inline static V getEmptyKey() { - return V(R(-1, 0), 0); - } - - inline static V getTombstoneKey() { - return V(R(-2, 0), 0); - } - - static auto getHashValue(const V& v) { - return dense_hash(v.range.begin, v.range.end, v.target); - } - - static bool isEqual(const V& lhs, const V& rhs) { - return lhs.range == rhs.range && lhs.target == rhs.target; - } -}; - -template <> -struct DenseMapInfo { - using R = clice::index::Relation; - - inline static R getEmptyKey() { - return R{ - .kind = clice::RelationKind(), - .range = clice::LocalSourceRange(-1, 0), - .target_symbol = 0, - }; - } - - inline static R getTombstoneKey() { - return R{ - .kind = clice::RelationKind(), - .range = clice::LocalSourceRange(-2, 0), - .target_symbol = 0, - }; - } - - /// Contextual doen't take part in hashing and equality. - static auto getHashValue(const R& relation) { - return dense_hash(relation.kind.value(), - relation.range.begin, - relation.range.end, - relation.target_symbol); - } - - static bool isEqual(const R& lhs, const R& rhs) { - return lhs.kind == rhs.kind && lhs.range == rhs.range && - lhs.target_symbol == rhs.target_symbol; - } -}; - -} // namespace llvm +#include "llvm/Support/MemoryBuffer.h" namespace clice::index { -/// struct CompilationContext { -/// /// The target of this compilation. -/// llvm::StringRef target; -/// -/// /// The canonical compilation command. -/// llvm::StringRef command; -/// -/// /// A version field for verification. -/// std::uint32_t version; -/// }; -/// -/// struct HeaderContext : CompilationContext { -/// /// The include location in the include graph. -/// std::uint32_t include; -/// -/// /// The path of the file includes this header. -/// llvm::StringRef path; -/// }; +class MergedIndex { +private: + struct Impl; -struct HeaderContexts { - std::uint32_t version = 0; + using Self = MergedIndex; - struct Context { - std::uint32_t include; - std::uint32_t canonical_id; + MergedIndex(std::unique_ptr buffer, std::unique_ptr impl); - friend bool operator== (const Context&, const Context&) = default; - }; + void load_in_memory(this Self& self); - /// A array of include location and its context id. - llvm::SmallVector includes; +public: + MergedIndex(); - friend bool operator== (const HeaderContexts&, const HeaderContexts&) = default; -}; + MergedIndex(llvm::StringRef data); -struct MergedIndex { - /// For each merged index, we will give it a canonical id. - /// The max canonical id. - std::uint32_t max_canonical_id = 0; + MergedIndex(const MergedIndex&) = delete; - /// We use the value of SHA256 to judge whether two indices are same. - /// Index with same content will be given same canonical id. - llvm::StringMap canonical_cache; + MergedIndex(MergedIndex&& other); - /// The reference count of each canonical id. - std::vector canonical_ref_counts; + MergedIndex& operator= (const MergedIndex&) = delete; - /// The canonical id set of removed index. - roaring::Roaring removed; + MergedIndex& operator= (MergedIndex&& other); - /// A map between source file path and its header contexts. - llvm::StringMap contexts; + ~MergedIndex(); - /// All merged symbol occurrences. - llvm::DenseMap occurrences; + /// Load merged index from disk + static MergedIndex load(llvm::StringRef path); - /// All merged symbol relations. - llvm::DenseMap> relations; + /// Serialize it to binary format. + void serialize(this const Self& self, llvm::raw_ostream& out); - /// FIXME: The content of this file. - /// std::string content; + /// Lookup the occurrence in corresponding offset. + void lookup(this const Self& self, + std::uint32_t offset, + llvm::function_ref callback); - /// Sorted occurrences cache for fast lookup. - std::vector cache_occurrences; + /// Lookup the relations of given symbol. + void lookup(this const Self& self, + SymbolHash symbol, + RelationKind kind, + llvm::function_ref callback); - void remove(llvm::StringRef path); + /// Whether this index needs rebuilding. + bool need_update(this const Self& self, llvm::ArrayRef path_mapping); - void merge(llvm::StringRef path, std::uint32_t include, FileIndex& index); + bool need_rewrite() { + return impl != nullptr; + } - std::vector lookup(std::uint32_t offset); + /// Remove the index of specific path id. + void remove(this Self& self, std::uint32_t path_id); - void serialize(this MergedIndex& self, llvm::raw_ostream& out); + /// Merge the index with given compilation context. + void merge(this Self& self, + std::uint32_t path_id, + std::chrono::milliseconds build_at, + std::vector include_locations, + FileIndex& index); - friend bool operator== (const MergedIndex&, const MergedIndex&) = default; -}; + /// Merge the index with given header context. + void merge(this Self& self, std::uint32_t path_id, std::uint32_t include_id, FileIndex& index); -struct MergedIndexView { - const void* data; + friend bool operator== (MergedIndex& lhs, MergedIndex& rhs); - MergedIndex deserialize(); +private: + /// The binary serialization data of index. If you load merged index + /// from disk, we use directly access the data without deserialization + /// unless you want to modify it. + std::unique_ptr buffer; + + /// The in memory data of the index. + std::unique_ptr impl; }; } // namespace clice::index diff --git a/include/Index/ProjectIndex.h b/include/Index/ProjectIndex.h index a8fcbf65..2eb66590 100644 --- a/include/Index/ProjectIndex.h +++ b/include/Index/ProjectIndex.h @@ -47,7 +47,7 @@ struct ProjectIndex { SymbolTable symbols; - void merge(this ProjectIndex& self, TUIndex& index); + llvm::SmallVector merge(this ProjectIndex& self, TUIndex& index); void serialize(this ProjectIndex& self, llvm::raw_ostream& os); diff --git a/include/Index/TUIndex.h b/include/Index/TUIndex.h index 15af6b2c..21a1f374 100644 --- a/include/Index/TUIndex.h +++ b/include/Index/TUIndex.h @@ -1,5 +1,6 @@ #pragma once +#include #include "IncludeGraph.h" #include "AST/SourceCode.h" #include "AST/SymbolKind.h" @@ -43,6 +44,8 @@ struct FileIndex { llvm::DenseMap> relations; std::vector occurrences; + + std::array hash(); }; struct Symbol { @@ -59,12 +62,18 @@ struct Symbol { using SymbolTable = llvm::DenseMap; struct TUIndex { + /// The building timestamp of this file. + std::chrono::milliseconds built_at; + + /// The include information of this file. IncludeGraph graph; SymbolTable symbols; llvm::DenseMap file_indices; + FileIndex main_file_index; + static TUIndex build(CompilationUnit& unit); }; diff --git a/include/Index/schema.fbs b/include/Index/schema.fbs index 1ae268d8..b9e6cbf9 100644 --- a/include/Index/schema.fbs +++ b/include/Index/schema.fbs @@ -22,19 +22,29 @@ table CacheEntry { canonical_id: uint; } -struct Context { - include_: uint; +struct IncludeContext { + include_id: uint; canonical_id: uint; } -table HeaderContexts { +table HeaderContextEntry { + path_id: uint; version: uint; - includes: [Context]; + includes: [IncludeContext]; } -table HeaderContextsEntry { - path: string; - contexts: HeaderContexts; +struct IncludeLocation { + path_id: uint; + line: uint; + include_id: uint; +} + +table CompilationContextEntry { + path_id: uint; + version: uint; + canonical_id: uint; + build_at: ulong; + include_locations: [IncludeLocation]; } table OccurrenceEntry { @@ -67,7 +77,9 @@ table MergedIndex { canonical_cache: [CacheEntry]; - contexts: [HeaderContextsEntry]; + header_contexts: [HeaderContextEntry]; + + compilation_contexts: [CompilationContextEntry]; occurrences: [OccurrenceEntry]; diff --git a/include/Server/Indexer.h b/include/Server/Indexer.h index d463f9d0..314acee7 100644 --- a/include/Server/Indexer.h +++ b/include/Server/Indexer.h @@ -3,6 +3,8 @@ #include #include +#include "Config.h" +#include "Convert.h" #include "Async/Async.h" #include "Compiler/Command.h" #include "Index/MergedIndex.h" @@ -19,7 +21,8 @@ class CompilationUnit; class Indexer { public: - Indexer(CompilationDatabase& database) : database(database) {} + Indexer(CompilationDatabase& database, config::Config& config) : + database(database), config(config) {} async::Task<> index(llvm::StringRef path); @@ -29,8 +32,27 @@ public: async::Task<> index_all(); + index::MergedIndex& get_index(std::uint32_t path_id) { + auto [it, success] = in_memory_indices.try_emplace(path_id); + if(!success) { + return it->second; + } + + auto it2 = project_index.indices.find(path_id); + if(it2 != project_index.indices.end()) { + auto path = project_index.path_pool.path(it2->second); + it->second = index::MergedIndex::load(path); + } + + return it->second; + } + using Result = async::Task>; + void load_from_disk(); + + void save_to_disk(); + auto lookup(llvm::StringRef path, std::uint32_t offset, RelationKind kind) -> Result; auto declaration(llvm::StringRef path, std::uint32_t offset) -> Result; @@ -46,8 +68,12 @@ public: private: CompilationDatabase& database; + config::Config& config; + index::ProjectIndex project_index; + PathMapping mapping; + llvm::DenseMap in_memory_indices; /// Currently indexes tasks ... diff --git a/src/Compiler/Compilation.cpp b/src/Compiler/Compilation.cpp index bf6da8fd..5c17bbea 100644 --- a/src/Compiler/Compilation.cpp +++ b/src/Compiler/Compilation.cpp @@ -152,6 +152,10 @@ template >(); auto diagnostic_engine = @@ -243,18 +247,22 @@ CompilationResult run_clang(CompilationParams& params, resolver.emplace(instance->getSema()); } + auto build_end = chrono::steady_clock::now().time_since_epoch(); + auto impl = new CompilationUnit::Impl{ .interested = pp.getSourceManager().getMainFileID(), .src_mgr = instance->getSourceManager(), .action = std::move(action), .instance = std::move(instance), - .m_resolver = std::move(resolver), + .resolver = std::move(resolver), .buffer = std::move(token_buffer), - .m_directives = std::move(directives), - .pathCache = llvm::DenseMap(), - .symbolHashCache = llvm::DenseMap(), + .directives = std::move(directives), + .path_cache = llvm::DenseMap(), + .symbol_hash_cache = llvm::DenseMap(), .diagnostics = diagnostics, .top_level_decls = std::move(top_level_decls), + .build_at = chrono::duration_cast(build_at), + .build_duration = chrono::duration_cast(build_end - build_start), }; CompilationUnit unit(params.kind, impl); diff --git a/src/Compiler/CompilationUnit.cpp b/src/Compiler/CompilationUnit.cpp index fa17d298..6a01e75a 100644 --- a/src/Compiler/CompilationUnit.cpp +++ b/src/Compiler/CompilationUnit.cpp @@ -78,7 +78,7 @@ auto CompilationUnit::file_offset(clang::SourceLocation location) -> std::uint32 auto CompilationUnit::file_path(clang::FileID fid) -> llvm::StringRef { assert(fid.isValid() && "Invalid fid"); - if(auto it = impl->pathCache.find(fid); it != impl->pathCache.end()) { + if(auto it = impl->path_cache.find(fid); it != impl->path_cache.end()) { return it->second; } @@ -97,11 +97,11 @@ auto CompilationUnit::file_path(clang::FileID fid) -> llvm::StringRef { /// Allocate the path in the storage. auto size = path.size(); - auto data = impl->pathStorage.Allocate(size + 1); + auto data = impl->path_storage.Allocate(size + 1); memcpy(data, path.data(), size); data[size] = '\0'; - auto [it, inserted] = impl->pathCache.try_emplace(fid, llvm::StringRef(data, size)); + auto [it, inserted] = impl->path_cache.try_emplace(fid, llvm::StringRef(data, size)); assert(inserted && "File path already exists"); return it->second; } @@ -118,6 +118,18 @@ auto CompilationUnit::interested_content() -> llvm::StringRef { return file_content(impl->interested); } +bool CompilationUnit::is_builtin_file(clang::FileID fid) { + // No FileEntryRef => built-in/command line/scratch. + if(!impl->src_mgr.getFileEntryRefForID(fid)) { + if(auto buffer = impl->src_mgr.getBufferOrNone(fid)) { + auto name = buffer->getBufferIdentifier(); + return name == "" || name == "" || name == ""; + } + } + + return false; +} + auto CompilationUnit::start_location(clang::FileID fid) -> clang::SourceLocation { return impl->src_mgr.getLocForStartOfFile(fid); } @@ -197,6 +209,22 @@ bool CompilationUnit::is_module_interface_unit() { return impl->instance->getPreprocessor().isInNamedInterfaceUnit(); } +auto CompilationUnit::diagnostics() -> llvm::ArrayRef { + return *impl->diagnostics; +} + +auto CompilationUnit::top_level_decls() -> llvm::ArrayRef { + return impl->top_level_decls; +} + +std::chrono::milliseconds CompilationUnit::build_at() { + return impl->build_at; +} + +std::chrono::milliseconds CompilationUnit::build_duration() { + return impl->build_duration; +} + clang::LangOptions& CompilationUnit::lang_options() { return impl->instance->getLangOpts(); } @@ -231,14 +259,14 @@ std::vector CompilationUnit::deps() { index::SymbolID CompilationUnit::getSymbolID(const clang::NamedDecl* decl) { uint64_t hash; - auto iter = impl->symbolHashCache.find(decl); - if(iter != impl->symbolHashCache.end()) { + auto iter = impl->symbol_hash_cache.find(decl); + if(iter != impl->symbol_hash_cache.end()) { hash = iter->second; } else { llvm::SmallString<128> USR; index::generateUSRForDecl(decl, USR); hash = llvm::xxh3_64bits(USR); - impl->symbolHashCache.try_emplace(decl, hash); + impl->symbol_hash_cache.try_emplace(decl, hash); } return index::SymbolID{hash, ast::name_of(decl)}; } @@ -246,44 +274,31 @@ index::SymbolID CompilationUnit::getSymbolID(const clang::NamedDecl* decl) { index::SymbolID CompilationUnit::getSymbolID(const clang::MacroInfo* macro) { std::uint64_t hash; auto name = token_spelling(macro->getDefinitionLoc()); - auto iter = impl->symbolHashCache.find(macro); - if(iter != impl->symbolHashCache.end()) { + auto iter = impl->symbol_hash_cache.find(macro); + if(iter != impl->symbol_hash_cache.end()) { hash = iter->second; } else { llvm::SmallString<128> USR; index::generateUSRForMacro(name, macro->getDefinitionLoc(), impl->src_mgr, USR); hash = llvm::xxh3_64bits(USR); - impl->symbolHashCache.try_emplace(macro, hash); + impl->symbol_hash_cache.try_emplace(macro, hash); } return index::SymbolID{hash, name.str()}; } -bool CompilationUnit::is_builtin_file(clang::FileID fid) { - auto path = file_path(fid); - return path == "" || path == "" || path == ""; -} - -auto CompilationUnit::diagnostics() -> llvm::ArrayRef { - return *impl->diagnostics; -} - -auto CompilationUnit::top_level_decls() -> llvm::ArrayRef { - return impl->top_level_decls; -} - const llvm::DenseSet& CompilationUnit::files() { - if(impl->allFiles.empty()) { + if(impl->all_files.empty()) { /// FIXME: handle preamble and embed file id. for(auto& [fid, diretive]: directives()) { for(auto& include: diretive.includes) { if(!include.skipped) { - impl->allFiles.insert(include.fid); + impl->all_files.insert(include.fid); } } } - impl->allFiles.insert(impl->src_mgr.getMainFileID()); + impl->all_files.insert(impl->src_mgr.getMainFileID()); } - return impl->allFiles; + return impl->all_files; } clang::TranslationUnitDecl* CompilationUnit::tu() { @@ -291,12 +306,12 @@ clang::TranslationUnitDecl* CompilationUnit::tu() { } llvm::DenseMap& CompilationUnit::directives() { - return impl->m_directives; + return impl->directives; } TemplateResolver& CompilationUnit::resolver() { - assert(impl->m_resolver && "Template resolver is not available"); - return *impl->m_resolver; + assert(impl->resolver && "Template resolver is not available"); + return *impl->resolver; } clang::ASTContext& CompilationUnit::context() { diff --git a/src/Compiler/CompilationUnitImpl.h b/src/Compiler/CompilationUnitImpl.h index 577d403c..06ab142c 100644 --- a/src/Compiler/CompilationUnitImpl.h +++ b/src/Compiler/CompilationUnitImpl.h @@ -21,27 +21,31 @@ struct CompilationUnit::Impl { std::unique_ptr instance; /// The template resolver used to resolve dependent name. - std::optional m_resolver; + std::optional resolver; /// Token information collected during the preprocessing. std::optional buffer; /// All diretive information collected during the preprocessing. - llvm::DenseMap m_directives; + llvm::DenseMap directives; - llvm::DenseSet allFiles; + llvm::DenseSet all_files; /// Cache for file path. It is used to avoid multiple file path lookup. - llvm::DenseMap pathCache; + llvm::DenseMap path_cache; /// Cache for symbol id. - llvm::DenseMap symbolHashCache; + llvm::DenseMap symbol_hash_cache; - llvm::BumpPtrAllocator pathStorage; + llvm::BumpPtrAllocator path_storage; std::shared_ptr> diagnostics; std::vector top_level_decls; + + std::chrono::milliseconds build_at; + + std::chrono::milliseconds build_duration; }; } // namespace clice diff --git a/src/Index/IncludeGraph.cpp b/src/Index/IncludeGraph.cpp index a1e0e7e8..51b85344 100644 --- a/src/Index/IncludeGraph.cpp +++ b/src/Index/IncludeGraph.cpp @@ -31,7 +31,7 @@ static std::uint32_t addIncludeChain(CompilationUnit& unit, if(success) { paths.emplace_back(path); } - locations[index].path = iter->second; + locations[index].path_id = iter->second; uint32_t include = -1; if(presumed.getIncludeLoc().isValid()) { diff --git a/src/Index/MergedIndex.cpp b/src/Index/MergedIndex.cpp index ad303881..f4df581f 100644 --- a/src/Index/MergedIndex.cpp +++ b/src/Index/MergedIndex.cpp @@ -1,116 +1,542 @@ +#include "Serialization.h" #include "Support/Compare.h" -#include "schema_generated.h" +#include "Support/FileSystem.h" #include "Index/MergedIndex.h" -#include "llvm/Support/SHA256.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/Support/raw_os_ostream.h" +namespace llvm { + +template +unsigned dense_hash(const Ts&... ts) { + return llvm::DenseMapInfo>::getHashValue(std::tuple{ts...}); +} + +template <> +struct DenseMapInfo { + using R = clice::LocalSourceRange; + using V = clice::index::Occurrence; + + inline static V getEmptyKey() { + return V(R(-1, 0), 0); + } + + inline static V getTombstoneKey() { + return V(R(-2, 0), 0); + } + + static auto getHashValue(const V& v) { + return dense_hash(v.range.begin, v.range.end, v.target); + } + + static bool isEqual(const V& lhs, const V& rhs) { + return lhs.range == rhs.range && lhs.target == rhs.target; + } +}; + +template <> +struct DenseMapInfo { + using R = clice::index::Relation; + + inline static R getEmptyKey() { + return R{ + .kind = clice::RelationKind(), + .range = clice::LocalSourceRange(-1, 0), + .target_symbol = 0, + }; + } + + inline static R getTombstoneKey() { + return R{ + .kind = clice::RelationKind(), + .range = clice::LocalSourceRange(-2, 0), + .target_symbol = 0, + }; + } + + /// Contextual doesn’t take part in hashing and equality. + static auto getHashValue(const R& relation) { + return dense_hash(relation.kind.value(), + relation.range.begin, + relation.range.end, + relation.target_symbol); + } + + static bool isEqual(const R& lhs, const R& rhs) { + return lhs.kind == rhs.kind && lhs.range == rhs.range && + lhs.target_symbol == rhs.target_symbol; + } +}; +} // namespace llvm + namespace clice::index { -namespace { +struct IncludeContext { + std::uint32_t include_id; -auto sha256_hash(FileIndex& index) { - llvm::SHA256 hasher; + std::uint32_t canonical_id; - using u8 = std::uint8_t; + friend bool operator== (const IncludeContext&, const IncludeContext&) = default; +}; - if(!index.occurrences.empty()) { - static_assert(sizeof(Occurrence) == sizeof(Range) + sizeof(SymbolHash)); - static_assert(sizeof(Occurrence) % 8 == 0); - auto data = reinterpret_cast(index.occurrences.data()); - auto size = index.occurrences.size() * sizeof(Occurrence); - hasher.update(llvm::ArrayRef(data, size)); +struct HeaderContext { + std::uint32_t version = 0; + + llvm::SmallVector includes; + + friend bool operator== (const HeaderContext&, const HeaderContext&) = default; +}; + +struct CompilationContext { + std::uint32_t version = 0; + + std::uint32_t canonical_id = 0; + + std::uint64_t build_at; + + std::vector include_locations; + + friend bool operator== (const CompilationContext&, const CompilationContext&) = default; +}; + +struct MergedIndex::Impl { + /// The content of corresponding source file. + std::string content; + + /// If this file is included by other source file, then it has header contexts. + /// The key represents the source file id, value represents the context in the + /// source file. + llvm::SmallDenseMap header_contexts; + + /// If this file is compiled as source file, then it has compilation contexts. + /// The key represents the compilation command id. File with compilation content + /// could provide header contexts for other files. + llvm::SmallDenseMap compilation_contexts; + + /// We use the value of SHA256 to judge whether two indices are same. + /// The same indices will be given same canonical id. + llvm::StringMap canonical_cache; + + /// The max canonical id we have allocated. + std::uint32_t max_canonical_id = 0; + + /// The reference count of each canonical id. + std::vector canonical_ref_counts; + + /// The canonical id set of removed index. + roaring::Roaring removed; + + /// All merged symbol occurrences. + llvm::DenseMap occurrences; + + /// All merged symbol relations. + llvm::DenseMap> relations; + + /// Sorted occurrences cache for fast lookup. + std::vector occurrences_cache; + + void merge(this Impl& self, std::uint32_t path_id, FileIndex& index, auto&& add_context) { + auto hash = index.hash(); + auto hash_key = llvm::StringRef(reinterpret_cast(hash.data()), hash.size()); + auto [it, success] = self.canonical_cache.try_emplace(hash_key, self.max_canonical_id); + + auto canonical_id = it->second; + add_context(self, canonical_id); + + if(!success) { + self.canonical_ref_counts[canonical_id] += 1; + self.removed.remove(canonical_id); + return; + } + + for(auto& occurrence: index.occurrences) { + self.occurrences[occurrence].add(canonical_id); + } + + for(auto& [symbol_id, relations]: index.relations) { + auto& target = self.relations[symbol_id]; + for(auto& relation: relations) { + target[relation].add(canonical_id); + } + } + + self.canonical_ref_counts.emplace_back(1); + self.max_canonical_id += 1; } - for(auto& [symbol_id, relations]: index.relations) { - hasher.update(std::bit_cast>(symbol_id)); - static_assert(sizeof(Relation) == - sizeof(RelationKind) + 4 + sizeof(Range) + sizeof(SymbolHash)); - static_assert(sizeof(Relation) % 8 == 0); + friend bool operator== (const Impl&, const Impl&) = default; +}; - if(!relations.empty()) { - auto data = reinterpret_cast(relations.data()); - auto size = relations.size() * sizeof(Relation); - hasher.update(llvm::ArrayRef(data, size)); +MergedIndex::MergedIndex(std::unique_ptr buffer, std::unique_ptr impl) : + buffer(std::move(buffer)), impl(std::move(impl)) {} + +MergedIndex::MergedIndex() = default; + +MergedIndex::MergedIndex(llvm::StringRef data) : + MergedIndex(llvm::MemoryBuffer::getMemBuffer(data, "", false), nullptr) {} + +MergedIndex::MergedIndex(MergedIndex&& other) = default; + +MergedIndex& MergedIndex::operator= (MergedIndex&& other) = default; + +MergedIndex::~MergedIndex() = default; + +void MergedIndex::load_in_memory(this Self& self) { + if(self.impl) { + return; + } + + self.impl = std::make_unique(); + if(!self.buffer) { + return; + } + + auto& index = *self.impl; + auto root = fbs::GetRoot(self.buffer->getBufferStart()); + + index.max_canonical_id = root->max_canonical_id(); + + for(auto entry: *root->canonical_cache()) { + index.canonical_cache.try_emplace(entry->sha256()->string_view(), entry->canonical_id()); + } + + index.canonical_ref_counts.resize(index.max_canonical_id, 0); + + for(auto entry: *root->header_contexts()) { + HeaderContext context; + auto path = entry->path_id(); + context.version = entry->version(); + for(auto include: *entry->includes()) { + index.canonical_ref_counts[include->canonical_id()] += 1; + context.includes.emplace_back(*safe_cast(include)); + } + index.header_contexts.try_emplace(path, std::move(context)); + } + + for(auto entry: *root->compilation_contexts()) { + CompilationContext context; + auto path = entry->path_id(); + context.version = entry->version(); + context.canonical_id = entry->canonical_id(); + context.build_at = entry->build_at(); + for(auto include: *entry->include_locations()) { + context.include_locations.emplace_back(*safe_cast(include)); + } + index.compilation_contexts.try_emplace(path, std::move(context)); + } + + for(auto entry: *root->occurrences()) { + index.occurrences.try_emplace(*safe_cast(entry->occurrence()), + read_bitmap(entry->context())); + } + + for(auto entry: *root->relations()) { + auto& relations = index.relations[entry->symbol()]; + for(auto relation_entry: *entry->relations()) { + relations.try_emplace(*safe_cast(relation_entry->relation()), + read_bitmap(relation_entry->context())); } } - return hasher.final(); + self.buffer.reset(); } -} // namespace +MergedIndex MergedIndex::load(llvm::StringRef path) { + auto buffer = llvm::MemoryBuffer::getFile(path); + if(!buffer) { + return MergedIndex(); + } else { + return MergedIndex(std::move(*buffer), nullptr); + } +} -void MergedIndex::remove(llvm::StringRef path) { - auto& includes = contexts[path].includes; +void MergedIndex::serialize(this const Self& self, llvm::raw_ostream& out) { + if(self.buffer) { + out.write(self.buffer->getBufferStart(), self.buffer->getBufferSize()); + return; + } + + if(!self.impl) { + return; + } + + auto& index = self.impl; + + fbs::FlatBufferBuilder builder(1024); + + llvm::SmallVector buffer; + + auto canonical_cache = transform(index->canonical_cache, [&](auto&& value) { + auto&& [hash, canonical_id] = value; + return binary::CreateCacheEntry(builder, CreateString(builder, hash), canonical_id); + }); + + auto header_contexts = transform(index->header_contexts, [&](auto&& value) { + auto& [path_id, context] = value; + return binary::CreateHeaderContextEntry( + builder, + path_id, + context.version, + CreateStructVector(builder, context.includes)); + }); + + auto compilation_contexts = transform(index->compilation_contexts, [&](auto&& value) { + auto& [path_id, context] = value; + return binary::CreateCompilationContextEntry( + builder, + path_id, + context.version, + context.canonical_id, + context.build_at, + CreateStructVector(builder, context.include_locations)); + }); + + llvm::SmallVector occurrence_keys; + occurrence_keys.reserve(index->occurrences.size()); + auto occurrences = transform(index->occurrences, [&](auto&& value) { + auto&& [occurrence, bitmap] = value; + buffer.clear(); + buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); + bitmap.write(buffer.data(), false); + occurrence_keys.emplace_back(&occurrence); + return binary::CreateOccurrenceEntry(builder, + safe_cast(&occurrence), + CreateVector(builder, buffer)); + }); + ranges::sort(views::zip(occurrence_keys, occurrences), refl::less, [](auto e) -> auto& { + return *std::get<0>(e); + }); + + llvm::SmallVector relation_keys; + relation_keys.reserve(index->relations.size()); + auto relations = transform(index->relations, [&](auto&& value) { + auto&& [symbol_id, symbol_relations] = value; + auto relations = transform(symbol_relations, [&](auto&& value) { + auto&& [relation, bitmap] = value; + buffer.clear(); + buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); + bitmap.write(buffer.data(), false); + return binary::CreateRelationEntry(builder, + safe_cast(&relation), + CreateVector(builder, buffer)); + }); + relation_keys.emplace_back(symbol_id); + return binary::CreateSymbolRelationsEntry(builder, + symbol_id, + CreateVector(builder, relations)); + }); + ranges::sort(views::zip(relation_keys, relations), refl::less, [](auto e) -> auto { + return std::get<0>(e); + }); + + auto merged_index = binary::CreateMergedIndex(builder, + index->max_canonical_id, + CreateVector(builder, canonical_cache), + CreateVector(builder, header_contexts), + CreateVector(builder, compilation_contexts), + CreateVector(builder, occurrences), + CreateVector(builder, relations)); + builder.Finish(merged_index); + + out.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); +} + +void MergedIndex::lookup(this const Self& self, + std::uint32_t offset, + llvm::function_ref callback) { + if(self.impl) { + auto& index = *self.impl; + auto& occurrences = index.occurrences_cache; + if(occurrences.empty()) { + for(auto& [o, _]: index.occurrences) { + occurrences.emplace_back(o); + } + ranges::sort(occurrences, refl::less); + } + + auto it = ranges::lower_bound(occurrences, offset, {}, [](index::Occurrence& o) { + return o.range.end; + }); + + while(it != occurrences.end()) { + if(it->range.contains(offset)) { + if(!callback(*it)) { + break; + } + + it++; + continue; + } + + break; + } + } else if(self.buffer) { + auto index = fbs::GetRoot(self.buffer->getBufferStart()); + auto& occurrences = *index->occurrences(); + + auto it = ranges::lower_bound(occurrences, offset, {}, [](auto o) { + return o->occurrence()->range().end(); + }); + + while(it != occurrences.end()) { + auto o = safe_cast(it->occurrence()); + if(o->range.contains(offset)) { + if(!callback(*o)) { + break; + } + + it++; + continue; + } + + break; + } + } +} + +void MergedIndex::lookup(this const Self& self, + SymbolHash symbol, + RelationKind kind, + llvm::function_ref callback) { + + if(self.impl) { + auto it = self.impl->relations.find(symbol); + if(it == self.impl->relations.end()) [[unlikely]] { + return; + } + + auto& relations = it->second; + for(auto& [relation, _]: relations) { + if(relation.kind & kind) { + if(!callback(relation)) { + break; + } + } + } + } else if(self.buffer) { + auto index = fbs::GetRoot(self.buffer->getBufferStart()); + auto& entries = *index->relations(); + + auto it = ranges::lower_bound(entries, symbol, {}, [](auto e) { return e->symbol(); }); + if(it == entries.end() || it->symbol() != symbol) [[unlikely]] { + return; + } + + for(auto entry: *it->relations()) { + auto r = safe_cast(entry->relation()); + if(r->kind & kind) { + if(!callback(*r)) { + break; + } + } + } + } +} + +bool MergedIndex::need_update(this const Self& self, llvm::ArrayRef path_mapping) { + if(self.impl) { + if(self.impl->compilation_contexts.empty()) { + return true; + } + + auto& context = self.impl->compilation_contexts.begin()->getSecond(); + + llvm::DenseSet deps; + for(auto& location: context.include_locations) { + auto [_, success] = deps.insert(location.path_id); + if(success) { + fs::file_status status; + if(auto err = fs::status(path_mapping[location.path_id], status)) { + return true; + } + + auto time = std::chrono::duration_cast( + status.getLastModificationTime().time_since_epoch()); + if(time.count() > context.build_at) { + return true; + } + } + } + + return false; + } else if(self.buffer) { + auto index = fbs::GetRoot(self.buffer->getBufferStart()); + if(index->compilation_contexts()->empty()) { + return true; + } + + auto context = *index->compilation_contexts()->begin(); + + llvm::DenseSet deps; + for(auto location: *context->include_locations()) { + auto [_, success] = deps.insert(location->path_id()); + if(success) { + fs::file_status status; + if(auto err = fs::status(path_mapping[location->path_id()], status)) { + return true; + } + + auto time = std::chrono::duration_cast( + status.getLastModificationTime().time_since_epoch()); + if(time.count() > context->build_at()) { + return true; + } + } + } + + return false; + } + + return true; +} + +void MergedIndex::remove(this Self& self, std::uint32_t path_id) { + self.load_in_memory(); + auto& index = *self.impl; + + auto& includes = index.header_contexts[path_id].includes; for(auto& [_, canonical_id]: includes) { - auto& ref_counts = canonical_ref_counts[canonical_id]; + auto& ref_counts = index.canonical_ref_counts[canonical_id]; ref_counts -= 1; if(ref_counts == 0) { - removed.add(canonical_id); + index.removed.add(canonical_id); } } includes.clear(); } -void MergedIndex::merge(llvm::StringRef path, std::uint32_t include, FileIndex& index) { - auto& context = contexts[path]; - - auto hash = sha256_hash(index); - auto hash_key = llvm::StringRef(reinterpret_cast(hash.data()), hash.size()); - auto [it, success] = canonical_cache.try_emplace(hash_key, max_canonical_id); - - auto canonical_id = it->second; - context.includes.emplace_back(include, canonical_id); - - if(!success) { - canonical_ref_counts[canonical_id] += 1; - removed.remove(canonical_id); - return; - } - - for(auto& occurrence: index.occurrences) { - this->occurrences[occurrence].add(canonical_id); - } - - for(auto& [symbol_id, relations]: index.relations) { - auto& target = this->relations[symbol_id]; - for(auto& relation: relations) { - target[relation].add(canonical_id); - } - } - - canonical_ref_counts.emplace_back(1); - max_canonical_id += 1; +void MergedIndex::merge(this Self& self, + std::uint32_t path_id, + std::chrono::milliseconds build_at, + std::vector include_locations, + FileIndex& index) { + self.load_in_memory(); + self.impl->merge(path_id, index, [&](Impl& self, std::uint32_t canonical_id) { + auto& context = self.compilation_contexts[path_id]; + context.canonical_id = canonical_id; + context.build_at = build_at.count(); + context.include_locations = std::move(include_locations); + }); } -std::vector MergedIndex::lookup(std::uint32_t offset) { - if(cache_occurrences.size() != occurrences.size()) { - cache_occurrences.clear(); - for(auto& [occurrence, _]: occurrences) { - cache_occurrences.emplace_back(occurrence); - } - std::ranges::sort(cache_occurrences, refl::less); - } +void MergedIndex::merge(this Self& self, + std::uint32_t path_id, + std::uint32_t include_id, + FileIndex& index) { + self.load_in_memory(); + self.impl->merge(path_id, index, [&](Impl& self, std::uint32_t canonical_id) { + auto& context = self.header_contexts[path_id]; + context.includes.emplace_back(include_id, canonical_id); + }); +} - auto it = - std::ranges::lower_bound(cache_occurrences, offset, {}, [](index::Occurrence& occurrence) { - return occurrence.range.end; - }); - - std::vector occurrences; - while(it != cache_occurrences.end()) { - if(it->range.contains(offset)) { - occurrences.emplace_back(*it); - it++; - continue; - } - - break; - } - - return occurrences; +bool operator== (MergedIndex& lhs, MergedIndex& rhs) { + lhs.load_in_memory(); + rhs.load_in_memory(); + return *lhs.impl == *rhs.impl; } } // namespace clice::index diff --git a/src/Index/ProjectIndex.cpp b/src/Index/ProjectIndex.cpp index 1f3cab05..d40693d9 100644 --- a/src/Index/ProjectIndex.cpp +++ b/src/Index/ProjectIndex.cpp @@ -1,10 +1,10 @@ -#include "schema_generated.h" +#include "Serialization.h" #include "Index/ProjectIndex.h" #include "Support/Ranges.h" namespace clice::index { -void ProjectIndex::merge(this ProjectIndex& self, TUIndex& index) { +llvm::SmallVector ProjectIndex::merge(this ProjectIndex& self, TUIndex& index) { auto& paths = index.graph.paths; llvm::SmallVector file_ids_map; file_ids_map.resize_for_overwrite(paths.size()); @@ -19,6 +19,75 @@ void ProjectIndex::merge(this ProjectIndex& self, TUIndex& index) { target_symbol.reference_files.add(file_ids_map[ref]); } } + + return file_ids_map; +} + +void ProjectIndex::serialize(this ProjectIndex& self, llvm::raw_ostream& os) { + fbs::FlatBufferBuilder builder(1024); + + llvm::SmallVector buffer; + + auto i = 0; + auto paths = transform(self.path_pool.paths, [&](llvm::StringRef path) { + auto entry = + binary::CreatePathEntry(builder, CreateString(builder, self.path_pool.paths[i]), i); + i += 1; + return entry; + }); + + auto indices = transform(self.indices, [&](auto&& value) { + auto&& [source, index] = value; + return binary::PathMapEntry(source, index); + }); + + auto symbols = transform(self.symbols, [&](auto&& value) { + auto& [symbol_id, symbol] = value; + + buffer.clear(); + buffer.resize_for_overwrite(symbol.reference_files.getSizeInBytes(false)); + symbol.reference_files.write(buffer.data(), false); + + return binary::CreateSymbolEntry( + builder, + symbol_id, + binary::CreateSymbol(builder, symbol.kind.value(), CreateVector(builder, buffer))); + }); + + auto project_index = + binary::CreateProjectIndex(builder, + CreateVector(builder, paths), + CreateStructVector(builder, indices), + CreateVector(builder, symbols)); + + builder.Finish(project_index); + os.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); +} + +ProjectIndex ProjectIndex::from(const void* data) { + auto root = fbs::GetRoot(data); + + ProjectIndex index; + + auto& pool = index.path_pool; + pool.paths.resize(root->paths()->size()); + for(auto entry: *root->paths()) { + auto k = pool.save(entry->path()->string_view()); + pool.paths[entry->id()] = k; + pool.cache.try_emplace(k, entry->id()); + } + + for(auto entry: *root->indices()) { + index.indices.try_emplace(entry->source(), entry->index()); + } + + for(auto entry: *root->symbols()) { + auto& symbol = index.symbols[entry->symbol_id()]; + symbol.kind = SymbolKind(entry->symbol()->kind()); + symbol.reference_files = read_bitmap(entry->symbol()->refs()); + } + + return index; } } // namespace clice::index diff --git a/src/Index/Serialization.cpp b/src/Index/Serialization.cpp deleted file mode 100644 index fd880968..00000000 --- a/src/Index/Serialization.cpp +++ /dev/null @@ -1,227 +0,0 @@ -#include "schema_generated.h" -#include "Index/MergedIndex.h" -#include "Index/ProjectIndex.h" -#include "Support/Ranges.h" - -namespace clice::index { - -namespace fbs = flatbuffers; - -namespace { - -template -using Offsets = llvm::SmallVector, 0>; - -template -const U* safe_cast(const V* v) { - static_assert(sizeof(U) == sizeof(V)); - assert((void(std::bit_cast(V{})), true)); - return reinterpret_cast(v); -} - -auto CreateString(fbs::FlatBufferBuilder& builder, llvm::StringRef string) { - return builder.CreateString(string.data(), string.size()); -} - -template -auto CreateVector(fbs::FlatBufferBuilder& builder, const Range& range) { - return builder.CreateVector(range.data(), range.size()); -} - -auto CreateVector(fbs::FlatBufferBuilder& builder, const llvm::SmallVector& range) { - return builder.CreateVector(reinterpret_cast(range.data()), range.size()); -} - -template -auto CreateStructVector(fbs::FlatBufferBuilder& builder, const Range& range) { - using V = ranges::range_value_t; - return builder.CreateVectorOfStructs(safe_cast(range.data()), range.size()); -} - -template -auto transform(const Range& range, const Functor& functor) { - using V = ranges::range_value_t; - using R = std::invoke_result_t; - - llvm::SmallVector result; - result.resize_for_overwrite(ranges::size(range)); - - auto i = 0; - for(auto&& v: range) { - result[i] = functor(v); - i += 1; - } - return result; -} - -Bitmap read_bitmap(const fbs::Vector* buffer) { - return Bitmap::read(reinterpret_cast(buffer->data()), false); -} - -} // namespace - -void MergedIndex::serialize(this MergedIndex& self, llvm::raw_ostream& out) { - fbs::FlatBufferBuilder builder(1024); - - llvm::SmallVector buffer; - - auto canonical_cache = transform(self.canonical_cache, [&](auto&& value) { - auto&& [hash, canonical_id] = value; - return binary::CreateCacheEntry(builder, CreateString(builder, hash), canonical_id); - }); - - auto header_contexts = transform(self.contexts, [&](auto&& value) { - auto& [path, contexts] = value; - return binary::CreateHeaderContextsEntry( - builder, - CreateString(builder, path), - binary::CreateHeaderContexts( - builder, - contexts.version, - CreateStructVector(builder, contexts.includes))); - }); - - auto occurrences = transform(self.occurrences, [&](auto&& value) { - auto&& [occurrence, bitmap] = value; - buffer.clear(); - buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); - bitmap.write(buffer.data(), false); - return binary::CreateOccurrenceEntry(builder, - safe_cast(&occurrence), - CreateVector(builder, buffer)); - }); - - auto relations = transform(self.relations, [&](auto&& value) { - auto&& [symbold_id, symbol_relations] = value; - auto relations = transform(symbol_relations, [&](auto&& value) { - auto&& [relation, bitmap] = value; - buffer.clear(); - buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); - bitmap.write(buffer.data(), false); - return binary::CreateRelationEntry(builder, - safe_cast(&relation), - CreateVector(builder, buffer)); - }); - return binary::CreateSymbolRelationsEntry(builder, - symbold_id, - CreateVector(builder, relations)); - }); - - auto merged_index = binary::CreateMergedIndex(builder, - self.max_canonical_id, - CreateVector(builder, canonical_cache), - CreateVector(builder, header_contexts), - CreateVector(builder, occurrences), - CreateVector(builder, relations)); - builder.Finish(merged_index); - - out.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); -} - -MergedIndex MergedIndexView::deserialize() { - auto root = fbs::GetRoot(data); - - MergedIndex index; - index.max_canonical_id = root->max_canonical_id(); - - for(auto entry: *root->canonical_cache()) { - index.canonical_cache.try_emplace(entry->sha256()->string_view(), entry->canonical_id()); - } - - index.canonical_ref_counts.resize(index.max_canonical_id, 0); - - HeaderContexts contexts; - for(auto entry: *root->contexts()) { - auto path = entry->path()->string_view(); - contexts.version = entry->contexts()->version(); - for(auto include: *entry->contexts()->includes()) { - index.canonical_ref_counts[include->canonical_id()] += 1; - contexts.includes.emplace_back(include->include_(), include->canonical_id()); - } - index.contexts.try_emplace(path, std::move(contexts)); - } - - for(auto entry: *root->occurrences()) { - index.occurrences.try_emplace(*safe_cast(entry->occurrence()), - read_bitmap(entry->context())); - } - - for(auto entry: *root->relations()) { - auto& relations = index.relations[entry->symbol()]; - for(auto relation_entry: *entry->relations()) { - relations.try_emplace(*safe_cast(relation_entry->relation()), - read_bitmap(relation_entry->context())); - } - } - - return index; -} - -void ProjectIndex::serialize(this ProjectIndex& self, llvm::raw_ostream& os) { - fbs::FlatBufferBuilder builder(1024); - - llvm::SmallVector buffer; - - auto i = 0; - auto paths = transform(self.path_pool.paths, [&](llvm::StringRef path) { - auto enrty = - binary::CreatePathEntry(builder, CreateString(builder, self.path_pool.paths[i]), i); - i += 1; - return enrty; - }); - - auto indices = transform(self.indices, [&](auto&& value) { - auto&& [source, index] = value; - return binary::PathMapEntry(source, index); - }); - - auto symbols = transform(self.symbols, [&](auto&& value) { - auto& [symbol_id, symbol] = value; - - buffer.clear(); - buffer.resize_for_overwrite(symbol.reference_files.getSizeInBytes(false)); - symbol.reference_files.write(buffer.data(), false); - - return binary::CreateSymbolEntry( - builder, - symbol_id, - binary::CreateSymbol(builder, symbol.kind.value(), CreateVector(builder, buffer))); - }); - - auto project_index = - binary::CreateProjectIndex(builder, - CreateVector(builder, paths), - CreateStructVector(builder, indices), - CreateVector(builder, symbols)); - - builder.Finish(project_index); - os.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); -} - -ProjectIndex ProjectIndex::from(const void* data) { - auto root = fbs::GetRoot(data); - - ProjectIndex index; - - auto& pool = index.path_pool; - pool.paths.resize(root->paths()->size()); - for(auto entry: *root->paths()) { - auto k = pool.save(entry->path()->string_view()); - pool.paths[entry->id()] = k; - pool.cache.try_emplace(k, entry->id()); - } - - for(auto entry: *root->indices()) { - index.indices.try_emplace(entry->source(), entry->index()); - } - - for(auto entry: *root->symbols()) { - auto& symbol = index.symbols[entry->symbol_id()]; - symbol.kind = SymbolKind(entry->symbol()->kind()); - symbol.reference_files = read_bitmap(entry->symbol()->refs()); - } - - return index; -} - -} // namespace clice::index diff --git a/src/Index/Serialization.h b/src/Index/Serialization.h new file mode 100644 index 00000000..4887de48 --- /dev/null +++ b/src/Index/Serialization.h @@ -0,0 +1,67 @@ +#include "schema_generated.h" +#include "Support/Bitmap.h" +#include "Support/Ranges.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/SmallVector.h" + +namespace clice::index { + +namespace fbs = flatbuffers; + +namespace { + +template +using Offsets = llvm::SmallVector, 0>; + +template +const U* safe_cast(const V* v) { + static_assert(sizeof(U) == sizeof(V), "size mismatch"); + static_assert(alignof(U) == alignof(V), "alignment mismatch"); + static_assert(std::is_trivially_copyable_v && std::is_trivially_copyable_v, + "requires trivially copyable"); + /// If aliasing issues arise, prefer copying into a temporary SmallVector. + return reinterpret_cast(v); +} + +auto CreateString(fbs::FlatBufferBuilder& builder, llvm::StringRef string) { + return builder.CreateString(string.data(), string.size()); +} + +template +auto CreateVector(fbs::FlatBufferBuilder& builder, const Range& range) { + return builder.CreateVector(range.data(), range.size()); +} + +auto CreateVector(fbs::FlatBufferBuilder& builder, const llvm::SmallVector& range) { + return builder.CreateVector(reinterpret_cast(range.data()), range.size()); +} + +template +auto CreateStructVector(fbs::FlatBufferBuilder& builder, const Range& range) { + using V = ranges::range_value_t; + return builder.CreateVectorOfStructs(safe_cast(range.data()), range.size()); +} + +template +auto transform(const Range& range, const Functor& functor) { + using V = ranges::range_value_t; + using R = std::invoke_result_t; + + llvm::SmallVector result; + result.resize_for_overwrite(ranges::size(range)); + + auto i = 0; + for(auto&& v: range) { + result[i] = functor(v); + i += 1; + } + return result; +} + +Bitmap read_bitmap(const fbs::Vector* buffer) { + return Bitmap::read(reinterpret_cast(buffer->data()), false); +} + +} // namespace + +} // namespace clice::index diff --git a/src/Index/TUIndex.cpp b/src/Index/TUIndex.cpp index 5b21f208..30278d95 100644 --- a/src/Index/TUIndex.cpp +++ b/src/Index/TUIndex.cpp @@ -1,6 +1,7 @@ #include "AST/Semantic.h" #include "Index/TUIndex.h" #include "Support/Compare.h" +#include "llvm/Support/SHA256.h" namespace clice::index { @@ -114,7 +115,13 @@ public: std::ranges::sort(index.occurrences, refl::less); auto range = std::ranges::unique(index.occurrences, refl::equal); index.occurrences.erase(range.begin(), range.end()); + + if(fid == unit.interested_file()) { + result.main_file_index = std::move(index); + } } + + result.file_indices.erase(unit.interested_file()); } private: @@ -123,10 +130,42 @@ private: } // namespace +std::array FileIndex::hash() { + llvm::SHA256 hasher; + + using u8 = std::uint8_t; + + if(!occurrences.empty()) { + static_assert(sizeof(Occurrence) == sizeof(Range) + sizeof(SymbolHash)); + static_assert(sizeof(Occurrence) % 8 == 0); + auto data = reinterpret_cast(occurrences.data()); + auto size = occurrences.size() * sizeof(Occurrence); + hasher.update(llvm::ArrayRef(data, size)); + } + + for(auto& [symbol_id, relations]: relations) { + hasher.update(std::bit_cast>(symbol_id)); + static_assert(sizeof(Relation) == + sizeof(RelationKind) + 4 + sizeof(Range) + sizeof(SymbolHash)); + static_assert(sizeof(Relation) % 8 == 0); + + if(!relations.empty()) { + auto data = reinterpret_cast(relations.data()); + auto size = relations.size() * sizeof(Relation); + hasher.update(llvm::ArrayRef(data, size)); + } + } + + return hasher.final(); +} + TUIndex TUIndex::build(CompilationUnit& unit) { TUIndex index; + index.built_at = unit.build_at(); + Builder builder(index, unit); builder.build(); + return index; } diff --git a/src/Server/Indexer.cpp b/src/Server/Indexer.cpp index 51d844bc..39c9bd4d 100644 --- a/src/Server/Indexer.cpp +++ b/src/Server/Indexer.cpp @@ -12,11 +12,16 @@ async::Task<> Indexer::index(llvm::StringRef path) { params.kind = CompilationUnit::Indexing; params.arguments = database.get_command(path).arguments; + auto path_id = project_index.path_pool.path_id(path); + auto& merged_index = get_index(path_id); + if(!merged_index.need_update(project_index.path_pool.paths)) { + logging::info("Check update for {}, not need to update", path); + co_return; + } + /// FIXME: We may want to stop the task in the future. /// params.stop; - /// Check update? - auto tu_index = co_await async::submit([&]() -> std::optional { auto unit = compile(params); if(!unit) { @@ -31,17 +36,25 @@ async::Task<> Indexer::index(llvm::StringRef path) { co_return; } - project_index.merge(*tu_index); + auto path_map = project_index.merge(*tu_index); /// FIXME: Currently, we merge index eagerly, I would like to improve /// this in the future. for(auto& [fid, index]: tu_index->file_indices) { - auto path = tu_index->graph.path(tu_index->graph.path_id(fid)); - auto& merged_index = in_memory_indices[project_index.path_pool.path_id(path)]; - - merged_index.merge(path, tu_index->graph.include_location_id(fid), index); + auto path_id = path_map[tu_index->graph.path_id(fid)]; + auto& merged_index = get_index(path_id); + merged_index.merge(path_id, tu_index->graph.include_location_id(fid), index); } + auto& index = get_index(path_id); + for(auto& include: tu_index->graph.locations) { + include.path_id = path_map[include.path_id]; + } + index.merge(path_id, + tu_index->built_at, + std::move(tu_index->graph.locations), + tu_index->main_file_index); + logging::info("Successfully index {}", path); } @@ -89,12 +102,81 @@ async::Task<> Indexer::index_all() { co_return; } +void Indexer::load_from_disk() { + std::string output_path = path::join(config.project.index_dir, "project.idx"); + if(auto content = fs::read(output_path); content && !content->empty()) { + /// FIXME: from should return a expected ... + project_index = index::ProjectIndex::from(content->data()); + logging::info("Load project index form {} successfully", output_path); + } else { + logging::info("Fail to load project index form {}", output_path); + } + + /// FIXME: check indices update .... +} + +void Indexer::save_to_disk() { + if(auto err = fs::create_directories(config.project.index_dir)) { + logging::warn("Fail to create index output dir: {}, because: {}", + config.project.index_dir, + err); + return; + } + + for(auto& [path_id, index]: in_memory_indices) { + if(index.need_rewrite()) { + auto path = project_index.path_pool.path(path_id); + + std::string output_path; + if(auto it = project_index.indices.find(path_id); it != project_index.indices.end()) { + output_path = project_index.path_pool.path(it->second); + } else { + output_path = path::join( + config.project.index_dir, + std::format("{}.{}.idx", path::filename(path), llvm::xxHash64(path))); + } + + std::error_code err; + llvm::raw_fd_ostream os(output_path, err, fs::CreationDisposition::CD_CreateAlways); + if(err) { + logging::info("Fail to create output index file: {}, because: {}", + output_path, + err); + continue; + } + + index.serialize(os); + + auto opath_id = project_index.path_pool.path_id(output_path); + project_index.indices.try_emplace(path_id, opath_id); + logging::info("Successfully save index for {} to {}", path, output_path); + } + } + + std::string output_path = path::join(config.project.index_dir, "project.idx"); + + std::error_code err; + llvm::raw_fd_ostream os(output_path, err, fs::CreationDisposition::CD_CreateAlways); + if(err) { + logging::info("Fail to create output index file: {}, because: {}", output_path, err); + return; + } + + project_index.serialize(os); + logging::info("Successfully save project index to {}", output_path); +} + auto Indexer::lookup(llvm::StringRef path, std::uint32_t offset, RelationKind kind) -> Result { std::vector locations; auto path_id = project_index.path_pool.path_id(path); - auto index = in_memory_indices[path_id]; - auto occurrences = index.lookup(offset); + auto& index = get_index(path_id); + + llvm::SmallVector occurrences; + index.lookup(offset, [&occurrences](const index::Occurrence& o) { + occurrences.emplace_back(o); + return true; + }); if(occurrences.empty()) { co_return locations; } @@ -105,16 +187,15 @@ auto Indexer::lookup(llvm::StringRef path, std::uint32_t offset, RelationKind ki /// FIXME: We may want to parallelize this ... for(auto file: refs) { - auto& relations = in_memory_indices[file].relations[symbol_id]; - std::vector results; - for(auto& [relation, _]: relations) { - if(relation.kind & kind) { - results.emplace_back(relation.range); - } - } + get_index(file).lookup(symbol_id, kind, [&results](const index::Relation& r) { + results.emplace_back(r.range); + return true; + }); llvm::StringRef path = project_index.path_pool.path(file); + + /// FIXME: Use the content stored in the merged index. auto content = fs::read(path); if(!content) { continue; @@ -127,7 +208,7 @@ auto Indexer::lookup(llvm::StringRef path, std::uint32_t offset, RelationKind ki for(auto result: results) { auto begin = converter.toPosition(result.begin); auto end = converter.toPosition(result.end); - locations.emplace_back(path.str(), proto::Range(begin, end)); + locations.emplace_back(mapping.to_uri(path), proto::Range(begin, end)); } } diff --git a/src/Server/Lifecycle.cpp b/src/Server/Lifecycle.cpp index a45f9b03..489e23f1 100644 --- a/src/Server/Lifecycle.cpp +++ b/src/Server/Lifecycle.cpp @@ -96,6 +96,7 @@ async::Task Server::on_initialize(proto::InitializeParams params) { } async::Task<> Server::on_initialized(proto::InitializedParams) { + indexer.load_from_disk(); co_await indexer.index_all(); co_return; } @@ -106,6 +107,7 @@ async::Task Server::on_shutdown(proto::ShutdownParams params) { async::Task<> Server::on_exit(proto::ExitParams params) { save_cache_info(); + indexer.save_to_disk(); async::stop(); co_return; } diff --git a/src/Server/Server.cpp b/src/Server/Server.cpp index 2bdae212..8c60249a 100644 --- a/src/Server/Server.cpp +++ b/src/Server/Server.cpp @@ -95,7 +95,7 @@ async::Task<> Server::registerCapacity(llvm::StringRef id, }); } -Server::Server() : indexer(database) { +Server::Server() : indexer(database, config) { register_callback<&Server::on_initialize>("initialize"); register_callback<&Server::on_initialized>("initialized"); register_callback<&Server::on_shutdown>("shutdown"); diff --git a/tests/unit/Index/MergedIndex.cpp b/tests/unit/Index/MergedIndex.cpp index c3ba7bf3..8192ba57 100644 --- a/tests/unit/Index/MergedIndex.cpp +++ b/tests/unit/Index/MergedIndex.cpp @@ -58,7 +58,7 @@ suite<"MergedIndex"> suite = [] { auto& graph = tu_index.graph; for(auto& [fid, index]: tu_index.file_indices) { llvm::StringRef path = graph.paths[graph.path_id(fid)]; - merged_indices[path].merge("main.cpp", graph.include_location_id(fid), index); + merged_indices[path].merge(0, graph.include_location_id(fid), index); } for(auto& [path, merged]: merged_indices) { @@ -67,10 +67,8 @@ suite<"MergedIndex"> suite = [] { merged.serialize(os); - index::MergedIndexView view(s.data()); - auto merged2 = view.deserialize(); - - expect(merged == merged2); + auto view = index::MergedIndex(s); + expect(merged == view); } }; }; diff --git a/tests/unit/Index/ProjectIndex.cpp b/tests/unit/Index/ProjectIndex.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/Index/TUIndex.cpp b/tests/unit/Index/TUIndex.cpp index 1af2fc01..e65d6ce7 100644 --- a/tests/unit/Index/TUIndex.cpp +++ b/tests/unit/Index/TUIndex.cpp @@ -22,7 +22,8 @@ suite<"TUIndex"> suite = [] { llvm::StringRef file = "") -> std::vector { auto offset = tester.point(pos, file); auto fid = file.empty() ? tester.unit->interested_file() : tester.unit->file_id(file); - auto& index = tu_index.file_indices[fid]; + auto& index = fid == tester.unit->interested_file() ? tu_index.main_file_index + : tu_index.file_indices[fid]; auto it = std::ranges::lower_bound( index.occurrences, @@ -74,7 +75,8 @@ suite<"TUIndex"> suite = [] { dump(range)); auto fid = file.empty() ? tester.unit->interested_file() : tester.unit->file_id(file); - auto& index = tu_index.file_indices[fid]; + auto& index = fid == tester.unit->interested_file() ? tu_index.main_file_index + : tu_index.file_indices[fid]; auto it = index.relations.find(occurrences.front().target); fatal / expect(it != index.relations.end(), location) @@ -98,8 +100,7 @@ suite<"TUIndex"> suite = [] { } )"); - expect(eq(tu_index.file_indices.size(), 1)); - auto& index = tu_index.file_indices.begin()->second; + auto& index = tu_index.main_file_index; expect(eq(index.relations.size(), 2)); expect(eq(index.occurrences.size(), 3));