1 Commits

Author SHA1 Message Date
ykiko
d241ea8492 refactor(index): migrate FlatBuffers from flatc IDL to kotatsu reflection
Replace the flatc-generated serialization layer with kotatsu's arena codec
driven directly by the in-memory index types. No hand-written DTOs: the
on-wire layout is derived from reflection over the existing structs, with
type-level customization where needed.

- Drop `schema.fbs`, `serialization.h`, and the flatc build step
- Delete `wire_types.h` — no more parallel wire representation
- Add `kotatsu_adapters.h` with `kota::codec::serialize_traits` /
  `deserialize_traits` specializations for RelationKind, SymbolKind, Bitmap,
  and std::chrono::milliseconds
- Mark runtime-only FileID-keyed maps with `kota::meta::skip<>` so they
  are excluded from reflection slots; serialize via `main_file_index` and
  `path_file_indices` (keyed by path id)
- Restore MergedIndex's dual dispatch: in-memory path when `impl` is live,
  lazy flatbuffers path via `kfb::table_view<Impl>::from_bytes()` and
  `root[&Impl::field]` proxy access when only the buffer is held
- Add default member initializers to LocalSourceRange, padding field to
  Relation, and a path_id lookup struct to IncludeLocation so reflection
  picks up all stored state
- Propagate buffer size through `TUIndex::from` / `ProjectIndex::from`
  (kota codec requires an explicit size for bounds verification)

All 551 unit tests pass; 9 environment-gated integration tests skipped.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-20 09:51:28 +08:00
16 changed files with 397 additions and 671 deletions

View File

@@ -124,42 +124,21 @@ if(CLICE_CI_ENVIRONMENT)
target_compile_definitions(clice_options INTERFACE CLICE_CI_ENVIRONMENT=1)
endif()
set(FBS_SCHEMA_FILE "${PROJECT_SOURCE_DIR}/src/index/schema.fbs")
set(GENERATED_HEADER "${PROJECT_BINARY_DIR}/generated/schema_generated.h")
if(CMAKE_CROSSCOMPILING)
find_program(FLATC_EXECUTABLE flatc REQUIRED)
set(FLATC_CMD "${FLATC_EXECUTABLE}")
else()
set(FLATC_CMD "$<TARGET_FILE:flatc>")
endif()
add_custom_command(
OUTPUT "${GENERATED_HEADER}"
COMMAND ${FLATC_CMD} --cpp -o "${PROJECT_BINARY_DIR}/generated" "${FBS_SCHEMA_FILE}"
DEPENDS "${FBS_SCHEMA_FILE}"
COMMENT "Generating C++ header from ${FBS_SCHEMA_FILE}"
)
add_custom_target(generate_flatbuffers_schema DEPENDS "${GENERATED_HEADER}")
file(GLOB_RECURSE CLICE_CORE_SOURCES CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/src/*.cpp")
add_library(clice-core STATIC ${CLICE_CORE_SOURCES})
add_library(clice::core ALIAS clice-core)
add_dependencies(clice-core generate_flatbuffers_schema)
target_include_directories(clice-core PUBLIC
"${PROJECT_SOURCE_DIR}/src"
"${PROJECT_BINARY_DIR}/generated"
)
target_link_libraries(clice-core PUBLIC
clice_options
llvm-libs
spdlog::spdlog
roaring::roaring
flatbuffers
kota::ipc::lsp
kota::codec::toml
kota::codec::flatbuffers
simdjson::simdjson
)

View File

@@ -27,21 +27,10 @@ FetchContent_Declare(
set(ENABLE_ROARING_TESTS OFF CACHE INTERNAL "" FORCE)
set(ENABLE_ROARING_MICROBENCHMARKS OFF CACHE INTERNAL "" FORCE)
# flatbuffers
FetchContent_Declare(
flatbuffers
GIT_REPOSITORY https://github.com/google/flatbuffers.git
GIT_TAG v25.9.23
GIT_SHALLOW TRUE
)
set(FLATBUFFERS_BUILD_GRPC OFF CACHE BOOL "" FORCE)
set(FLATBUFFERS_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(FLATBUFFERS_BUILD_FLATHASH OFF CACHE BOOL "" FORCE)
FetchContent_Declare(
kotatsu
GIT_REPOSITORY https://github.com/clice-io/kotatsu
GIT_TAG main
GIT_TAG refactor/flatbuffers-schema-driven
GIT_SHALLOW TRUE
)
@@ -50,7 +39,8 @@ set(KOTA_ENABLE_TEST OFF)
set(KOTA_CODEC_ENABLE_SIMDJSON ON)
set(KOTA_CODEC_ENABLE_YYJSON ON)
set(KOTA_CODEC_ENABLE_TOML ON)
set(KOTA_CODEC_ENABLE_FLATBUFFERS ON)
set(KOTA_ENABLE_EXCEPTIONS OFF)
set(KOTA_ENABLE_RTTI OFF)
FetchContent_MakeAvailable(kotatsu spdlog croaring flatbuffers)
FetchContent_MakeAvailable(kotatsu spdlog croaring)

View File

@@ -7,6 +7,7 @@
#include "syntax/token.h"
#include "kota/meta/annotation.h"
#include "llvm/ADT/DenseMap.h"
namespace clice {
@@ -42,7 +43,10 @@ struct IncludeGraph {
/// Each `FileID` represents a new header context and is introduced
/// by a new include directive. So a include directive is a new header
/// context. A map between FileID and its include location.
llvm::DenseMap<clang::FileID, std::uint32_t> file_table;
///
/// Runtime-only: `clang::FileID` is an AST-scoped handle; on-disk the
/// include graph is fully described by `paths` + `locations`.
kota::meta::skip<llvm::DenseMap<clang::FileID, std::uint32_t>> file_table;
static IncludeGraph from(CompilationUnitRef unit);

View File

@@ -0,0 +1,121 @@
#pragma once
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>
#include "semantic/relation_kind.h"
#include "semantic/symbol_kind.h"
#include "support/bitmap.h"
#include "kota/codec/arena/traits.h"
#include "kota/codec/detail/fwd.h"
/// Type-level wire traits for clice index types.
///
/// These partially specialize the primary
/// `kota::codec::serialize_traits<S, T>` / `deserialize_traits<D, T>`
/// templates, constrained so only arena backends pick them up. They
/// declare the wire representation for `T` and propagate through map
/// values, sequence elements, and nested containers — no per-field
/// `annotation<T, with<...>>` required.
namespace kota::codec {
/// `std::chrono::milliseconds` ⇄ `int64` tick count.
///
/// Serialize side: for arena serializers the duration is written as its raw
/// millisecond tick count, widened to a signed 64-bit integer.
template <typename S>
    requires arena::arena_serializer_like<S>
struct serialize_traits<S, std::chrono::milliseconds> {
    /// Representation stored in the buffer.
    using wire_type = std::int64_t;

    /// Returns the tick count of `value`. The serializer argument is unused.
    static auto serialize(S&, std::chrono::milliseconds value) noexcept -> std::int64_t {
        const std::int64_t ticks = value.count();
        return ticks;
    }
};
/// Deserialize side of the `std::chrono::milliseconds` mapping: for arena
/// deserializers the stored `int64` is interpreted as a millisecond tick count.
template <typename D>
    requires arena::arena_deserializer_like<D>
struct deserialize_traits<D, std::chrono::milliseconds> {
    /// Representation stored in the buffer.
    using wire_type = std::int64_t;

    /// Rebuilds the duration from `value` ticks. The deserializer argument is unused.
    static auto deserialize(const D&, std::int64_t value) noexcept -> std::chrono::milliseconds {
        const std::chrono::milliseconds duration(value);
        return duration;
    }
};
/// `RelationKind` ⇄ underlying `uint32` bitflags.
///
/// Serialize side: arena serializers store the value reported by
/// `RelationKind::value()` as a 32-bit unsigned integer.
template <typename S>
    requires arena::arena_serializer_like<S>
struct serialize_traits<S, clice::RelationKind> {
    /// Representation stored in the buffer.
    using wire_type = std::uint32_t;

    /// Returns the flag bits of `k`. The serializer argument is unused.
    static auto serialize(S&, const clice::RelationKind& k) noexcept -> std::uint32_t {
        const std::uint32_t flags = k.value();
        return flags;
    }
};
/// Deserialize side of the `RelationKind` mapping: the stored `uint32` is
/// cast to `RelationKind::Kind` and wrapped back into a `RelationKind`.
template <typename D>
    requires arena::arena_deserializer_like<D>
struct deserialize_traits<D, clice::RelationKind> {
    /// Representation stored in the buffer.
    using wire_type = std::uint32_t;

    /// Rebuilds the kind from the raw bits `v`. The deserializer argument is unused.
    static auto deserialize(const D&, std::uint32_t v) noexcept -> clice::RelationKind {
        const auto flags = static_cast<clice::RelationKind::Kind>(v);
        return clice::RelationKind(flags);
    }
};
/// `SymbolKind` ⇄ underlying `uint8`.
///
/// Serialize side: arena serializers store the value reported by
/// `SymbolKind::value()` as a single unsigned byte.
template <typename S>
    requires arena::arena_serializer_like<S>
struct serialize_traits<S, clice::SymbolKind> {
    /// Representation stored in the buffer.
    using wire_type = std::uint8_t;

    /// Returns the numeric value of `k`. The serializer argument is unused.
    static auto serialize(S&, const clice::SymbolKind& k) noexcept -> std::uint8_t {
        const std::uint8_t raw = k.value();
        return raw;
    }
};
/// Deserialize side of the `SymbolKind` mapping: the stored `uint8` is used
/// to construct a `SymbolKind`.
template <typename D>
    requires arena::arena_deserializer_like<D>
struct deserialize_traits<D, clice::SymbolKind> {
    /// Representation stored in the buffer.
    using wire_type = std::uint8_t;

    /// Rebuilds the kind from the raw value `v`. The deserializer argument is unused.
    static auto deserialize(const D&, std::uint8_t v) noexcept -> clice::SymbolKind {
        return static_cast<clice::SymbolKind>(v);
    }
};
/// `clice::Bitmap` (= `roaring::Roaring`) ⇄ opaque byte blob produced by
/// Roaring's non-portable serialization (matches the legacy wire format).
///
/// Serialize side: an empty bitmap is written as a zero-length blob; any
/// other bitmap is written with `write(..., false)` into a buffer sized by
/// `getSizeInBytes(false)`.
template <typename S>
    requires arena::arena_serializer_like<S>
struct serialize_traits<S, clice::Bitmap> {
    /// Representation stored in the buffer.
    using wire_type = std::vector<std::byte>;

    /// Returns the serialized bytes of `bitmap`. The serializer argument is unused.
    static std::vector<std::byte> serialize(S&, const clice::Bitmap& bitmap) {
        // Empty bitmaps are encoded as "no bytes" so the reader can skip them.
        if(bitmap.isEmpty()) {
            return {};
        }

        std::vector<std::byte> blob(bitmap.getSizeInBytes(false));
        bitmap.write(reinterpret_cast<char*>(blob.data()), false);
        return blob;
    }
};
/// Deserialize side of the `clice::Bitmap` mapping: a zero-length blob yields
/// a default-constructed bitmap; anything else is handed to
/// `Bitmap::read(..., false)`.
template <typename D>
    requires arena::arena_deserializer_like<D>
struct deserialize_traits<D, clice::Bitmap> {
    /// Representation stored in the buffer.
    using wire_type = std::vector<std::byte>;

    /// Rebuilds a bitmap from `bytes`. The deserializer argument is unused.
    ///
    /// NOTE(review): `read` is given only a pointer, not `bytes.size()`, so
    /// this presumably relies on the blob being well-formed — confirm whether
    /// a length-checked read is needed for index files that may be truncated.
    static clice::Bitmap deserialize(const D&, std::vector<std::byte> bytes) {
        if(!bytes.empty()) {
            const auto* raw = reinterpret_cast<const char*>(bytes.data());
            return clice::Bitmap::read(raw, false);
        }
        return clice::Bitmap();
    }
};
} // namespace kota::codec

View File

@@ -1,11 +1,18 @@
#include "index/merged_index.h"
#include <cassert>
#include <cstdint>
#include <ranges>
#include <span>
#include <tuple>
#include "index/serialization.h"
#include "index/kotatsu_adapters.h" // type_adapter specializations
#include "support/filesystem.h"
#include "kota/codec/flatbuffers/deserializer.h"
#include "kota/codec/flatbuffers/proxy.h"
#include "kota/codec/flatbuffers/serializer.h"
#include "kota/meta/annotation.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/raw_os_ostream.h"
@@ -97,7 +104,7 @@ struct CompilationContext {
std::uint32_t canonical_id = 0;
std::uint64_t build_at;
std::uint64_t build_at = 0;
std::vector<IncludeLocation> include_locations;
@@ -125,8 +132,9 @@ struct MergedIndex::Impl {
/// The max canonical id we have allocated.
std::uint32_t max_canonical_id = 0;
/// The reference count of each canonical id.
std::vector<std::uint32_t> canonical_ref_counts;
/// Reference counts per canonical id — derivable from header/compilation
/// contexts at load time, so it doesn't need to live on the wire.
kota::meta::skip<std::vector<std::uint32_t>> canonical_ref_counts;
/// The canonical id set of removed index.
roaring::Roaring removed;
@@ -137,8 +145,8 @@ struct MergedIndex::Impl {
/// All merged symbol relations.
llvm::DenseMap<SymbolHash, llvm::DenseMap<Relation, roaring::Roaring>> relations;
/// Sorted occurrences cache for fast lookup.
std::vector<Occurrence> occurrences_cache;
/// Sorted occurrences cache for fast lookup — rebuilt on demand.
kota::meta::skip<std::vector<Occurrence>> occurrences_cache;
void merge(this Impl& self, std::uint32_t path_id, FileIndex& index, auto&& add_context) {
auto hash = index.hash();
@@ -172,6 +180,18 @@ struct MergedIndex::Impl {
friend bool operator==(const Impl&, const Impl&) = default;
};
namespace {
namespace kfb = kota::codec::flatbuffers;
/// Views the contents of `buffer` as a span of unsigned bytes.
///
/// No data is copied: the returned span points into the memory reported by
/// `getBufferStart()` and covers `getBufferSize()` bytes.
std::span<const std::uint8_t> buffer_bytes(const llvm::MemoryBuffer& buffer) {
    const auto* first = reinterpret_cast<const std::uint8_t*>(buffer.getBufferStart());
    const auto length = buffer.getBufferSize();
    return std::span<const std::uint8_t>(first, length);
}
} // namespace
MergedIndex::MergedIndex(std::unique_ptr<llvm::MemoryBuffer> buffer, std::unique_ptr<Impl> impl) :
buffer(std::move(buffer)), impl(std::move(impl)) {}
@@ -196,65 +216,24 @@ void MergedIndex::load_in_memory(this Self& self) {
return;
}
auto bytes = buffer_bytes(*self.buffer);
auto result = kfb::from_flatbuffer(bytes, *self.impl);
if(!result) {
self.buffer.reset();
return;
}
// Rebuild the ref count table from the already-loaded contexts.
auto& index = *self.impl;
auto root = fbs::GetRoot<binary::MergedIndex>(self.buffer->getBufferStart());
index.max_canonical_id = root->max_canonical_id();
for(auto entry: *root->canonical_cache()) {
index.canonical_cache.try_emplace(entry->sha256()->string_view(), entry->canonical_id());
}
index.canonical_ref_counts.clear();
index.canonical_ref_counts.resize(index.max_canonical_id, 0);
for(auto entry: *root->header_contexts()) {
HeaderContext context;
auto path = entry->path_id();
context.version = entry->version();
for(auto include: *entry->includes()) {
index.canonical_ref_counts[include->canonical_id()] += 1;
context.includes.emplace_back(*safe_cast<IncludeContext>(include));
}
index.header_contexts.try_emplace(path, std::move(context));
}
for(auto entry: *root->compilation_contexts()) {
CompilationContext context;
auto path = entry->path_id();
context.version = entry->version();
context.canonical_id = entry->canonical_id();
context.build_at = entry->build_at();
for(auto include: *entry->include_locations()) {
context.include_locations.emplace_back(*safe_cast<IncludeLocation>(include));
}
index.compilation_contexts.try_emplace(path, std::move(context));
}
// Count ref counts from compilation contexts.
for(auto entry: *root->compilation_contexts()) {
index.canonical_ref_counts[entry->canonical_id()] += 1;
}
// Deserialize removed bitmap.
if(root->removed() && root->removed()->size() > 0) {
index.removed = read_bitmap(root->removed());
}
for(auto entry: *root->occurrences()) {
index.occurrences.try_emplace(*safe_cast<Occurrence>(entry->occurrence()),
read_bitmap(entry->context()));
}
for(auto entry: *root->relations()) {
auto& relations = index.relations[entry->symbol()];
for(auto relation_entry: *entry->relations()) {
relations.try_emplace(*safe_cast<Relation>(relation_entry->relation()),
read_bitmap(relation_entry->context()));
for(auto& [_, ctx]: index.header_contexts) {
for(auto& inc: ctx.includes) {
index.canonical_ref_counts[inc.canonical_id] += 1;
}
}
if(root->content()) {
index.content = root->content()->str();
for(auto& [_, ctx]: index.compilation_contexts) {
index.canonical_ref_counts[ctx.canonical_id] += 1;
}
self.buffer.reset();
@@ -279,100 +258,9 @@ void MergedIndex::serialize(this const Self& self, llvm::raw_ostream& out) {
return;
}
auto& index = self.impl;
fbs::FlatBufferBuilder builder(1024);
llvm::SmallVector<char, 1024> buffer;
auto canonical_cache = transform(index->canonical_cache, [&](auto&& value) {
auto&& [hash, canonical_id] = value;
return binary::CreateCacheEntry(builder, CreateString(builder, hash), canonical_id);
});
auto header_contexts = transform(index->header_contexts, [&](auto&& value) {
auto& [path_id, context] = value;
return binary::CreateHeaderContextEntry(
builder,
path_id,
context.version,
CreateStructVector<binary::IncludeContext>(builder, context.includes));
});
auto compilation_contexts = transform(index->compilation_contexts, [&](auto&& value) {
auto& [path_id, context] = value;
return binary::CreateCompilationContextEntry(
builder,
path_id,
context.version,
context.canonical_id,
context.build_at,
CreateStructVector<binary::IncludeLocation>(builder, context.include_locations));
});
llvm::SmallVector<const Occurrence*> occurrence_keys;
occurrence_keys.reserve(index->occurrences.size());
auto occurrences = transform(index->occurrences, [&](auto&& value) {
auto&& [occurrence, bitmap] = value;
buffer.clear();
buffer.resize_for_overwrite(bitmap.getSizeInBytes(false));
bitmap.write(buffer.data(), false);
occurrence_keys.emplace_back(&occurrence);
return binary::CreateOccurrenceEntry(builder,
safe_cast<binary::Occurrence>(&occurrence),
CreateVector(builder, buffer));
});
std::ranges::sort(std::views::zip(occurrence_keys, occurrences), [](auto lhs, auto rhs) {
const auto& lo = *std::get<0>(lhs);
const auto& ro = *std::get<0>(rhs);
return std::tuple(lo.range.begin, lo.range.end, lo.target) <
std::tuple(ro.range.begin, ro.range.end, ro.target);
});
llvm::SmallVector<std::uint64_t> relation_keys;
relation_keys.reserve(index->relations.size());
auto relations = transform(index->relations, [&](auto&& value) {
auto&& [symbol_id, symbol_relations] = value;
auto relations = transform(symbol_relations, [&](auto&& value) {
auto&& [relation, bitmap] = value;
buffer.clear();
buffer.resize_for_overwrite(bitmap.getSizeInBytes(false));
bitmap.write(buffer.data(), false);
return binary::CreateRelationEntry(builder,
safe_cast<binary::Relation>(&relation),
CreateVector(builder, buffer));
});
relation_keys.emplace_back(symbol_id);
return binary::CreateSymbolRelationsEntry(builder,
symbol_id,
CreateVector(builder, relations));
});
std::ranges::sort(std::views::zip(relation_keys, relations), {}, [](auto e) {
return std::get<0>(e);
});
// Serialize removed bitmap.
buffer.clear();
if(!index->removed.isEmpty()) {
buffer.resize_for_overwrite(index->removed.getSizeInBytes(false));
index->removed.write(buffer.data(), false);
}
auto removed = CreateVector(builder, buffer);
auto content_offset = CreateString(builder, index->content);
auto merged_index = binary::CreateMergedIndex(builder,
index->max_canonical_id,
CreateVector(builder, canonical_cache),
CreateVector(builder, header_contexts),
CreateVector(builder, compilation_contexts),
CreateVector(builder, occurrences),
CreateVector(builder, relations),
removed,
content_offset);
builder.Finish(merged_index);
out.write(safe_cast<char>(builder.GetBufferPointer()), builder.GetSize());
auto bytes = kfb::to_flatbuffer(*self.impl);
assert(bytes && "MergedIndex flatbuffer serialization failed");
out.write(reinterpret_cast<const char*>(bytes->data()), bytes->size());
}
void MergedIndex::lookup(this const Self& self,
@@ -420,25 +308,43 @@ void MergedIndex::lookup(this const Self& self,
break;
}
} else if(self.buffer) {
auto index = fbs::GetRoot<binary::MergedIndex>(self.buffer->getBufferStart());
auto& occurrences = *index->occurrences();
// Lazy path: binary-search the sorted occurrences array directly in
// the flatbuffer without materializing the in-memory Impl.
auto root = kfb::table_view<Impl>::from_bytes(buffer_bytes(*self.buffer));
auto entries = root[&Impl::occurrences];
auto it = std::ranges::lower_bound(occurrences, offset, {}, [](auto o) {
return o->occurrence()->range().end();
});
auto read_occurrence = [](auto occ_view) -> Occurrence {
auto range_view = occ_view[&Occurrence::range];
return Occurrence{
LocalSourceRange{range_view[&LocalSourceRange::begin],
range_view[&LocalSourceRange::end]},
occ_view[&Occurrence::target],
};
};
while(it != occurrences.end()) {
auto o = safe_cast<Occurrence>(it->occurrence());
if(o->range.contains(offset)) {
if(!callback(*o)) {
break;
}
it++;
continue;
const std::size_t count = entries.size();
std::size_t lo = 0;
std::size_t hi = count;
while(lo < hi) {
auto mid = lo + (hi - lo) / 2;
auto entry = entries.at(mid);
auto range_view = entry.template get<0>()[&Occurrence::range];
if(range_view[&LocalSourceRange::end] < offset) {
lo = mid + 1;
} else {
hi = mid;
}
}
break;
for(; lo < count; ++lo) {
auto entry = entries.at(lo);
auto occurrence = read_occurrence(entry.template get<0>());
if(!occurrence.range.contains(offset)) {
break;
}
if(!callback(occurrence)) {
break;
}
}
}
}
@@ -470,18 +376,31 @@ void MergedIndex::lookup(this const Self& self,
}
}
} else if(self.buffer) {
auto index = fbs::GetRoot<binary::MergedIndex>(self.buffer->getBufferStart());
auto& entries = *index->relations();
auto it = std::ranges::lower_bound(entries, symbol, {}, [](auto e) { return e->symbol(); });
if(it == entries.end() || it->symbol() != symbol) [[unlikely]] {
// Lazy path: binary-search the outer relations map and iterate the
// inner map without materializing Impl.
auto root = kfb::table_view<Impl>::from_bytes(buffer_bytes(*self.buffer));
auto outer = root[&Impl::relations];
auto entry = outer.find(symbol);
if(!entry) {
return;
}
for(auto entry: *it->relations()) {
auto r = safe_cast<Relation>(entry->relation());
if(r->kind & kind) {
if(!callback(*r)) {
auto inner = entry->template get<1>();
const std::size_t count = inner.size();
for(std::size_t i = 0; i < count; ++i) {
auto rel_view = inner.at(i).template get<0>();
// Kind comes back as the wire uint32 via the type_adapter; rewrap it.
auto relation_kind =
RelationKind(static_cast<RelationKind::Kind>(rel_view[&Relation::kind]));
if(relation_kind & kind) {
auto range_view = rel_view[&Relation::range];
Relation relation{
.kind = relation_kind,
.padding = rel_view[&Relation::padding],
.range = LocalSourceRange{range_view[&LocalSourceRange::begin],
range_view[&LocalSourceRange::end]},
.target_symbol = rel_view[&Relation::target_symbol],
};
if(!callback(relation)) {
break;
}
}
@@ -516,25 +435,31 @@ bool MergedIndex::need_update(this const Self& self, llvm::ArrayRef<llvm::String
return false;
} else if(self.buffer) {
auto index = fbs::GetRoot<binary::MergedIndex>(self.buffer->getBufferStart());
if(index->compilation_contexts()->empty()) {
auto root = kfb::table_view<Impl>::from_bytes(buffer_bytes(*self.buffer));
auto contexts = root[&Impl::compilation_contexts];
if(contexts.empty()) {
return true;
}
auto context = *index->compilation_contexts()->begin();
auto context = contexts.at(0).template get<1>();
auto build_at = context[&CompilationContext::build_at];
auto include_locations = context[&CompilationContext::include_locations];
llvm::DenseSet<std::uint32_t> deps;
for(auto location: *context->include_locations()) {
auto [_, success] = deps.insert(location->path_id());
const std::size_t count = include_locations.size();
for(std::size_t i = 0; i < count; ++i) {
auto location = include_locations.at(i);
auto path_id = location[&IncludeLocation::path_id];
auto [_, success] = deps.insert(path_id);
if(success) {
fs::file_status status;
if(auto err = fs::status(path_mapping[location->path_id()], status)) {
if(auto err = fs::status(path_mapping[path_id], status)) {
return true;
}
auto time = std::chrono::duration_cast<std::chrono::milliseconds>(
status.getLastModificationTime().time_since_epoch());
if(time.count() > context->build_at()) {
if(time.count() > build_at) {
return true;
}
}
@@ -616,10 +541,9 @@ llvm::StringRef MergedIndex::content(this const Self& self) {
if(self.impl) {
return self.impl->content;
} else if(self.buffer) {
auto root = fbs::GetRoot<binary::MergedIndex>(self.buffer->getBufferStart());
if(root->content()) {
return root->content()->string_view();
}
auto root = kfb::table_view<Impl>::from_bytes(buffer_bytes(*self.buffer));
auto view = root[&Impl::content];
return llvm::StringRef(view.data(), view.size());
}
return {};
}

View File

@@ -1,9 +1,22 @@
#include "index/project_index.h"
#include "index/serialization.h"
#include <cassert>
#include <cstdint>
#include <span>
#include "index/kotatsu_adapters.h" // type_adapter specializations
#include "kota/codec/flatbuffers/deserializer.h"
#include "kota/codec/flatbuffers/serializer.h"
namespace clice::index {
namespace {
namespace kfb = kota::codec::flatbuffers;
} // namespace
llvm::SmallVector<std::uint32_t> ProjectIndex::merge(this ProjectIndex& self, TUIndex& index) {
auto& paths = index.graph.paths;
llvm::SmallVector<std::uint32_t> file_ids_map;
@@ -28,79 +41,22 @@ llvm::SmallVector<std::uint32_t> ProjectIndex::merge(this ProjectIndex& self, TU
}
void ProjectIndex::serialize(this ProjectIndex& self, llvm::raw_ostream& os) {
fbs::FlatBufferBuilder builder(1024);
llvm::SmallVector<char, 1024> buffer;
auto i = 0;
auto paths = transform(self.path_pool.paths, [&](llvm::StringRef path) {
auto entry =
binary::CreatePathEntry(builder, CreateString(builder, self.path_pool.paths[i]), i);
i += 1;
return entry;
});
auto indices = transform(self.indices, [&](auto&& value) {
auto&& [source, index] = value;
return binary::PathMapEntry(source, index);
});
auto symbols = transform(self.symbols, [&](auto&& value) {
auto& [symbol_id, symbol] = value;
buffer.clear();
buffer.resize_for_overwrite(symbol.reference_files.getSizeInBytes(false));
symbol.reference_files.write(buffer.data(), false);
return binary::CreateSymbolEntry(builder,
symbol_id,
binary::CreateSymbol(builder,
CreateString(builder, symbol.name),
symbol.kind.value(),
CreateVector(builder, buffer)));
});
auto project_index =
binary::CreateProjectIndex(builder,
CreateVector(builder, paths),
CreateStructVector<binary::PathMapEntry>(builder, indices),
CreateVector(builder, symbols));
builder.Finish(project_index);
os.write(safe_cast<const char>(builder.GetBufferPointer()), builder.GetSize());
auto bytes = kfb::to_flatbuffer(self);
assert(bytes && "ProjectIndex flatbuffer serialization failed");
os.write(reinterpret_cast<const char*>(bytes->data()), bytes->size());
}
ProjectIndex ProjectIndex::from(const void* data) {
auto root = fbs::GetRoot<binary::ProjectIndex>(data);
ProjectIndex ProjectIndex::from(const void* data, std::size_t size) {
ProjectIndex index;
auto& pool = index.path_pool;
pool.paths.resize(root->paths()->size());
for(auto entry: *root->paths()) {
// Normalize backslashes to forward slashes for cross-platform consistency
// (persisted index may contain native-separator paths from Windows).
llvm::SmallString<256> normalized(entry->path()->string_view());
std::replace(normalized.begin(), normalized.end(), '\\', '/');
auto k = pool.save(normalized.str());
pool.paths[entry->id()] = k;
pool.cache.try_emplace(k, entry->id());
if(data == nullptr || size == 0) {
return index;
}
for(auto entry: *root->indices()) {
index.indices.try_emplace(entry->source(), entry->index());
std::span<const std::uint8_t> bytes(static_cast<const std::uint8_t*>(data), size);
auto result = kfb::from_flatbuffer(bytes, index);
if(!result) {
return ProjectIndex();
}
for(auto entry: *root->symbols()) {
auto& symbol = index.symbols[entry->symbol_id()];
auto* fb_symbol = entry->symbol();
if(auto* name = fb_symbol->name()) {
symbol.name = name->str();
}
symbol.kind = SymbolKind(static_cast<std::uint8_t>(fb_symbol->kind()));
symbol.reference_files = read_bitmap(fb_symbol->refs());
}
return index;
}

View File

@@ -2,10 +2,14 @@
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>
#include "index/tu_index.h"
#include "kota/codec/arena/traits.h"
#include "kota/codec/detail/fwd.h"
#include "kota/support/expected_try.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -84,7 +88,71 @@ struct ProjectIndex {
void serialize(this ProjectIndex& self, llvm::raw_ostream& os);
static ProjectIndex from(const void* data);
static ProjectIndex from(const void* data, std::size_t size);
};
} // namespace clice::index
namespace kota::codec {
/// `PathPool` on the wire is a flat list of path strings; a path's id is its
/// position in that list. The pool's allocator and reverse cache are not
/// written.
///
/// Serialization is streaming: every entry of `pool.paths` is allocated as a
/// string directly in the serializer and the resulting references are then
/// stored as one string vector. This avoids the extra copy that converting
/// to a `std::vector<std::string>` value first would introduce.
template <typename S>
    requires arena::arena_serializer_like<S>
struct serialize_traits<S, clice::index::PathPool> {
    // Structural wire shape — declared so the flatbuffers proxy views a
    // `PathPool` field as an `array_view<std::string>`.
    using wire_type = std::vector<std::string>;

    /// Writes all paths of `pool` through `s`.
    ///
    /// Returns the reference to the allocated string vector, or the first
    /// error reported by `alloc_string`; in that case no vector is allocated.
    static auto serialize(S& s, const clice::index::PathPool& pool)
        -> std::expected<typename S::vector_ref, typename S::error_type> {
        using string_ref = typename S::string_ref;

        std::vector<string_ref> refs;
        refs.reserve(pool.paths.size());

        for(const auto& path: pool.paths) {
            const std::string_view text(path.data(), path.size());
            auto allocated = s.alloc_string(text);
            if(!allocated) {
                return std::unexpected(allocated.error());
            }
            refs.push_back(*allocated);
        }

        const std::span<const string_ref> all_refs(refs.data(), refs.size());
        return s.alloc_string_vector(all_refs);
    }
};
/// Streaming deserialize for `PathPool`: each path is read straight from the
/// buffer's string-vector view and interned into the pool through
/// `PathPool::save`, so no intermediate `std::vector<std::string>` is built.
template <typename D>
    requires arena::arena_deserializer_like<D>
struct deserialize_traits<D, clice::index::PathPool> {
    using wire_type = std::vector<std::string>;

    /// Fills `out` from slot `sid` of `view`.
    ///
    /// - If the slot is absent, `out` is left untouched and success is returned.
    /// - Every stored path has its backslashes rewritten to forward slashes
    ///   before it is interned.
    /// - `out.paths[i]` receives the interned path and `out.cache` gets an
    ///   entry for it with index `i` (inserted with `try_emplace`).
    ///
    /// Errors from `get_string_vector` are handled by `KOTA_EXPECTED_TRY_V`
    /// (presumably propagated to the caller — confirm against the macro).
    static auto deserialize(const D& d,
                            typename D::TableView view,
                            typename D::slot_id sid,
                            clice::index::PathPool& out)
        -> std::expected<void, typename D::error_type> {
        if(!view.has(sid)) {
            return {};
        }

        KOTA_EXPECTED_TRY_V(auto vec, d.get_string_vector(view, sid));

        out.paths.resize(vec.size());
        for(std::size_t i = 0; i < vec.size(); ++i) {
            auto sv = vec[i];

            // Persisted paths may use native separators; normalize to '/'.
            llvm::SmallString<256> normalized(llvm::StringRef(sv.data(), sv.size()));
            for(char& ch: normalized) {
                if(ch == '\\') {
                    ch = '/';
                }
            }

            auto interned = out.save(normalized.str());
            out.paths[i] = interned;
            out.cache.try_emplace(interned, static_cast<std::uint32_t>(i));
        }
        return {};
    }
};
} // namespace kota::codec

View File

@@ -1,173 +0,0 @@
namespace clice.index.binary;
struct Range {
begin : uint;
end : uint;
}
struct Occurrence {
range : Range;
target : ulong;
}
struct Relation {
kind : uint;
padding : uint;
range : Range;
target_symbol : ulong;
}
table CacheEntry {
sha256:
string;
canonical_id:
uint;
}
struct IncludeContext {
include_id : uint;
canonical_id : uint;
}
table HeaderContextEntry {
path_id:
uint;
version:
uint;
includes:
[IncludeContext];
}
struct IncludeLocation {
path_id : uint;
line : uint;
include_id : uint;
}
table CompilationContextEntry {
path_id:
uint;
version:
uint;
canonical_id:
uint;
build_at:
ulong;
include_locations:
[IncludeLocation];
}
table OccurrenceEntry {
occurrence:
Occurrence;
context:
[ubyte];
}
table RelationEntry {
relation:
Relation;
context:
[ubyte];
}
table SymbolRelationsEntry {
symbol:
ulong;
relations:
[RelationEntry];
}
table Symbol {
name:
string;
kind:
ubyte;
refs:
[ubyte];
}
table SymbolEntry {
symbol_id:
ulong;
symbol:
Symbol;
}
table MergedIndex {
max_canonical_id:
uint;
canonical_cache:
[CacheEntry];
header_contexts:
[HeaderContextEntry];
compilation_contexts:
[CompilationContextEntry];
occurrences:
[OccurrenceEntry];
relations:
[SymbolRelationsEntry];
removed:
[ubyte];
content:
string;
}
table TUFileRelationsEntry {
symbol:
ulong;
relations:
[Relation];
}
table TUFileIndexEntry {
file_id:
uint;
occurrences:
[Occurrence];
relations:
[TUFileRelationsEntry];
}
table TUIndex {
built_at:
ulong;
paths:
[string];
locations:
[IncludeLocation];
symbols:
[SymbolEntry];
file_indices:
[TUFileIndexEntry];
main_file_index:
TUFileIndexEntry;
}
table PathEntry {
path:
string;
id:
uint;
}
struct PathMapEntry {
source : uint;
index : uint;
}
table ProjectIndex {
paths:
[PathEntry];
indices:
[PathMapEntry];
symbols:
[SymbolEntry];
}

View File

@@ -1,79 +0,0 @@
#include <cstdint>
#include <ranges>
#include <type_traits>
#include "schema_generated.h"
#include "support/bitmap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
namespace clice::index {
namespace fbs = flatbuffers;
namespace {
/// Matches sequence-like ranges: an input range that has no nested
/// `key_type` and whose const reference supports `data()` and `size()`.
/// Used to constrain `CreateVector` and `CreateStructVector`.
template <typename Range>
concept sequence_range = std::ranges::input_range<Range> &&
!requires { typename Range::key_type; } && requires(const Range& r) {
r.data();
r.size();
};
/// An `llvm::SmallVector` (inline capacity 0) of FlatBuffers offsets to `T`.
template <typename T>
using Offsets = llvm::SmallVector<fbs::Offset<T>, 0>;
/// Reinterprets a pointer to `V` as a pointer to `U`.
///
/// The cast is only accepted at compile time when both types have the same
/// size and alignment and are trivially copyable. The returned pointer
/// refers to the same address as `v`; no object is copied.
template <typename U, typename V>
const U* safe_cast(const V* v) {
    constexpr bool same_size = sizeof(U) == sizeof(V);
    constexpr bool same_alignment = alignof(U) == alignof(V);
    constexpr bool both_trivial =
        std::is_trivially_copyable_v<U> && std::is_trivially_copyable_v<V>;

    static_assert(same_size, "size mismatch");
    static_assert(same_alignment, "alignment mismatch");
    static_assert(both_trivial, "requires trivially copyable");

    /// If aliasing issues arise, prefer copying into a temporary SmallVector<U>.
    const void* erased = v;
    return static_cast<const U*>(erased);
}
/// Stores `string` in `builder` via `FlatBufferBuilder::CreateString`, passing
/// the pointer and length explicitly, and returns whatever that call returns.
auto CreateString(fbs::FlatBufferBuilder& builder, llvm::StringRef string) {
    const char* text = string.data();
    const auto length = string.size();
    return builder.CreateString(text, length);
}
/// Stores the elements of a contiguous `range` in `builder` via
/// `FlatBufferBuilder::CreateVector(data, size)` and returns its result.
template <sequence_range Range>
auto CreateVector(fbs::FlatBufferBuilder& builder, const Range& range) {
    const auto* first = range.data();
    const auto count = range.size();
    return builder.CreateVector(first, count);
}
/// Overload for the `char` scratch buffer: the characters are passed to
/// `FlatBufferBuilder::CreateVector` as `std::uint8_t` values.
auto CreateVector(fbs::FlatBufferBuilder& builder, const llvm::SmallVector<char, 1024>& range) {
    const auto* bytes = reinterpret_cast<const std::uint8_t*>(range.data());
    const auto count = range.size();
    return builder.CreateVector(bytes, count);
}
/// Stores a contiguous `range` in `builder` as a vector of structs of type
/// `U`. The range's elements are reinterpreted as `U` through `safe_cast`,
/// which checks size, alignment and trivial copyability at compile time.
template <typename U, sequence_range Range>
auto CreateStructVector(fbs::FlatBufferBuilder& builder, const Range& range) {
    using Element = std::ranges::range_value_t<Range>;
    // Evaluating sizeof requires the element type to be complete here.
    static_cast<void>(sizeof(Element));

    const U* structs = safe_cast<U>(range.data());
    return builder.CreateVectorOfStructs(structs, range.size());
}
template <typename Range, typename Functor>
auto transform(const Range& range, const Functor& functor) {
using V = std::ranges::range_value_t<Range>;
using R = std::invoke_result_t<Functor, V>;
llvm::SmallVector<R, 0> result;
result.resize_for_overwrite(std::ranges::size(range));
auto i = 0;
for(auto&& v: range) {
result[i] = functor(v);
i += 1;
}
return result;
}
/// Rebuilds a `Bitmap` from a FlatBuffers byte vector using
/// `Bitmap::read(..., false)`.
///
/// `buffer` is dereferenced unconditionally, so it must not be null.
Bitmap read_bitmap(const fbs::Vector<uint8_t>* buffer) {
    const auto* raw = reinterpret_cast<const char*>(buffer->data());
    return Bitmap::read(raw, false);
}
} // namespace
} // namespace clice::index

View File

@@ -1,17 +1,24 @@
#include "index/tu_index.h"
#include <cassert>
#include <cstdint>
#include <span>
#include <tuple>
#include "index/serialization.h"
#include "index/kotatsu_adapters.h" // type_adapter specializations
#include "semantic/ast_utility.h"
#include "semantic/semantic_visitor.h"
#include "kota/codec/flatbuffers/deserializer.h"
#include "kota/codec/flatbuffers/serializer.h"
#include "llvm/Support/SHA256.h"
namespace clice::index {
namespace {
namespace kfb = kota::codec::flatbuffers;
class Builder : public SemanticVisitor<Builder> {
public:
Builder(TUIndex& result, CompilationUnitRef unit, bool interested_only) :
@@ -114,6 +121,8 @@ public:
void build() {
run();
auto interested = unit.interested_file();
for(auto& [fid, index]: result.file_indices) {
for(auto& [symbol_id, relations]: index.relations) {
std::ranges::sort(relations, [](const Relation& lhs, const Relation& rhs) {
@@ -144,13 +153,19 @@ public:
return lhs.range == rhs.range && lhs.target == rhs.target;
});
index.occurrences.erase(range.begin(), range.end());
if(fid == unit.interested_file()) {
result.main_file_index = std::move(index);
}
}
result.file_indices.erase(unit.interested_file());
// Populate main_file_index (interested file) and path_file_indices
// (keyed by path_id) for serialization. `file_indices` itself is
// `skip`-marked (runtime-only, keyed by clang::FileID) and retained
// for in-memory consumers/tests that need FileID access.
for(auto& [fid, index]: result.file_indices) {
if(fid == interested) {
result.main_file_index = index;
} else {
result.path_file_indices[result.graph.path_id(fid)] = index;
}
}
}
private:
@@ -198,119 +213,23 @@ TUIndex TUIndex::build(CompilationUnitRef unit, bool interested_only) {
return index;
}
void TUIndex::serialize(llvm::raw_ostream& os) const {
fbs::FlatBufferBuilder builder(4096);
llvm::SmallVector<char, 1024> buffer;
auto paths =
transform(graph.paths, [&](const std::string& p) { return builder.CreateString(p); });
auto syms = transform(symbols, [&](auto&& value) {
auto& [symbol_id, symbol] = value;
buffer.clear();
buffer.resize_for_overwrite(symbol.reference_files.getSizeInBytes(false));
symbol.reference_files.write(buffer.data(), false);
return binary::CreateSymbolEntry(builder,
symbol_id,
binary::CreateSymbol(builder,
CreateString(builder, symbol.name),
symbol.kind.value(),
CreateVector(builder, buffer)));
});
/// Serialize a single FileIndex into a TUFileIndexEntry.
auto serialize_file_index = [&](std::uint32_t fid, const FileIndex& index) {
auto occs = CreateStructVector<binary::Occurrence>(builder, index.occurrences);
auto rels = transform(index.relations, [&](auto&& value) {
auto& [symbol_id, relations] = value;
return binary::CreateTUFileRelationsEntry(
builder,
symbol_id,
CreateStructVector<binary::Relation>(builder, relations));
});
return binary::CreateTUFileIndexEntry(builder, fid, occs, CreateVector(builder, rels));
};
/// Convert FileID-keyed file_indices to path_id-keyed entries.
llvm::SmallVector<fbs::Offset<binary::TUFileIndexEntry>> file_idx_vec;
for(auto& [fid, index]: file_indices) {
auto pid = graph.path_id(fid);
file_idx_vec.push_back(serialize_file_index(pid, index));
}
/// Main file is the last path in graph.paths (convention from IncludeGraph).
auto main_idx =
serialize_file_index(static_cast<std::uint32_t>(graph.paths.size() - 1), main_file_index);
auto tu_index =
binary::CreateTUIndex(builder,
static_cast<std::uint64_t>(built_at.count()),
CreateVector(builder, paths),
CreateStructVector<binary::IncludeLocation>(builder, graph.locations),
CreateVector(builder, syms),
builder.CreateVector(file_idx_vec.data(), file_idx_vec.size()),
main_idx);
builder.Finish(tu_index);
os.write(safe_cast<const char>(builder.GetBufferPointer()), builder.GetSize());
/// Encodes this index with `kfb::to_flatbuffer` and writes the resulting bytes
/// to `os`. Nothing is written when encoding fails.
void TUIndex::serialize(llvm::raw_ostream& os) {
    auto bytes = kfb::to_flatbuffer(*this);
    // Surface the failure loudly in debug builds...
    assert(bytes && "TUIndex flatbuffer serialization failed");
    // ...but `assert` is compiled out under NDEBUG, so guard the dereference
    // explicitly rather than reading from a failed result.
    if(!bytes) {
        return;
    }
    os.write(reinterpret_cast<const char*>(bytes->data()), bytes->size());
}
TUIndex TUIndex::from(const void* data) {
auto root = fbs::GetRoot<binary::TUIndex>(data);
TUIndex TUIndex::from(const void* data, std::size_t size) {
TUIndex index;
index.built_at = std::chrono::milliseconds(root->built_at());
for(auto p: *root->paths()) {
index.graph.paths.emplace_back(p->str());
if(data == nullptr || size == 0) {
return index;
}
for(auto loc: *root->locations()) {
index.graph.locations.emplace_back(*safe_cast<IncludeLocation>(loc));
std::span<const std::uint8_t> bytes(static_cast<const std::uint8_t*>(data), size);
auto result = kfb::from_flatbuffer(bytes, index);
if(!result) {
return TUIndex();
}
for(auto entry: *root->symbols()) {
auto& symbol = index.symbols[entry->symbol_id()];
symbol.name = entry->symbol()->name()->str();
symbol.kind = SymbolKind(static_cast<std::uint8_t>(entry->symbol()->kind()));
symbol.reference_files = read_bitmap(entry->symbol()->refs());
}
/// Helper to deserialize a TUFileIndexEntry into a FileIndex.
auto deserialize_file_index = [](const binary::TUFileIndexEntry* entry) -> FileIndex {
FileIndex fi;
if(entry->occurrences()) {
fi.occurrences.reserve(entry->occurrences()->size());
for(auto o: *entry->occurrences()) {
fi.occurrences.emplace_back(*safe_cast<Occurrence>(o));
}
}
if(entry->relations()) {
for(auto rel_entry: *entry->relations()) {
auto& rels = fi.relations[rel_entry->symbol()];
if(rel_entry->relations()) {
rels.reserve(rel_entry->relations()->size());
for(auto r: *rel_entry->relations()) {
rels.emplace_back(*safe_cast<Relation>(r));
}
}
}
}
return fi;
};
/// Populate path_file_indices keyed by path_id (no clang::FileID needed).
if(root->file_indices()) {
for(auto entry: *root->file_indices()) {
index.path_file_indices[entry->file_id()] = deserialize_file_index(entry);
}
}
if(root->main_file_index()) {
index.main_file_index = deserialize_file_index(root->main_file_index());
}
return index;
}

View File

@@ -12,6 +12,7 @@
#include "semantic/symbol_kind.h"
#include "support/bitmap.h"
#include "kota/meta/annotation.h"
#include "llvm/Support/raw_ostream.h"
namespace clice::index {
@@ -35,6 +36,10 @@ struct Relation {
/// Reinterprets the bits of `target_symbol` as a `LocalSourceRange`.
constexpr auto definition_range() {
    const auto range = std::bit_cast<LocalSourceRange>(target_symbol);
    return range;
}
friend bool operator==(const Relation&, const Relation&) = default;
friend auto operator<=>(const Relation&, const Relation&) = default;
};
struct Occurrence {
@@ -45,6 +50,8 @@ struct Occurrence {
SymbolHash target;
friend bool operator==(const Occurrence&, const Occurrence&) = default;
friend auto operator<=>(const Occurrence&, const Occurrence&) = default;
};
struct FileIndex {
@@ -77,19 +84,21 @@ struct TUIndex {
SymbolTable symbols;
llvm::DenseMap<clang::FileID, FileIndex> file_indices;
/// Runtime-only: keyed by AST-scoped `clang::FileID` during build; flushed
/// into `path_file_indices` (keyed by path id) before serialization.
kota::meta::skip<llvm::DenseMap<clang::FileID, FileIndex>> file_indices;
/// File indices keyed by path_id, populated by from() for deserialized data.
/// When built from AST, this is empty and file_indices (keyed by FileID) is used.
/// File indices keyed by path_id. Populated from `file_indices` at the end of
/// `Builder::build()`, and directly from the wire on deserialize.
llvm::DenseMap<std::uint32_t, FileIndex> path_file_indices;
FileIndex main_file_index;
static TUIndex build(CompilationUnitRef unit, bool interested_only = false);
void serialize(llvm::raw_ostream& os) const;
void serialize(llvm::raw_ostream& os);
static TUIndex from(const void* data);
static TUIndex from(const void* data, std::size_t size);
};
} // namespace clice::index

View File

@@ -71,6 +71,10 @@ constexpr bool operator==(RelationKind lhs, RelationKind rhs) {
return lhs.value() == rhs.value();
}
/// Orders two relation kinds by their underlying `value()`.
constexpr auto operator<=>(RelationKind lhs, RelationKind rhs) {
    const auto left = lhs.value();
    const auto right = rhs.value();
    return left <=> right;
}
/// Reports whether both relation kinds carry the same underlying `value()`.
/// NOTE(review): despite the `&` spelling this is an equality comparison, not a
/// bit test — confirm that this is the intended semantics.
constexpr bool operator&(RelationKind lhs, RelationKind rhs) {
    const bool same = lhs.value() == rhs.value();
    return same;
}

View File

@@ -763,7 +763,8 @@ kota::task<bool> Compiler::ensure_compiled(Session& session) {
// Store open file index from the stateful worker's TUIndex.
if(!result.value().tu_index_data.empty()) {
auto tu_index = index::TUIndex::from(result.value().tu_index_data.data());
auto tu_index = index::TUIndex::from(result.value().tu_index_data.data(),
result.value().tu_index_data.size());
OpenFileIndex ofi;
ofi.file_index = std::move(tu_index.main_file_index);
ofi.symbols = std::move(tu_index.symbols);

View File

@@ -25,7 +25,7 @@ namespace clice {
namespace lsp = kota::ipc::lsp;
void Indexer::merge(const void* tu_index_data, std::size_t size) {
auto tu_index = index::TUIndex::from(tu_index_data);
auto tu_index = index::TUIndex::from(tu_index_data, size);
if(tu_index.graph.paths.empty()) {
LOG_WARN("Ignoring TUIndex with empty path graph");
return;
@@ -144,7 +144,8 @@ void Indexer::load(llvm::StringRef index_dir) {
auto project_path = path::join(index_dir, "project.idx");
auto buf = llvm::MemoryBuffer::getFile(project_path);
if(buf) {
workspace.project_index = index::ProjectIndex::from((*buf)->getBufferStart());
workspace.project_index =
index::ProjectIndex::from((*buf)->getBufferStart(), (*buf)->getBufferSize());
LOG_INFO("Loaded ProjectIndex: {} symbols", workspace.project_index.symbols.size());
}

View File

@@ -46,6 +46,8 @@ struct LocalSourceRange {
constexpr bool operator==(const LocalSourceRange& other) const = default;
constexpr auto operator<=>(const LocalSourceRange& other) const = default;
/// Distance between the two stored offsets, computed as `end - begin`.
constexpr std::uint32_t length() const {
    const auto extent = end - begin;
    return extent;
}

View File

@@ -128,7 +128,7 @@ TEST_CASE(SerializationRoundTrip) {
project.serialize(os);
// Deserialize.
auto restored = index::ProjectIndex::from(buf.data());
auto restored = index::ProjectIndex::from(buf.data(), buf.size());
// Path pools should match.
ASSERT_EQ(project.path_pool.paths.size(), restored.path_pool.paths.size());
@@ -190,7 +190,7 @@ TEST_CASE(NameSurvivesRoundTrip) {
llvm::SmallString<4096> buf;
llvm::raw_svector_ostream os(buf);
project.serialize(os);
auto restored = index::ProjectIndex::from(buf.data());
auto restored = index::ProjectIndex::from(buf.data(), buf.size());
// Verify names survive round-trip.
for(auto& [hash, symbol]: project.symbols) {