refactor(document links): use Lexer for unified directive argument scanning

Replace hand-written character scanning with the project's Lexer class to find filename arguments in preprocessor directives. Extend the Lexer to activate header_name mode for #embed and expose set_header_name_mode() for __has_include/__has_embed contexts. Remove unused Include::filename_range field which had a latent assert crash on macro-expanded includes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Merge branch 'main' of https://github.com/clice-io/clice into feat/document-links-pch-embed
2026-04-09 21:02:22 +08:00 · 2026-04-09 19:44:34 +08:00 · 2026-04-09 17:12:14 +08:00 · 2026-04-09 17:08:27 +08:00 · 2026-04-09 17:06:13 +08:00 · 2026-04-09 17:01:00 +08:00
18 changed files with 329 additions and 53 deletions
--- a/src/compile/directive.cpp
+++ b/src/compile/directive.cpp
@@ -94,7 +94,7 @@ public:
                            const clang::Token& include_tok,
                            llvm::StringRef,
                            bool,
-                            clang::CharSourceRange filename_range,
+                            clang::CharSourceRange,
                            clang::OptionalFileEntryRef,
                            llvm::StringRef,
                            llvm::StringRef,
@@ -108,7 +108,6 @@ public:
        unit->directives[prev_fid].includes.emplace_back(Include{
            .fid = {},
            .location = include_tok.getLocation(),
-            .filename_range = filename_range.getAsRange(),
        });
    }

--- a/src/compile/directive.h
+++ b/src/compile/directive.h
@@ -20,11 +20,8 @@ struct Include {
    /// The file id of included file.
    clang::FileID fid;

-    /// Location of the `include`.
+    /// Location of the `include` keyword.
    clang::SourceLocation location;
-
-    /// The range of filename(includes `""` or `<>`).
-    clang::SourceRange filename_range;
 };

 /// Information about `__has_include` directive.
--- a/src/feature/document_links.cpp
+++ b/src/feature/document_links.cpp
@@ -1,13 +1,20 @@
-#include <algorithm>
 #include <cstdint>
 #include <string>
 #include <vector>

 #include "feature/feature.h"
+#include "syntax/lexer.h"

 namespace clice::feature {

-namespace {}  // namespace
+namespace {
+
+bool is_directive_keyword(llvm::StringRef word) {
+    return word == "include" || word == "include_next" || word == "import" || word == "embed" ||
+           word == "__has_include" || word == "__has_include_next" || word == "__has_embed";
+}
+
+}  // namespace

 auto document_links(CompilationUnitRef unit, PositionEncoding encoding)
    -> std::vector<protocol::DocumentLink> {
@@ -23,49 +30,92 @@ auto document_links(CompilationUnitRef unit, PositionEncoding encoding)
    PositionMapper converter(content, encoding);
    auto& directives = directives_it->second;

-    links.reserve(directives.includes.size() + directives.has_includes.size());
-
-    for(const auto& include: directives.includes) {
-        auto [fid, range] = unit.decompose_range(include.filename_range);
-        if(fid != interested || !range.valid()) {
-            continue;
+    // Find the filename argument of a preprocessor directive, searching from `offset`.
+    // The Lexer is created at the start of that line so the leading `#` is seen, which
+    // enables header_name mode for #include, #include_next and #embed automatically.
+    // For __has_include/__has_embed, header_name mode is enabled manually after the `(`.
+    auto find_argument_range = [&](std::uint32_t offset) -> std::optional<LocalSourceRange> {
+        std::uint32_t line_start = 0;
+        if(offset > 0) {
+            if(auto nl = content.rfind('\n', offset - 1); nl != llvm::StringRef::npos)
+                line_start = static_cast<std::uint32_t>(nl + 1);
        }

-        protocol::DocumentLink link{
-            .range = to_range(converter, range),
-        };
-        link.target = std::string(unit.file_path(include.fid));
+        auto line = content.substr(line_start);
+        Lexer lexer(line);
+        bool after_has_keyword = false;
+
+        while(true) {
+            auto tok = lexer.advance();
+            if(tok.is_eof() || tok.is_eod())
+                break;
+
+            auto abs_begin = line_start + tok.range.begin;
+            auto abs_end = line_start + tok.range.end;
+
+            // Detect __has_include/__has_embed to enable header_name mode after (.
+            if(tok.is_identifier()) {
+                auto text = tok.text(line);
+                if(text == "__has_include" || text == "__has_include_next" ||
+                   text == "__has_embed") {
+                    after_has_keyword = true;
+                    continue;
+                }
+            }
+
+            if(tok.kind == clang::tok::l_paren && after_has_keyword) {
+                after_has_keyword = false;
+                lexer.set_header_name_mode();
+                continue;
+            }
+
+            // Only return tokens at or after the directive's starting offset.
+            if(abs_begin < offset)
+                continue;
+
+            if(tok.is_header_name() || tok.kind == clang::tok::string_literal)
+                return LocalSourceRange(abs_begin, abs_end);
+
+            if(tok.is_identifier() && !is_directive_keyword(tok.text(line)))
+                return LocalSourceRange(abs_begin, abs_end);
+        }
+        return std::nullopt;
+    };
+
+    auto add_link = [&](clang::SourceLocation loc, llvm::StringRef target) {
+        auto [fid, offset] = unit.decompose_location(loc);
+        if(fid != interested || offset >= content.size())
+            return;
+        auto range = find_argument_range(offset);
+        if(!range)
+            return;
+        protocol::DocumentLink link{.range = to_range(converter, *range)};
+        link.target = target.str();
        links.push_back(std::move(link));
+    };
+
+    for(const auto& include: directives.includes) {
+        if(include.fid.isValid()) {
+            add_link(include.location, unit.file_path(include.fid));
+        }
    }

    for(const auto& has_include: directives.has_includes) {
-        if(has_include.fid.isInvalid()) {
-            continue;
+        if(has_include.fid.isValid()) {
+            add_link(has_include.location, unit.file_path(has_include.fid));
        }
+    }

-        auto [fid, offset] = unit.decompose_location(has_include.location);
-        if(fid != interested || offset >= content.size()) {
-            continue;
+    for(const auto& embed: directives.embeds) {
+        if(embed.file) {
+            add_link(embed.loc, embed.file->getName());
        }
+    }

-        auto tail = content.substr(offset);
-        char open = tail.front();
-        if(open != '<' && open != '"') {
-            continue;
+    for(const auto& has_embed: directives.has_embeds) {
+        if(has_embed.file) {
+            add_link(has_embed.loc, has_embed.file->getName());
        }
-
-        char close = open == '<' ? '>' : '"';
-        auto close_index = tail.find(close, 1);
-        if(close_index == llvm::StringRef::npos) {
-            continue;
-        }
-
-        LocalSourceRange range(offset, offset + static_cast<std::uint32_t>(close_index + 1));
-        protocol::DocumentLink link{
-            .range = to_range(converter, range),
-        };
-        link.target = std::string(unit.file_path(has_include.fid));
-        links.push_back(std::move(link));
    }

    return links;
--- a/src/server/compiler.cpp
+++ b/src/server/compiler.cpp
@@ -502,6 +502,7 @@ et::task<bool> Compiler::ensure_pch(Session& session,
    st.bound = bound;
    st.hash = preamble_hash;
    st.deps = capture_deps_snapshot(workspace.path_pool, result.value().deps);
+    st.document_links_json = std::move(result.value().pch_links_json);
    st.building.reset();

    session.pch_ref = Session::PCHRef{path_id, preamble_hash, bound};
--- a/src/server/master_server.cpp
+++ b/src/server/master_server.cpp
@@ -478,15 +478,38 @@ void MasterServer::register_handlers() {
        co_return co_await compiler.forward_query(worker::QueryKind::DocumentSymbol, sit->second);
    });

-    peer.on_request(
-        [this](RequestContext& ctx, const protocol::DocumentLinkParams& params) -> RawResult {
-            auto path = uri_to_path(params.text_document.uri);
-            auto path_id = workspace.path_pool.intern(path);
-            auto sit = sessions.find(path_id);
-            if(sit == sessions.end())
-                co_return serde_raw{"null"};
-            co_return co_await compiler.forward_query(worker::QueryKind::DocumentLink, sit->second);
-        });
+    peer.on_request([this](RequestContext& ctx,
+                           const protocol::DocumentLinkParams& params) -> RawResult {
+        auto path = uri_to_path(params.text_document.uri);
+        auto path_id = workspace.path_pool.intern(path);
+        auto sit = sessions.find(path_id);
+        if(sit == sessions.end())
+            co_return serde_raw{"null"};
+        auto& session = sit->second;
+        auto result = co_await compiler.forward_query(worker::QueryKind::DocumentLink, session);
+        if(!result.has_value())
+            co_return serde_raw{"null"};
+        // Add the links serialized when the PCH was built (preamble includes), if any.
+        auto& links = result.value();
+        // Re-lookup session after co_await since iterators may be invalidated.
+        auto sit2 = sessions.find(path_id);
+        if(sit2 != sessions.end() && sit2->second.pch_ref) {
+            auto pch_it = workspace.pch_cache.find(sit2->second.pch_ref->path_id);
+            if(pch_it != workspace.pch_cache.end() && !pch_it->second.document_links_json.empty()) {
+                auto& pch_json = pch_it->second.document_links_json;
+                // Splice only a non-empty array: "[a]" + "[]" must not become "[a,]".
+                if(links.data.size() <= 2 || links.data == "null") {
+                    links.data = pch_json;  // worker produced no links of its own
+                } else if(pch_json.size() > 2 && pch_json.front() == '[') {
+                    // "[a,b]" + "[c,d]" -> "[a,b,c,d]"
+                    links.data.pop_back();  // remove trailing ']'
+                    links.data += ',';
+                    links.data.append(pch_json.begin() + 1, pch_json.end());  // skip '['
+                }
+            }
+        }
+        co_return std::move(links);
+    });

    peer.on_request(
        [this](RequestContext& ctx, const protocol::CodeActionParams& params) -> RawResult {
--- a/src/server/protocol.h
+++ b/src/server/protocol.h
@@ -102,6 +102,7 @@ struct BuildResult {
    std::string output_path;  ///< PCH or PCM path
    std::vector<std::string> deps;
    std::string tu_index_data;
+    std::string pch_links_json;             ///< Pre-serialized DocumentLink[] from PCH
    eventide::serde::RawValue result_json;  ///< Completion/SignatureHelp result
 };

--- a/src/server/stateless_worker.cpp
+++ b/src/server/stateless_worker.cpp
@@ -96,8 +96,13 @@ static worker::BuildResult handle_build_pch(const worker::BuildParams& params) {
        errors = collect_errors(unit);

    std::string tu_index_data;
-    if(success)
+    std::string pch_links_json;
+    if(success) {
        tu_index_data = serialize_tu_index(unit);
+        auto links = feature::document_links(unit);
+        auto raw = to_raw(links);
+        pch_links_json = std::move(raw.data);
+    }

    // Destroy CompilationUnit to flush PCH to disk.
    unit = CompilationUnit(nullptr);
@@ -110,6 +115,7 @@ static worker::BuildResult handle_build_pch(const worker::BuildParams& params) {
        result.output_path = std::move(final_path);
        result.deps = pch_info.deps;
        result.tu_index_data = std::move(tu_index_data);
+        result.pch_links_json = std::move(pch_links_json);
        return result;
    } else {
        LOG_WARN("BuildPCH failed: file={}, {}ms, errors=[{}]", params.file, timer.ms(), errors);
--- a/src/server/workspace.h
+++ b/src/server/workspace.h
@@ -140,6 +140,7 @@ struct PCHState {
    std::uint32_t bound = 0;
    std::uint64_t hash = 0;
    DepsSnapshot deps;
+    std::string document_links_json;  ///< Pre-serialized DocumentLink[] from PCH build
    std::shared_ptr<eventide::event> building;
 };

--- a/src/syntax/lexer.cpp
+++ b/src/syntax/lexer.cpp
@@ -53,7 +53,8 @@ void Lexer::lex(Token& token) {
        }
    } else if(parse_pp_keyword) {
        parse_pp_keyword = false;
-        parse_header_name = token.text(content) == "include";
+        auto kw = token.text(content);
+        parse_header_name = kw == "include" || kw == "include_next" || kw == "embed";
    }
 }

--- a/src/syntax/lexer.h
+++ b/src/syntax/lexer.h
@@ -51,6 +51,15 @@ public:

    Token advance_until(TokenKind kind);

+    /// Force the lexer into header-name mode so the next token is lexed
+    /// via LexIncludeFilename (correctly handling both "..." and <...>).
+    /// Use this before lexing filename arguments in contexts like
+    /// __has_include() or __has_embed() where the lexer cannot detect
+    /// the mode automatically.
+    void set_header_name_mode() {
+        parse_header_name = true;
+    }
+
 private:
    bool ignore_end_of_directive = true;
    bool parse_pp_keyword = false;
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -231,6 +231,14 @@ def _generate_test_data_cdbs(data_dir: Path) -> None:
    if ic_main.exists():
        _write(ic_dir, [_entry(ic_dir, ic_main, ["-I."])])

+    # document_links
+    dl_dir = data_dir / "document_links"
+    dl_main = dl_dir / "main.cpp"
+    if dl_main.exists():
+        _write(
+            dl_dir, [_entry(dl_dir, dl_main, [f"-I{dl_dir.as_posix()}", "-std=c++23"])]
+        )
+
    # pch_test
    pt_dir = data_dir / "pch_test"
    if pt_dir.exists():
--- a/tests/data/document_links/data.bin
+++ b/tests/data/document_links/data.bin
@@ -0,0 +1 @@
+0123456789
--- a/tests/data/document_links/header_a.h
+++ b/tests/data/document_links/header_a.h
@@ -0,0 +1,3 @@
+#pragma once
+
+int a = 1;
--- a/tests/data/document_links/header_b.h
+++ b/tests/data/document_links/header_b.h
@@ -0,0 +1,3 @@
+#pragma once
+
+int b = 2;
--- a/tests/data/document_links/header_c.h
+++ b/tests/data/document_links/header_c.h
@@ -0,0 +1,3 @@
+#pragma once
+
+int c = 3;
--- a/tests/data/document_links/main.cpp
+++ b/tests/data/document_links/main.cpp
@@ -0,0 +1,20 @@
+#include "header_a.h"
+#include "header_b.h"
+int x = 1;
+#include "header_c.h"
+
+const char data[] = {
+#embed "data.bin"
+};
+
+#if __has_embed("data.bin")
+int has_embed_found = 1;
+#endif
+
+#if __has_embed("no_such_file.bin")
+int has_embed_not_found = 1;
+#endif
+
+int main() {
+    return a + b + c;
+}
--- a/tests/integration/features/test_document_links.py
+++ b/tests/integration/features/test_document_links.py
@@ -0,0 +1,103 @@
+from pathlib import Path
+
+import pytest
+
+
+@pytest.mark.workspace("document_links")
+async def test_document_links_with_pch(client, workspace):
+    uri, content = await client.open_and_wait(workspace / "main.cpp")
+    links = await client.document_links(uri)
+
+    assert links is not None, "document_links returned None"
+
+    targets = sorted(Path(link.target).name for link in links)
+    assert targets == [
+        "data.bin",
+        "data.bin",
+        "header_a.h",
+        "header_b.h",
+        "header_c.h",
+    ], f"Unexpected targets: {targets}"
+
+    client.close(uri)
+
+
+@pytest.mark.workspace("document_links")
+async def test_document_links_pch_portion(client, workspace):
+    uri, _ = await client.open_and_wait(workspace / "main.cpp")
+    links = await client.document_links(uri)
+
+    pch_links = [link for link in links if link.range.start.line < 2]
+    assert len(pch_links) == 2, (
+        f"Expected 2 PCH links (lines 0-1), got {len(pch_links)}"
+    )
+
+    pch_targets = sorted(Path(link.target).name for link in pch_links)
+    assert pch_targets == ["header_a.h", "header_b.h"]
+
+    client.close(uri)
+
+
+@pytest.mark.workspace("document_links")
+async def test_document_links_main_portion(client, workspace):
+    uri, _ = await client.open_and_wait(workspace / "main.cpp")
+    links = await client.document_links(uri)
+
+    main_links = [link for link in links if link.range.start.line >= 2]
+    assert len(main_links) == 3, (
+        f"Expected 3 main-file links (lines 3, 6, 9), got {len(main_links)}"
+    )
+
+    main_targets = sorted(Path(link.target).name for link in main_links)
+    assert main_targets == ["data.bin", "data.bin", "header_c.h"]
+
+    client.close(uri)
+
+
+@pytest.mark.workspace("document_links")
+async def test_document_links_embed(client, workspace):
+    uri, _ = await client.open_and_wait(workspace / "main.cpp")
+    links = await client.document_links(uri)
+
+    embed_links = [
+        link
+        for link in links
+        if Path(link.target).name == "data.bin" and link.range.start.line == 6
+    ]
+    assert len(embed_links) == 1, (
+        f"Expected 1 embed link at line 6, got {len(embed_links)}"
+    )
+
+    client.close(uri)
+
+
+@pytest.mark.workspace("document_links")
+async def test_document_links_has_embed_exists(client, workspace):
+    uri, _ = await client.open_and_wait(workspace / "main.cpp")
+    links = await client.document_links(uri)
+
+    has_embed_links = [
+        link
+        for link in links
+        if Path(link.target).name == "data.bin" and link.range.start.line == 9
+    ]
+    assert len(has_embed_links) == 1, (
+        f"Expected 1 has_embed link at line 9, got {len(has_embed_links)}"
+    )
+
+    client.close(uri)
+
+
+@pytest.mark.workspace("document_links")
+async def test_document_links_has_embed_missing(client, workspace):
+    uri, _ = await client.open_and_wait(workspace / "main.cpp")
+    links = await client.document_links(uri)
+
+    missing_links = [
+        link for link in links if Path(link.target).name == "no_such_file.bin"
+    ]
+    assert len(missing_links) == 0, (
+        f"Expected 0 links for non-existent file, got {len(missing_links)}"
+    )
+
+    client.close(uri)
--- a/tests/unit/feature/document_link_tests.cpp
+++ b/tests/unit/feature/document_link_tests.cpp
@@ -15,9 +15,9 @@ TEST_SUITE(DocumentLink, Tester) {

 std::vector<protocol::DocumentLink> links;

-void run(llvm::StringRef source) {
+void run(llvm::StringRef source, llvm::StringRef standard = "-std=c++17") {
    add_files("main.cpp", source);
-    ASSERT_TRUE(compile());
+    ASSERT_TRUE(compile(standard));
    links = feature::document_links(*unit, feature::PositionEncoding::UTF8);
 }

@@ -89,6 +89,53 @@ TEST_CASE(HasInclude) {
    EXPECT_LINK(1, "1", TestVFS::path("test.h"));
 }

+TEST_CASE(MacroInclude) {
+    run(R"cpp(
+#[test.h]
+
+#[main.cpp]
+#define HEADER "test.h"
+#include @0[HEADER$]
+)cpp");
+
+    ASSERT_EQ(links.size(), 1U);
+    EXPECT_LINK(0, "0", TestVFS::path("test.h"));
+}
+
+TEST_CASE(Embed) {
+    run(R"cpp(
+#[bytes.bin]
+0123456789
+
+#[main.cpp]
+const char e[] = {
+#embed @0["bytes.bin"$]
+};
+)cpp",
+        "-std=c++23");
+
+    ASSERT_EQ(links.size(), 1U);
+    EXPECT_LINK(0, "0", TestVFS::path("bytes.bin"));
+}
+
+TEST_CASE(HasEmbed) {
+    run(R"cpp(
+#[data.bin]
+ABCDE
+
+#[main.cpp]
+#if __has_embed(@0["data.bin"$])
+#endif
+
+#if __has_embed("non_existent.bin")
+#endif
+)cpp",
+        "-std=c++23");
+
+    ASSERT_EQ(links.size(), 1U);
+    EXPECT_LINK(0, "0", TestVFS::path("data.bin"));
+}
+
 };  // TEST_SUITE(DocumentLink)

 }  // namespace
Author	SHA1	Message	Date
ykiko	8f714c3b4a	refactor(document links): use Lexer for unified directive argument scanning Replace hand-written character scanning with the project's Lexer class to find filename arguments in preprocessor directives. Extend the Lexer to activate header_name mode for #embed and expose set_header_name_mode() for __has_include/__has_embed contexts. Remove unused Include::filename_range field which had a latent assert crash on macro-expanded includes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 21:02:22 +08:00
ykiko	ccc805d0c3	Merge branch 'main' of https://github.com/clice-io/clice into feat/document-links-pch-embed	2026-04-09 19:44:34 +08:00
ykiko	d48236de9c	refactor: unify include handling with add_link_by_location Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 17:12:14 +08:00
ykiko	b691ed1d06	refactor: extract add_link_by_location to deduplicate has_include/embed/has_embed Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 17:08:27 +08:00
ykiko	02e4f74347	style: remove verbose comments from integration tests Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 17:06:13 +08:00
ykiko	8af2704723	refactor: reuse find_filename_range for has_include scanning Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 17:01:00 +08:00
ykiko	4d8c335c0d	fix: re-lookup session after co_await to avoid invalidated iterator The sessions DenseMap iterator may be invalidated during co_await (other coroutines can modify the map). Re-lookup by path_id after the await completes. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 16:56:08 +08:00
ykiko	4926b4ac32	test(document links): add __has_embed integration tests Cover both existing-file (produces link) and missing-file (no link) cases for __has_embed directives. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 16:05:54 +08:00
ykiko	13527b7084	feat(feature): preserve PCH document links and add #embed/#has_embed support PCH compilation now serializes document links and stores them in PCHState. The master server merges PCH links with main-file links on DocumentLink requests, fixing missing links for includes inside the preamble. Also adds document link support for #embed and __has_embed directives. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 15:56:19 +08:00