#include "Server/LSPConverter.h" #include "Support/FileSystem.h" namespace clice { namespace { /// @brief Iterates over Unicode codepoints in a UTF-8 encoded string and invokes a callback for /// each codepoint. /// /// Processes the input UTF-8 string, calculating the length of each Unicode codepoint in both /// UTF-8 (bytes) and UTF-16 (code units), and passes these lengths to the callback. /// Iteration stops early if the callback returns `false`. /// /// ASCII characters are treated as 1-byte UTF-8 codepoints with a UTF-16 length of 1. /// Non-ASCII characters are processed based on their leading byte to determine UTF-8 length: /// - Valid lengths are 2 to 4 bytes. /// - Astral codepoints (UTF-8 length of 4) have a UTF-16 length of 2 code units. /// Invalid UTF-8 sequences are treated as single-byte ASCII characters. /// /// Returns `false` if the callback stops the iteration. template bool iterateCodepoints(llvm::StringRef content, const Callback& callback) { // Iterate over the input string, processing each codepoint. for(size_t index = 0; index < content.size();) { unsigned char c = static_cast(content[index]); // Handle ASCII characters (1-byte UTF-8, 1-code-unit UTF-16). if(!(c & 0x80)) [[likely]] { if(!callback(1, 1)) { return true; } ++index; continue; } // Determine the length of the codepoint in UTF-8 by counting the leading 1s. size_t length = llvm::countl_one(c); // Validate UTF-8 encoding: length must be between 2 and 4. if(length < 2 || length > 4) [[unlikely]] { assert(false && "Invalid UTF-8 sequence"); // Treat the byte as an ASCII character. if(!callback(1, 1)) { return true; } ++index; continue; } // Advance the index by the length of the current UTF-8 codepoint. index += length; // Calculate the UTF-16 length: astral codepoints (4-byte UTF-8) take 2 code units. if(!callback(length, length == 4 ? 2 : 1)) { return true; } } return false; } /// Convert a proto::Position to a file offset in the content with the specified encoding kind. std::uint32_t toOffset(llvm::StringRef content, PositionEncodingKind kind, proto::Position position) { std::uint32_t offset = 0; for(auto i = 0; i < position.line; i++) { auto pos = content.find('\n'); assert(pos != llvm::StringRef::npos && "Line value is out of range"); offset += pos + 1; content = content.substr(pos + 1); } /// Drop the content after the line. content = content.take_until([](char c) { return c == '\n'; }); assert(position.character <= content.size() && "Character value is out of range"); if(kind == PositionEncodingKind::UTF8) { offset += position.character; return offset; } if(kind == PositionEncodingKind::UTF16) { iterateCodepoints(content, [&](std::uint32_t utf8Length, std::uint32_t utf16Length) { assert(position.character >= utf16Length && "Character value is out of range"); position.character -= utf16Length; offset += utf8Length; return position.character != 0; }); return offset; } if(kind == PositionEncodingKind::UTF32) { iterateCodepoints(content, [&](std::uint32_t utf8Length, std::uint32_t) { assert(position.character >= 1 && "Character value is out of range"); position.character -= 1; offset += utf8Length; return position.character != 0; }); return offset; } std::unreachable(); } /// Remeasure the length (character count) of the content with the specified encoding kind. std::uint32_t remeasure(llvm::StringRef content, PositionEncodingKind kind) { if(kind == PositionEncodingKind::UTF8) { return content.size(); } if(kind == PositionEncodingKind::UTF16) { std::uint32_t length = 0; iterateCodepoints(content, [&](std::uint32_t, std::uint32_t utf16Length) { length += utf16Length; return true; }); return length; } if(kind == PositionEncodingKind::UTF32) { std::uint32_t length = 0; iterateCodepoints(content, [&](std::uint32_t, std::uint32_t) { length += 1; return true; }); return length; } std::unreachable(); } class PositionConverter { public: PositionConverter(llvm::StringRef content, PositionEncodingKind encoding) : content(content), encoding(encoding) {} /// Convert a offset to a proto::Position with given encoding. /// The input offset must be UTF-8 encoded and in order. proto::Position toPosition(uint32_t offset) { assert(offset <= content.size() && "Offset is out of range"); assert(offset >= lastInput && "Offset must be in order"); /// Fast path: return the last output. if(offset == lastInput) [[unlikely]] { return lastOutput; } /// The length of the current line. std::uint32_t lineLength = 0; /// Move the line offset to the current line. for(std::uint32_t i = lastLineOffset; i < offset; i++) { lineLength += 1; if(content[i] == '\n') { line += 1; lastLineOffset += lineLength; lineLength = 0; } } /// Get the content of the current line. auto lineContent = content.substr(lastLineOffset, lineLength); auto position = proto::Position{ .line = line, .character = remeasure(lineContent, encoding), }; /// Cache the result. lastInput = offset; lastOutput = position; return position; } template void toPositions(Range&& range, Proj&& proj) { std::vector offsets; for(auto&& item: range) { auto [begin, end] = proj(item); offsets.emplace_back(begin); offsets.emplace_back(end); } ranges::sort(offsets); for(auto&& offset: offsets) { if(auto it = cache.find(offset); it == cache.end()) { cache.try_emplace(offset, toPosition(offset)); } } } proto::Position lookup(uint32_t offset) { auto it = cache.find(offset); assert(it != cache.end() && "Offset is not cached"); return it->second; } proto::Range lookup(LocalSourceRange range) { auto it = cache.find(range.begin); assert(it != cache.end() && "Offset is not cached"); auto begin = it->second; it = cache.find(range.end); assert(it != cache.end() && "Offset is not cached"); auto end = it->second; return proto::Range{begin, end}; } private: std::uint32_t line = 0; /// The offset of the last line end. std::uint32_t lastLineOffset = 0; /// The input offset of last call. std::uint32_t lastInput = 0; proto::Position lastOutput = {0, 0}; llvm::DenseMap cache; llvm::StringRef content; PositionEncodingKind encoding; }; } // namespace std::uint32_t LSPConverter::convert(llvm::StringRef content, proto::Position position) { return toOffset(content, encoding(), position); } proto::Position LSPConverter::convert(llvm::StringRef content, std::uint32_t offset) { PositionConverter converter(content, encoding()); return converter.toPosition(offset); } std::string LSPConverter::convert(llvm::StringRef URI) { return fs::toPath(URI); } json::Value LSPConverter::convert(llvm::StringRef content, const feature::Hover& hover) { return json::Value(nullptr); } json::Value LSPConverter::convert(llvm::StringRef content, const feature::InlayHints& hints) { return json::Value(nullptr); } json::Value LSPConverter::convert(llvm::StringRef content, const feature::FoldingRanges& foldings) { PositionConverter converter(content, encoding()); converter.toPositions(foldings, [](auto&& folding) { return folding.range; }); json::Array result; for(auto&& folding: foldings) { auto [beginOffset, endOffset] = folding.range; auto [beginLine, beginChar] = converter.lookup(beginOffset); auto [endLine, endChar] = converter.lookup(endOffset); auto object = json::Object{ {"startLine", beginLine}, {"startCharacter", beginChar}, {"endLine", endLine }, {"kind", "region" }, }; result.push_back(std::move(object)); } return result; } json::Value LSPConverter::convert(llvm::StringRef content, const feature::DocumentLinks& links) { PositionConverter converter(content, encoding()); json::Array result; for(auto& link: links) { proto::Range range{ converter.toPosition(link.range.begin), converter.toPosition(link.range.end), }; auto object = json::Object{ /// The range of document link. {"range", json::serialize(range)}, /// Target file URI. {"target", fs::toURI(link.file) }, }; result.emplace_back(std::move(object)); } return result; } json::Value LSPConverter::convert(llvm::StringRef content, const feature::DocumentSymbols& symbols) { PositionConverter converter(content, encoding()); struct DocumentSymbol { std::string name; std::string detail; SymbolKind kind; proto::Range range; proto::Range selectionRange; std::vector children; }; json::Array result; /// TODO: Implementation. return result; } json::Value LSPConverter::convert(llvm::StringRef content, const feature::SemanticTokens& tokens) { std::vector groups; auto addGroup = [&](uint32_t line, uint32_t character, uint32_t length, SymbolKind kind, SymbolModifiers modifiers) { groups.emplace_back(line); groups.emplace_back(character); groups.emplace_back(length); groups.emplace_back(kind.value()); groups.emplace_back(0); }; PositionConverter converter(content, encoding()); std::uint32_t lastLine = 0; std::uint32_t lastChar = 0; for(auto& token: tokens) { auto [beginOffset, endOffset] = token.range; auto [beginLine, beginChar] = converter.toPosition(beginOffset); auto [endLine, endChar] = converter.toPosition(endOffset); if(beginLine == endLine) [[likely]] { std::uint32_t line = beginLine - lastLine; std::uint32_t character = (line == 0 ? beginChar - lastChar : beginChar); std::uint32_t length = endChar - beginChar; addGroup(line, character, length, token.kind, token.modifiers); } else { /// If the token spans multiple lines, split it into multiple tokens. auto subContent = content.substr(beginOffset, endOffset - beginOffset); /// The first line is special. bool isFirst = true; /// The offset of the last line end. std::uint32_t lastLineOffset = 0; /// The length of the current line. std::uint32_t lineLength = 0; for(auto c: subContent) { lineLength += 1; if(c == '\n') { std::uint32_t line; std::uint32_t character; if(isFirst) [[unlikely]] { line = beginLine - lastLine; character = (line == 0 ? beginChar - lastChar : beginChar); isFirst = false; } else { line = 1; character = 0; } std::uint32_t length = remeasure(subContent.substr(lastLineOffset, lineLength), encoding()); addGroup(line, character, length, token.kind, token.modifiers); lastLineOffset += lineLength; lineLength = 0; } } /// Process the last line if it's not empty. if(lineLength > 0) { std::uint32_t length = remeasure(subContent.substr(lastLineOffset), encoding()); addGroup(1, 0, length, token.kind, token.modifiers); } } lastLine = endLine; lastChar = beginChar; } return json::Object{ /// The actual tokens. {"data", json::serialize(groups)}, }; } json::Value LSPConverter::convert(llvm::StringRef content, const std::vector& items) { PositionConverter converter(content, encoding()); converter.toPositions(items, [](auto& item) { return item.edit.range; }); json::Array result; for(auto& item: items) { json::Object object{ {"label", item.label }, {"kind", static_cast(item.kind)}, {"textEdit", json::Object{ {"newText", item.edit.text}, {"range", json::serialize(converter.lookup(item.edit.range))}, } }, }; result.emplace_back(std::move(object)); } return result; } namespace proto { struct InitializeParams { struct ClientInfo { std::string name; std::string version; } clientInfo; struct ClientCapabilities { struct General { std::vector positionEncodings; } general; } capabilities; std::vector workspaceFolders; }; struct InitializeResult { struct ServerInfo { std::string name; std::string version; } serverInfo; struct ServerCapabilities { std::string positionEncoding; TextDocumentSyncKind textDocumentSync = TextDocumentSyncKind::Full; bool declarationProvider = true; bool definitionProvider = true; bool typeDefinitionProvider = true; bool implementationProvider = true; bool callHierarchyProvider = true; bool typeHierarchyProvider = true; bool hoverProvider = true; ResolveProvider inlayHintProvider = {true}; bool foldingRangeProvider = true; ResolveProvider documentLinkProvider = {false}; bool documentSymbolProvider = true; SemanticTokenOptions semanticTokensProvider; /// TODO: CompletionOptions completionProvider; /// signatureHelpProvider /// codeLensProvider /// codeActionProvider /// documentFormattingProvider /// documentRangeFormattingProvider /// renameProvider /// diagnosticProvider } capabilities; }; } // namespace proto json::Value LSPConverter::initialize(json::Value value) { auto params = json::deserialize(value); auto& encodings = params.capabilities.general.positionEncodings; /// Select the first one encoding if any. if(encodings.empty()) { kind = PositionEncodingKind::UTF16; } else if(encodings[0] == "utf-8") { kind = PositionEncodingKind::UTF8; } else if(encodings[0] == "utf-16") { kind = PositionEncodingKind::UTF16; } else if(encodings[0] == "utf-32") { kind = PositionEncodingKind::UTF32; } workspacePath = fs::toPath(params.workspaceFolders[0].uri); proto::InitializeResult result{ .serverInfo = {"clice", "0.0.1"}, .capabilities = { .positionEncoding = encodings.empty() ? "utf-16" : encodings[0], } }; auto& semanticTokensProvider = result.capabilities.semanticTokensProvider; for(auto name: SymbolKind::all()) { std::string type{name}; type[0] = std::tolower(type[0]); semanticTokensProvider.legend.tokenTypes.emplace_back(std::move(type)); } return json::serialize(result); } } // namespace clice