[clang-tidy] Confusable identifiers detection

Detect identifiers that are confusable, using a variant of the Unicode definition

        http://www.unicode.org/reports/tr39/#Confusable_Detection

and have conflicting scopes.

This is a recommit (with portability and feature fixes) of b94db7ed7e

Differential Revision: https://reviews.llvm.org/D112916
This commit is contained in:
serge-sans-paille
2021-10-15 15:20:22 +02:00
parent 3c867898c7
commit c3574ef739
11 changed files with 9958 additions and 0 deletions

View File

@@ -3,8 +3,27 @@ set(LLVM_LINK_COMPONENTS
Support
)
# Confusables.inc is generated at build time by the make-confusable-table
# executable from ConfusableTable/confusables.txt.
#
# Pick which generator binary to run: with LLVM_USE_HOST_TOOLS a native copy is
# requested through build_native_tool (presumably so the generator can run on
# the build machine when cross-compiling -- confirm against LLVM's CMake
# modules); otherwise the in-tree make-confusable-table target is used directly.
if(LLVM_USE_HOST_TOOLS)
build_native_tool(make-confusable-table make_confusable_table)
set(make_confusable_table_target "${make_confusable_table}")
else()
set(make_confusable_table $<TARGET_FILE:make-confusable-table>)
set(make_confusable_table_target make-confusable-table)
endif()
# Defines the make-confusable-table executable itself.
add_subdirectory(ConfusableTable)
# Run the generator: argument 1 is the Unicode data file, argument 2 is the
# header to write. The command re-runs when either the tool or the data changes.
add_custom_command(
OUTPUT Confusables.inc
COMMAND ${make_confusable_table} ${CMAKE_CURRENT_SOURCE_DIR}/ConfusableTable/confusables.txt ${CMAKE_CURRENT_BINARY_DIR}/Confusables.inc
DEPENDS ${make_confusable_table_target} ConfusableTable/confusables.txt)
# Named target that other targets can list under DEPENDS to force generation of
# Confusables.inc before they are built.
add_custom_target(genconfusable DEPENDS Confusables.inc)
add_clang_library(clangTidyMiscModule
DefinitionsInHeadersCheck.cpp
ConfusableIdentifierCheck.cpp
MiscTidyModule.cpp
MisleadingBidirectional.cpp
MisleadingIdentifier.cpp
@@ -28,6 +47,7 @@ add_clang_library(clangTidyMiscModule
DEPENDS
omp_gen
genconfusable
)
clang_target_link_libraries(clangTidyMiscModule

View File

@@ -0,0 +1,126 @@
//===--- ConfusableIdentifierCheck.cpp - clang-tidy -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "ConfusableIdentifierCheck.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/Support/ConvertUTF.h"
namespace {
// Preprocessed version of
// https://www.unicode.org/Public/security/latest/confusables.txt
//
// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
#include "Confusables.inc"
} // namespace
namespace clang {
namespace tidy {
namespace misc {
// Constructs the check; the name and context are forwarded unchanged to the
// ClangTidyCheck base class and no other state is initialised here.
ConfusableIdentifierCheck::ConfusableIdentifierCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
// Defaulted destructor, defined out of line in this translation unit.
ConfusableIdentifierCheck::~ConfusableIdentifierCheck() = default;
// Build a skeleton out of the Original identifier, inspired by the algorithm
// described in http://www.unicode.org/reports/tr39/#def-skeleton
//
// Name is decoded as UTF-8 one code point at a time. A code point that has an
// entry in ConfusableEntries is replaced by that entry's prototype sequence
// (re-encoded as UTF-8); any other code point is copied through unchanged.
// If a UTF-8 or UTF-32 conversion fails, a message is written to errs() and
// the skeleton built so far is returned.
//
// FIXME: TR39 mandates:
//
// For an input string X, define skeleton(X) to be the following transformation
// on the string:
//
// 1. Convert X to NFD format, as described in [UAX15].
// 2. Concatenate the prototypes for each character in X according to the
//    specified data, producing a string of exemplar characters.
// 3. Reapply NFD.
//
// We're skipping 1. and 3. for the sake of simplicity, but this can lead to
// false positives.
std::string ConfusableIdentifierCheck::skeleton(StringRef Name) {
  using namespace llvm;
  std::string Skeleton;
  Skeleton.reserve(1 + Name.size());

  // Decode straight out of the StringRef; convertUTF8Sequence is bounded by
  // End, so no NUL-terminated copy of the identifier is required.
  const char *Curr = Name.begin();
  const char *End = Name.end();

  while (Curr < End) {
    const char *Prev = Curr;
    UTF32 CodePoint;
    ConversionResult Result = convertUTF8Sequence(
        reinterpret_cast<const UTF8 **>(&Curr),
        reinterpret_cast<const UTF8 *>(End), &CodePoint, strictConversion);
    if (Result != conversionOK) {
      errs() << "Unicode conversion issue\n";
      break;
    }

    // ConfusableEntries is sorted by codepoint, so a binary search finds the
    // candidate entry.
    auto Where = std::lower_bound(std::begin(ConfusableEntries),
                                  std::end(ConfusableEntries), CodePoint,
                                  [](decltype(ConfusableEntries[0]) x,
                                     UTF32 y) { return x.codepoint < y; });
    if (Where == std::end(ConfusableEntries) ||
        CodePoint != Where->codepoint) {
      // Not a confusable character: keep its original UTF-8 bytes.
      Skeleton.append(Prev, Curr);
      continue;
    }

    // Size the scratch buffer from the table's row width so that even the
    // longest prototype sequence fits once encoded; a fixed small buffer makes
    // the conversion fail for the longest entries and truncates the skeleton.
    constexpr size_t MaxPrototypeLength =
        sizeof(Where->values) / sizeof(Where->values[0]);
    UTF8 Buffer[MaxPrototypeLength * UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
    UTF8 *BufferStart = std::begin(Buffer);
    UTF8 *IBuffer = BufferStart;

    // The prototype sequence is terminated by a 0 code point.
    const UTF32 *ValuesStart = std::begin(Where->values);
    const UTF32 *ValuesEnd =
        std::find(std::begin(Where->values), std::end(Where->values), '\0');
    if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer, std::end(Buffer),
                           strictConversion) != conversionOK) {
      errs() << "Unicode conversion issue\n";
      break;
    }
    Skeleton.append(reinterpret_cast<const char *>(BufferStart),
                    reinterpret_cast<const char *>(IBuffer));
  }
  return Skeleton;
}
// Called for every declaration bound as "nameddecl". The declaration is filed
// under the skeleton of its identifier; before filing it, it is compared with
// every declaration already recorded under the same skeleton. A pair is
// reported when the two spellings differ and either declaration's context
// lexically contains the other declaration.
void ConfusableIdentifierCheck::check(
    const ast_matchers::MatchFinder::MatchResult &Result) {
  const auto *Current = Result.Nodes.getNodeAs<NamedDecl>("nameddecl");
  if (!Current)
    return;

  // Declarations without a plain identifier have nothing to compare.
  IdentifierInfo *Info = Current->getIdentifier();
  if (!Info)
    return;

  StringRef CurrentName = Info->getName();
  llvm::SmallVector<const NamedDecl *> &Seen = Mapper[skeleton(CurrentName)];
  const DeclContext *CurrentContext = Current->getDeclContext();

  for (const NamedDecl *Earlier : Seen) {
    const bool ScopesConflict =
        CurrentContext->isDeclInLexicalTraversal(Earlier) ||
        Earlier->getDeclContext()->isDeclInLexicalTraversal(Current);
    if (!ScopesConflict)
      continue;

    // Identical spellings are the same identifier, not a confusable pair.
    if (Earlier->getIdentifier()->getName() == CurrentName)
      continue;

    diag(Earlier->getLocation(), "%0 is confusable with %1")
        << Earlier->getName() << CurrentName;
    diag(Current->getLocation(), "other declaration found here",
         DiagnosticIDs::Note);
  }

  Seen.push_back(Current);
}
// Registers a single matcher that binds every named declaration under the id
// "nameddecl", which is the id check() looks up.
void ConfusableIdentifierCheck::registerMatchers(
    ast_matchers::MatchFinder *Finder) {
  const auto AnyNamedDecl = ast_matchers::namedDecl().bind("nameddecl");
  Finder->addMatcher(AnyNamedDecl, this);
}
} // namespace misc
} // namespace tidy
} // namespace clang

View File

@@ -0,0 +1,40 @@
//===--- ConfusableIdentifierCheck.h - clang-tidy ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
#include "../ClangTidyCheck.h"
namespace clang {
namespace tidy {
namespace misc {
/// Finds symbols that have confusable identifiers, i.e. identifiers that look
/// the same visually but have a different Unicode representation.
/// If symbols are confusable but don't live in conflicting scopes, they are
/// not reported.
class ConfusableIdentifierCheck : public ClangTidyCheck {
public:
/// \param Name the name of the check, forwarded to ClangTidyCheck.
/// \param Context the clang-tidy context, forwarded to ClangTidyCheck.
ConfusableIdentifierCheck(StringRef Name, ClangTidyContext *Context);
~ConfusableIdentifierCheck();
/// Registers the matcher that feeds named declarations to check().
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
/// Compares a matched declaration with previously seen declarations that
/// share its skeleton and emits diagnostics for confusable pairs.
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
private:
/// Returns the skeleton of an identifier: the identifier with every
/// character listed in the confusables table replaced by its prototype.
std::string skeleton(StringRef);
/// Maps a skeleton to the declarations seen so far whose identifier has that
/// skeleton.
llvm::StringMap<llvm::SmallVector<const NamedDecl *>> Mapper;
};
} // namespace misc
} // namespace tidy
} // namespace clang
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H

View File

@@ -0,0 +1,85 @@
//===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
int main(int argc, char *argv[]) {
auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
if (!ErrorOrBuffer)
return 1;
std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
StringRef Content = Buffer->getBuffer();
Content = Content.drop_until([](char c) { return c == '#'; });
SmallVector<StringRef> Lines;
SplitString(Content, Lines, "\r\n");
std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
SmallVector<StringRef> Values;
for (StringRef Line : Lines) {
if (Line.startswith("#"))
continue;
Values.clear();
Line.split(Values, ';');
if (Values.size() < 2) {
errs() << "Failed to parse: " << Line << "\n";
return 2;
}
llvm::StringRef From = Values[0].trim();
llvm::UTF32 CodePoint;
From.getAsInteger(16, CodePoint);
SmallVector<llvm::UTF32> To;
SmallVector<StringRef> ToN;
Values[1].split(ToN, ' ', -1, false);
for (StringRef To_ : ToN) {
llvm::UTF32 ToCodePoint;
To_.trim().getAsInteger(16, ToCodePoint);
To.push_back(ToCodePoint);
}
// Sentinel
To.push_back(0);
Entries.emplace_back(CodePoint, To);
}
std::sort(Entries.begin(), Entries.end());
unsigned LargestValue =
std::max_element(Entries.begin(), Entries.end(),
[](const auto &Entry0, const auto &Entry1) {
return Entry0.second.size() < Entry1.second.size();
})
->second.size();
std::error_code ec;
llvm::raw_fd_ostream os(argv[2], ec);
// FIXME: If memory consumption and/or lookup time becomes a constraint, it
// maybe worth using a more elaborate data structure.
os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
<< "];} "
"ConfusableEntries[] = {\n";
for (const auto &Values : Entries) {
os << " { ";
os << Values.first;
os << ", {";
for (auto CP : Values.second)
os << CP << ", ";
os << "}},\n";
}
os << "};\n";
return 0;
}

View File

@@ -0,0 +1 @@
# Build-time generator that converts confusables.txt into Confusables.inc.
add_llvm_executable(make-confusable-table BuildConfusableTable.cpp)

File diff suppressed because it is too large Load Diff

View File

@@ -9,6 +9,7 @@
#include "../ClangTidy.h"
#include "../ClangTidyModule.h"
#include "../ClangTidyModuleRegistry.h"
#include "ConfusableIdentifierCheck.h"
#include "DefinitionsInHeadersCheck.h"
#include "MisleadingBidirectional.h"
#include "MisleadingIdentifier.h"
@@ -33,6 +34,8 @@ namespace misc {
class MiscModule : public ClangTidyModule {
public:
void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override {
CheckFactories.registerCheck<ConfusableIdentifierCheck>(
"misc-confusable-identifiers");
CheckFactories.registerCheck<DefinitionsInHeadersCheck>(
"misc-definitions-in-headers");
CheckFactories.registerCheck<MisleadingBidirectionalCheck>(

View File

@@ -139,6 +139,10 @@ New checks
Future libc++ will remove the extension (`D120996
<https://reviews.llvm.org/D120996>`).
- New :doc:`misc-confusable-identifiers <clang-tidy/checks/misc/confusable-identifiers>` check.
Detects confusable Unicode identifiers.
New check aliases
^^^^^^^^^^^^^^^^^

View File

@@ -237,6 +237,7 @@ Clang-Tidy Checks
`llvmlibc-callee-namespace <llvmlibc/callee-namespace.html>`_,
`llvmlibc-implementation-in-namespace <llvmlibc/implementation-in-namespace.html>`_,
`llvmlibc-restrict-system-libc-headers <llvmlibc/restrict-system-libc-headers.html>`_, "Yes"
`misc-confusable-identifiers <misc/confusable-identifiers.html>`_,
`misc-definitions-in-headers <misc/definitions-in-headers.html>`_, "Yes"
`misc-misleading-bidirectional <misc/misleading-bidirectional.html>`_,
`misc-misleading-identifier <misc/misleading-identifier.html>`_,

View File

@@ -0,0 +1,15 @@
.. title:: clang-tidy - misc-confusable-identifiers
misc-confusable-identifiers
===========================
Warn about confusable identifiers, i.e. identifiers that are visually close to
each other, but use different Unicode characters. This detects a potential
attack described in `CVE-2021-42694 <https://www.cve.org/CVERecord?id=CVE-2021-42694>`_.
Example:
.. code-block:: c++
int fo; // Initial character is U+0066 (LATIN SMALL LETTER F).
int 𝐟o; // Initial character is U+1D41F (MATHEMATICAL BOLD SMALL F) not U+0066 (LATIN SMALL LETTER F).

View File

@@ -0,0 +1,25 @@
// RUN: %check_clang_tidy %s misc-confusable-identifiers %t
int fo;
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: fo is confusable with 𝐟o [misc-confusable-identifiers]
int 𝐟o;
// CHECK-MESSAGES: :[[#@LINE-1]]:5: note: other declaration found here
void no() {
int 𝐟oo;
}
void worry() {
int foo;
}
int 𝐟i;
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 𝐟i is confusable with fi [misc-confusable-identifiers]
int fi;
// CHECK-MESSAGES: :[[#@LINE-1]]:5: note: other declaration found here
// should not print anything
namespace ns {
struct Foo {};
} // namespace ns
auto f = ns::Foo();