[clang-tidy] Confusable identifiers detection
Detect identifiers that are confusable using a variant of Unicode definition
http://www.unicode.org/reports/tr39/#Confusable_Detection
and have conflicting scopes.
This is a recommit (with portability and feature fixes) of b94db7ed7e
Differential Revision: https://reviews.llvm.org/D112916
This commit is contained in:
@@ -3,8 +3,27 @@ set(LLVM_LINK_COMPONENTS
|
||||
Support
|
||||
)
|
||||
|
||||
# Select how the table generator is invoked. With LLVM_USE_HOST_TOOLS set the
# generator goes through build_native_tool(); otherwise the in-tree
# make-confusable-table executable target is run directly.
if(LLVM_USE_HOST_TOOLS)
  build_native_tool(make-confusable-table make_confusable_table)
  set(make_confusable_table_target "${make_confusable_table}")
else()
  set(make_confusable_table $<TARGET_FILE:make-confusable-table>)
  set(make_confusable_table_target make-confusable-table)
endif()

# The ConfusableTable subdirectory holds the generator and confusables.txt.
add_subdirectory(ConfusableTable)

# Produce Confusables.inc in the build directory from confusables.txt. The
# rule re-runs when either the generator or the data file changes.
add_custom_command(
  OUTPUT Confusables.inc
  COMMAND
    ${make_confusable_table}
    ${CMAKE_CURRENT_SOURCE_DIR}/ConfusableTable/confusables.txt
    ${CMAKE_CURRENT_BINARY_DIR}/Confusables.inc
  DEPENDS
    ${make_confusable_table_target}
    ConfusableTable/confusables.txt)

# Named target that drives the generation of Confusables.inc.
add_custom_target(genconfusable DEPENDS Confusables.inc)
|
||||
|
||||
add_clang_library(clangTidyMiscModule
|
||||
DefinitionsInHeadersCheck.cpp
|
||||
ConfusableIdentifierCheck.cpp
|
||||
MiscTidyModule.cpp
|
||||
MisleadingBidirectional.cpp
|
||||
MisleadingIdentifier.cpp
|
||||
@@ -28,6 +47,7 @@ add_clang_library(clangTidyMiscModule
|
||||
|
||||
DEPENDS
|
||||
omp_gen
|
||||
genconfusable
|
||||
)
|
||||
|
||||
clang_target_link_libraries(clangTidyMiscModule
|
||||
|
||||
126
clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
Normal file
126
clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
Normal file
@@ -0,0 +1,126 @@
|
||||
//===--- ConfusableIdentifierCheck.cpp -
|
||||
// clang-tidy--------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ConfusableIdentifierCheck.h"
|
||||
|
||||
#include "clang/Frontend/CompilerInstance.h"
|
||||
#include "clang/Lex/Preprocessor.h"
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
|
||||
namespace {
|
||||
// Preprocessed version of
|
||||
// https://www.unicode.org/Public/security/latest/confusables.txt
|
||||
//
|
||||
// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
|
||||
#include "Confusables.inc"
|
||||
} // namespace
|
||||
|
||||
namespace clang {
|
||||
namespace tidy {
|
||||
namespace misc {
|
||||
|
||||
ConfusableIdentifierCheck::ConfusableIdentifierCheck(StringRef Name,
|
||||
ClangTidyContext *Context)
|
||||
: ClangTidyCheck(Name, Context) {}
|
||||
|
||||
ConfusableIdentifierCheck::~ConfusableIdentifierCheck() = default;
|
||||
|
||||
// Build a skeleton out of the Original identifier, inspired by the algorithm
|
||||
// described in http://www.unicode.org/reports/tr39/#def-skeleton
|
||||
//
|
||||
// FIXME: TR39 mandates:
|
||||
//
|
||||
// For an input string X, define skeleton(X) to be the following transformation
|
||||
// on the string:
|
||||
//
|
||||
// 1. Convert X to NFD format, as described in [UAX15].
|
||||
// 2. Concatenate the prototypes for each character in X according to the
|
||||
// specified data, producing a string of exemplar characters.
|
||||
// 3. Reapply NFD.
|
||||
//
|
||||
// We're skipping 1. and 3. for the sake of simplicity, but this can lead to
|
||||
// false positive.
|
||||
|
||||
std::string ConfusableIdentifierCheck::skeleton(StringRef Name) {
|
||||
using namespace llvm;
|
||||
std::string SName = Name.str();
|
||||
std::string Skeleton;
|
||||
Skeleton.reserve(1 + Name.size());
|
||||
|
||||
const char *Curr = SName.c_str();
|
||||
const char *End = Curr + SName.size();
|
||||
while (Curr < End) {
|
||||
|
||||
const char *Prev = Curr;
|
||||
UTF32 CodePoint;
|
||||
ConversionResult Result = convertUTF8Sequence(
|
||||
reinterpret_cast<const UTF8 **>(&Curr),
|
||||
reinterpret_cast<const UTF8 *>(End), &CodePoint, strictConversion);
|
||||
if (Result != conversionOK) {
|
||||
errs() << "Unicode conversion issue\n";
|
||||
break;
|
||||
}
|
||||
|
||||
StringRef Key(Prev, Curr - Prev);
|
||||
auto Where = std::lower_bound(std::begin(ConfusableEntries),
|
||||
std::end(ConfusableEntries), CodePoint,
|
||||
[](decltype(ConfusableEntries[0]) x,
|
||||
UTF32 y) { return x.codepoint < y; });
|
||||
if (Where == std::end(ConfusableEntries) || CodePoint != Where->codepoint) {
|
||||
Skeleton.append(Prev, Curr);
|
||||
} else {
|
||||
UTF8 Buffer[32];
|
||||
UTF8 *BufferStart = std::begin(Buffer);
|
||||
UTF8 *IBuffer = BufferStart;
|
||||
const UTF32 *ValuesStart = std::begin(Where->values);
|
||||
const UTF32 *ValuesEnd =
|
||||
std::find(std::begin(Where->values), std::end(Where->values), '\0');
|
||||
if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer,
|
||||
std::end(Buffer),
|
||||
strictConversion) != conversionOK) {
|
||||
errs() << "Unicode conversion issue\n";
|
||||
break;
|
||||
}
|
||||
Skeleton.append((char *)BufferStart, (char *)IBuffer);
|
||||
}
|
||||
}
|
||||
return Skeleton;
|
||||
}
|
||||
|
||||
void ConfusableIdentifierCheck::check(
|
||||
const ast_matchers::MatchFinder::MatchResult &Result) {
|
||||
if (const auto *ND = Result.Nodes.getNodeAs<NamedDecl>("nameddecl")) {
|
||||
if (IdentifierInfo *II = ND->getIdentifier()) {
|
||||
StringRef NDName = II->getName();
|
||||
llvm::SmallVector<const NamedDecl *> &Mapped = Mapper[skeleton(NDName)];
|
||||
const DeclContext *NDDecl = ND->getDeclContext();
|
||||
for (const NamedDecl *OND : Mapped) {
|
||||
if (!NDDecl->isDeclInLexicalTraversal(OND) &&
|
||||
!OND->getDeclContext()->isDeclInLexicalTraversal(ND))
|
||||
continue;
|
||||
if (OND->getIdentifier()->getName() != NDName) {
|
||||
diag(OND->getLocation(), "%0 is confusable with %1")
|
||||
<< OND->getName() << NDName;
|
||||
diag(ND->getLocation(), "other declaration found here",
|
||||
DiagnosticIDs::Note);
|
||||
}
|
||||
}
|
||||
Mapped.push_back(ND);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Registers a single matcher that binds every named declaration to
// "nameddecl", the id looked up by check().
void ConfusableIdentifierCheck::registerMatchers(
    ast_matchers::MatchFinder *Finder) {
  const auto AnyNamedDecl = ast_matchers::namedDecl().bind("nameddecl");
  Finder->addMatcher(AnyNamedDecl, this);
}
|
||||
|
||||
} // namespace misc
|
||||
} // namespace tidy
|
||||
} // namespace clang
|
||||
@@ -0,0 +1,40 @@
|
||||
//===--- ConfusableIdentifierCheck.h - clang-tidy
|
||||
//-------------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
|
||||
#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
|
||||
|
||||
#include "../ClangTidyCheck.h"
|
||||
|
||||
namespace clang {
|
||||
namespace tidy {
|
||||
namespace misc {
|
||||
|
||||
/// Finds symbol which have confusable identifiers, i.e. identifiers that look
|
||||
/// the same visually but have a different Unicode representation.
|
||||
/// If symbols are confusable but don't live in conflicting namespaces, they are
|
||||
/// not reported.
|
||||
class ConfusableIdentifierCheck : public ClangTidyCheck {
|
||||
public:
|
||||
ConfusableIdentifierCheck(StringRef Name, ClangTidyContext *Context);
|
||||
~ConfusableIdentifierCheck();
|
||||
|
||||
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
|
||||
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
|
||||
|
||||
private:
|
||||
std::string skeleton(StringRef);
|
||||
llvm::StringMap<llvm::SmallVector<const NamedDecl *>> Mapper;
|
||||
};
|
||||
|
||||
} // namespace misc
|
||||
} // namespace tidy
|
||||
} // namespace clang
|
||||
|
||||
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_CONFUSABLE_IDENTIFIER_CHECK_H
|
||||
@@ -0,0 +1,85 @@
|
||||
//===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/Support/ConvertUTF.h"
|
||||
#include "llvm/Support/MemoryBuffer.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
|
||||
if (!ErrorOrBuffer)
|
||||
return 1;
|
||||
std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
|
||||
StringRef Content = Buffer->getBuffer();
|
||||
Content = Content.drop_until([](char c) { return c == '#'; });
|
||||
SmallVector<StringRef> Lines;
|
||||
SplitString(Content, Lines, "\r\n");
|
||||
|
||||
std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
|
||||
SmallVector<StringRef> Values;
|
||||
for (StringRef Line : Lines) {
|
||||
if (Line.startswith("#"))
|
||||
continue;
|
||||
|
||||
Values.clear();
|
||||
Line.split(Values, ';');
|
||||
if (Values.size() < 2) {
|
||||
errs() << "Failed to parse: " << Line << "\n";
|
||||
return 2;
|
||||
}
|
||||
|
||||
llvm::StringRef From = Values[0].trim();
|
||||
llvm::UTF32 CodePoint;
|
||||
From.getAsInteger(16, CodePoint);
|
||||
|
||||
SmallVector<llvm::UTF32> To;
|
||||
SmallVector<StringRef> ToN;
|
||||
Values[1].split(ToN, ' ', -1, false);
|
||||
for (StringRef To_ : ToN) {
|
||||
llvm::UTF32 ToCodePoint;
|
||||
To_.trim().getAsInteger(16, ToCodePoint);
|
||||
To.push_back(ToCodePoint);
|
||||
}
|
||||
// Sentinel
|
||||
To.push_back(0);
|
||||
|
||||
Entries.emplace_back(CodePoint, To);
|
||||
}
|
||||
std::sort(Entries.begin(), Entries.end());
|
||||
|
||||
unsigned LargestValue =
|
||||
std::max_element(Entries.begin(), Entries.end(),
|
||||
[](const auto &Entry0, const auto &Entry1) {
|
||||
return Entry0.second.size() < Entry1.second.size();
|
||||
})
|
||||
->second.size();
|
||||
|
||||
std::error_code ec;
|
||||
llvm::raw_fd_ostream os(argv[2], ec);
|
||||
|
||||
// FIXME: If memory consumption and/or lookup time becomes a constraint, it
|
||||
// maybe worth using a more elaborate data structure.
|
||||
os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
|
||||
<< "];} "
|
||||
"ConfusableEntries[] = {\n";
|
||||
for (const auto &Values : Entries) {
|
||||
os << " { ";
|
||||
os << Values.first;
|
||||
os << ", {";
|
||||
for (auto CP : Values.second)
|
||||
os << CP << ", ";
|
||||
|
||||
os << "}},\n";
|
||||
}
|
||||
os << "};\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
add_llvm_executable(make-confusable-table BuildConfusableTable.cpp)
|
||||
9638
clang-tools-extra/clang-tidy/misc/ConfusableTable/confusables.txt
Normal file
9638
clang-tools-extra/clang-tidy/misc/ConfusableTable/confusables.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -9,6 +9,7 @@
|
||||
#include "../ClangTidy.h"
|
||||
#include "../ClangTidyModule.h"
|
||||
#include "../ClangTidyModuleRegistry.h"
|
||||
#include "ConfusableIdentifierCheck.h"
|
||||
#include "DefinitionsInHeadersCheck.h"
|
||||
#include "MisleadingBidirectional.h"
|
||||
#include "MisleadingIdentifier.h"
|
||||
@@ -33,6 +34,8 @@ namespace misc {
|
||||
class MiscModule : public ClangTidyModule {
|
||||
public:
|
||||
void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override {
|
||||
CheckFactories.registerCheck<ConfusableIdentifierCheck>(
|
||||
"misc-confusable-identifiers");
|
||||
CheckFactories.registerCheck<DefinitionsInHeadersCheck>(
|
||||
"misc-definitions-in-headers");
|
||||
CheckFactories.registerCheck<MisleadingBidirectionalCheck>(
|
||||
|
||||
@@ -139,6 +139,10 @@ New checks
|
||||
Future libc++ will remove the extension (`D120996
|
||||
<https://reviews.llvm.org/D120996>`).
|
||||
|
||||
- New :doc:`misc-confusable-identifiers <clang-tidy/checks/misc-confusable-identifiers>` check.
|
||||
|
||||
Detects confusable Unicode identifiers.
|
||||
|
||||
New check aliases
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
||||
@@ -237,6 +237,7 @@ Clang-Tidy Checks
|
||||
`llvmlibc-callee-namespace <llvmlibc/callee-namespace.html>`_,
|
||||
`llvmlibc-implementation-in-namespace <llvmlibc/implementation-in-namespace.html>`_,
|
||||
`llvmlibc-restrict-system-libc-headers <llvmlibc/restrict-system-libc-headers.html>`_, "Yes"
|
||||
`misc-confusable-identifiers <misc/confusable-identifiers.html>`_,
|
||||
`misc-definitions-in-headers <misc/definitions-in-headers.html>`_, "Yes"
|
||||
`misc-misleading-bidirectional <misc/misleading-bidirectional.html>`_,
|
||||
`misc-misleading-identifier <misc/misleading-identifier.html>`_,
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
.. title:: clang-tidy - misc-confusable-identifiers
|
||||
|
||||
misc-confusable-identifiers
|
||||
===========================
|
||||
|
||||
Warn about confusable identifiers, i.e. identifiers that are visually close to
|
||||
each other, but use different Unicode characters. This detects a potential
|
||||
attack described in `CVE-2021-42574 <https://www.cve.org/CVERecord?id=CVE-2021-42574>`_.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: c++
|
||||
|
||||
int fo; // Initial character is U+0066 (LATIN SMALL LETTER F).
|
||||
int 𝐟o; // Initial character is U+1D41F (MATHEMATICAL BOLD SMALL F) not U+0066 (LATIN SMALL LETTER F).
|
||||
@@ -0,0 +1,25 @@
|
||||
// RUN: %check_clang_tidy %s misc-confusable-identifiers %t
|
||||
|
||||
int fo;
|
||||
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: fo is confusable with 𝐟o [misc-confusable-identifiers]
|
||||
int 𝐟o;
|
||||
// CHECK-MESSAGES: :[[#@LINE-1]]:5: note: other declaration found here
|
||||
|
||||
void no() {
|
||||
int 𝐟oo;
|
||||
}
|
||||
|
||||
void worry() {
|
||||
int foo;
|
||||
}
|
||||
|
||||
int 𝐟i;
|
||||
// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 𝐟i is confusable with fi [misc-confusable-identifiers]
|
||||
int fi;
|
||||
// CHECK-MESSAGES: :[[#@LINE-1]]:5: note: other declaration found here
|
||||
|
||||
// should not print anything
|
||||
namespace ns {
|
||||
struct Foo {};
|
||||
} // namespace ns
|
||||
auto f = ns::Foo();
|
||||
Reference in New Issue
Block a user