diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 5cca58400ff4..6715e354e23e 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -15,12 +15,7 @@ add_object_library(
   DEPENDS
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.src.__support.error_or
+    libc.src.__support.math_extras
     .mbstate
-    .utf_ret
-)
-
-add_header_library(
-  utf_ret
-  HDRS
-    utf_ret.h
 )
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc..bac2f6d827e1 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,8 +8,10 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/math_extras.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 #include "character_converter.h"
 
@@ -18,17 +20,92 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
+// Resets the shared mbstate so that no conversion is in progress.
+void CharacterConverter::clear() {
+  state->partial = 0;
+  state->bytes_processed = 0;
+  state->total_bytes = 0;
+}
+
 bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {}
+// Starts a utf32 -> utf8 conversion by storing `utf32` in the state and
+// computing how many utf-8 bytes pop_utf8() will have to produce for it.
+// Returns 0 on success and -1 if a conversion is still in progress or `utf32`
+// is not an encodable unicode value.
+int CharacterConverter::push(char32_t utf32) {
+  // we can't be partially through a conversion when pushing a utf32 value
+  if (!isComplete())
+    return -1;
 
-int CharacterConverter::push(char32_t utf32) {}
+  // surrogate code points (U+D800..U+DFFF) are reserved for utf-16 and have
+  // no well-formed utf-8 encoding
+  constexpr char32_t FIRST_SURROGATE = 0xd800;
+  constexpr char32_t LAST_SURROGATE = 0xdfff;
+  if (utf32 >= FIRST_SURROGATE && utf32 <= LAST_SURROGATE) {
+    clear();
+    return -1;
+  }
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+  state->partial = utf32;
+  state->bytes_processed = 0;
+
+  // determine number of utf-8 bytes needed to represent this utf32 value
+  constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+  constexpr int NUM_RANGES = 4;
+  for (uint8_t i = 0; i < NUM_RANGES; i++) {
+    if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
+      state->total_bytes = i + 1;
+      return 0;
+    }
+  }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+  // `utf32` contains a value that is too large to actually represent a valid
+  // unicode character
+  clear();
+  return -1;
+}
+
+// Produces the next byte of the utf-8 encoding of the value stored by
+// push(char32_t), most significant byte first. Returns Error(-1) when there
+// is nothing left to pop.
+ErrorOr<char8_t> CharacterConverter::pop_utf8() {
+  if (isComplete())
+    return Error(-1);
+
+  constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
+  constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
+
+  // the number of bits per utf-8 byte that actually encode character
+  // information not metadata (# of bits excluding the byte headers)
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  constexpr int MASK_ENCODED_BITS =
+      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();
+
+  char32_t output;
+
+  // Shift to get the next 6 bits from the utf32 encoding
+  const char32_t shift_amount =
+      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
+  if (state->bytes_processed == 0) {
+    /*
+      Choose the correct set of most significant bits to encode the length
+      of the utf8 sequence. The remaining bits contain the most significant
+      bits of the unicode value of the character.
+    */
+    output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
+             (state->partial >> shift_amount);
+  } else {
+    // Get the next 6 bits and format it like so: 10xxxxxx
+    output = CONTINUING_BYTE_HEADER |
+             ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
+  }
+
+  state->bytes_processed++;
+  return static_cast<char8_t>(output);
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d0602d2defe2..c4ba7cf6b689 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,8 +11,9 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -24,13 +25,14 @@ private:
 public:
   CharacterConverter(mbstate *mbstate);
 
+  void clear();
   bool isComplete();
 
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
-  utf_ret<char8_t> pop_utf8();
-  utf_ret<char32_t> pop_utf32();
+  ErrorOr<char8_t> pop_utf8();
+  ErrorOr<char32_t> pop_utf32();
 };
 
 } // namespace internal
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index d33ee354a544..fb08fb4eaa18 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 struct mbstate {
+  // store a partial codepoint (in UTF-32)
   char32_t partial;
+
+  /*
+  Progress towards a conversion
+    For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
+    For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
+  */
   uint8_t bytes_processed;
+
+  // Total number of bytes that will be needed to represent this character
   uint8_t total_bytes;
 };
 
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
deleted file mode 100644
index fa99b76159bd..000000000000
--- a/libc/src/__support/wchar/utf_ret.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-template <typename T> struct utf_ret {
-  T out;
-  int error;
-};
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5c..76218a16e0cf 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
+
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 000000000000..5dff6e9115f7
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
new file mode 100644
index 000000000000..f4c5cb863ff3
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -0,0 +1,198 @@
+//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // utf8 1-byte encodings are identical to their utf32 representations
+  char32_t utf32_A = 0x41; // 'A'
+  cr.push(utf32_A);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'A');
+  ASSERT_TRUE(cr.isComplete());
+
+  char32_t utf32_B = 0x42; // 'B'
+  cr.push(utf32_B);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'B');
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
+  char32_t utf32 = 0xff;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
+  utf32 = 0x58e;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  char32_t utf32 = 0x1f921;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  utf32 = 0x12121;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  char32_t utf32 = 0x12121;
+  ASSERT_EQ(cr.push(utf32), 0);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+
+  // can't push a utf32 without finishing popping the utf8 bytes out
+  int err = cr.push(utf32);
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, RejectsInvalidCodePoints) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // values above U+10FFFF are not unicode code points
+  ASSERT_EQ(cr.push(static_cast<char32_t>(0x110000)), -1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // surrogates (U+D800..U+DFFF) cannot be encoded in well-formed utf8
+  ASSERT_EQ(cr.push(static_cast<char32_t>(0xd800)), -1);
+  ASSERT_EQ(cr.push(static_cast<char32_t>(0xdfff)), -1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // the converter is still usable after a rejected push
+  ASSERT_EQ(cr.push(static_cast<char32_t>(0xe000)), 0);
+}