[libc] utf8 to 32 CharacterConverter (#143973)
Implemented push and pop for utf8 to 32 conversion and tests. --------- Co-authored-by: Sriya Pratipati <sriyap@google.com>
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
#include "hdr/types/char32_t.h"
|
||||
#include "hdr/types/char8_t.h"
|
||||
#include "src/__support/CPP/bit.h"
|
||||
#include "src/__support/common.h"
|
||||
#include "src/__support/error_or.h"
|
||||
#include "src/__support/math_extras.h"
|
||||
@@ -30,6 +31,49 @@ bool CharacterConverter::isComplete() {
|
||||
return state->bytes_processed == state->total_bytes;
|
||||
}
|
||||
|
||||
int CharacterConverter::push(char8_t utf8_byte) {
|
||||
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
|
||||
// Checking the first byte if first push
|
||||
if (state->bytes_processed == 0) {
|
||||
// UTF-8 char has 1 byte total
|
||||
if (num_ones == 0) {
|
||||
state->total_bytes = 1;
|
||||
}
|
||||
// UTF-8 char has 2 through 4 bytes total
|
||||
else if (num_ones >= 2 && num_ones <= 4) {
|
||||
/* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
|
||||
we will make the base mask with 7 ones and right shift it as necessary. */
|
||||
constexpr size_t SIGNIFICANT_BITS = 7;
|
||||
uint32_t base_mask = mask_trailing_ones<uint32_t, SIGNIFICANT_BITS>();
|
||||
state->total_bytes = num_ones;
|
||||
utf8_byte &= (base_mask >> num_ones);
|
||||
}
|
||||
// Invalid first byte
|
||||
else {
|
||||
// bytes_processed and total_bytes will always be 0 here
|
||||
state->partial = static_cast<char32_t>(0);
|
||||
return -1;
|
||||
}
|
||||
state->partial = static_cast<char32_t>(utf8_byte);
|
||||
state->bytes_processed++;
|
||||
return 0;
|
||||
}
|
||||
// Any subsequent push
|
||||
// Adding 6 more bits so need to left shift
|
||||
constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
|
||||
if (num_ones == 1 && !isComplete()) {
|
||||
char32_t byte =
|
||||
utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
|
||||
state->partial = state->partial << ENCODED_BITS_PER_UTF8;
|
||||
state->partial |= byte;
|
||||
state->bytes_processed++;
|
||||
return 0;
|
||||
}
|
||||
// Invalid byte -> reset the state
|
||||
clear();
|
||||
return -1;
|
||||
}
|
||||
|
||||
int CharacterConverter::push(char32_t utf32) {
|
||||
// we can't be partially through a conversion when pushing a utf32 value
|
||||
if (!isComplete())
|
||||
@@ -54,6 +98,17 @@ int CharacterConverter::push(char32_t utf32) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
|
||||
// If pop is called too early, do not reset the state, use error to determine
|
||||
// whether enough bytes have been pushed
|
||||
if (!isComplete() || state->bytes_processed == 0)
|
||||
return Error(-1);
|
||||
char32_t utf32 = state->partial;
|
||||
// reset if successful pop
|
||||
clear();
|
||||
return utf32;
|
||||
}
|
||||
|
||||
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
|
||||
if (isComplete())
|
||||
return Error(-1);
|
||||
|
||||
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
|
||||
add_subdirectory(HashTable)
|
||||
add_subdirectory(time)
|
||||
add_subdirectory(threads)
|
||||
|
||||
# Requires access to uchar header which is not on macos
|
||||
# Therefore, cannot currently build this on macos in overlay mode
|
||||
# Requires access to uchar header which is not on MacOS
|
||||
# Cannot currently build this on MacOS in overlay mode
|
||||
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
|
||||
add_subdirectory(wchar)
|
||||
endif()
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
add_custom_target(libc-support-wchar-tests)
|
||||
|
||||
add_libc_test(
|
||||
utf8_to_32_test
|
||||
SUITE
|
||||
libc-support-tests
|
||||
SRCS
|
||||
utf8_to_32_test.cpp
|
||||
DEPENDS
|
||||
libc.src.__support.wchar.character_converter
|
||||
)
|
||||
|
||||
add_libc_test(
|
||||
utf32_to_8_test
|
||||
SUITE
|
||||
|
||||
196
libc/test/src/__support/wchar/utf8_to_32_test.cpp
Normal file
196
libc/test/src/__support/wchar/utf8_to_32_test.cpp
Normal file
@@ -0,0 +1,196 @@
|
||||
//===-- Unittests for character_converter utf8->utf32 ---------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/__support/error_or.h"
|
||||
#include "src/__support/wchar/character_converter.h"
|
||||
#include "src/__support/wchar/mbstate.h"
|
||||
#include "test/UnitTest/Test.h"
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
char ch = 'A';
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
int err = char_conv.push(static_cast<char8_t>(ch));
|
||||
auto wch = char_conv.pop_utf32();
|
||||
|
||||
ASSERT_EQ(err, 0);
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 65);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[2] = {static_cast<char>(0xC2),
|
||||
static_cast<char>(0x8E)}; // car symbol
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
auto wch = char_conv.pop_utf32();
|
||||
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 142);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
|
||||
static_cast<char>(0x91)}; // ∑ sigma symbol
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
char_conv.push(static_cast<char8_t>(ch[2]));
|
||||
auto wch = char_conv.pop_utf32();
|
||||
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 8721);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
|
||||
static_cast<char>(0xA4),
|
||||
static_cast<char>(0xA1)}; // 🤡 clown emoji
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
char_conv.push(static_cast<char8_t>(ch[2]));
|
||||
char_conv.push(static_cast<char8_t>(ch[3]));
|
||||
auto wch = char_conv.pop_utf32();
|
||||
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 129313);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch = static_cast<char>(0x80); // invalid starting bit sequence
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
int err = char_conv.push(static_cast<char8_t>(ch));
|
||||
|
||||
ASSERT_EQ(err, -1);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[4] = {
|
||||
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
|
||||
static_cast<char>(0x00)}; // first and third bytes are invalid
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
int err = char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
ASSERT_EQ(err, -1);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
ASSERT_EQ(err, 0);
|
||||
// Prev byte was single byte so trying to push another should error.
|
||||
err = char_conv.push(static_cast<char8_t>(ch[2]));
|
||||
ASSERT_EQ(err, -1);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[3]));
|
||||
ASSERT_EQ(err, 0);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
// Last byte is invalid since it does not have correct starting sequence.
|
||||
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
|
||||
const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
|
||||
static_cast<char>(0x80), static_cast<char>(0xC0)};
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
int err = char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
ASSERT_EQ(err, 0);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
ASSERT_EQ(err, 0);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[2]));
|
||||
ASSERT_EQ(err, 0);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[3]));
|
||||
ASSERT_EQ(err, -1);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
|
||||
static_cast<char>(0x80)};
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
int err = char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
ASSERT_EQ(err, 0);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
ASSERT_EQ(err, 0);
|
||||
// Should produce an error on 3rd byte
|
||||
err = char_conv.push(static_cast<char8_t>(ch[2]));
|
||||
ASSERT_EQ(err, -1);
|
||||
|
||||
// Should produce an error since mbstate was reset
|
||||
auto wch = char_conv.pop_utf32();
|
||||
ASSERT_FALSE(wch.has_value());
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
|
||||
static_cast<char>(0xC7), static_cast<char>(0x8C)};
|
||||
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
int err = char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
ASSERT_EQ(err, 0);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
ASSERT_EQ(err, 0);
|
||||
auto wch = char_conv.pop_utf32();
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 142);
|
||||
|
||||
// Second two byte character
|
||||
err = char_conv.push(static_cast<char8_t>(ch[2]));
|
||||
ASSERT_EQ(err, 0);
|
||||
err = char_conv.push(static_cast<char8_t>(ch[3]));
|
||||
ASSERT_EQ(err, 0);
|
||||
wch = char_conv.pop_utf32();
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 460);
|
||||
}
|
||||
|
||||
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
|
||||
LIBC_NAMESPACE::internal::mbstate state;
|
||||
state.bytes_processed = 0;
|
||||
state.total_bytes = 0;
|
||||
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
|
||||
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
|
||||
int err = char_conv.push(static_cast<char8_t>(ch[0]));
|
||||
ASSERT_EQ(err, 0);
|
||||
auto wch = char_conv.pop_utf32();
|
||||
ASSERT_FALSE(
|
||||
wch.has_value()); // Should fail since we have not read enough bytes
|
||||
err = char_conv.push(static_cast<char8_t>(ch[1]));
|
||||
ASSERT_EQ(err, 0);
|
||||
wch = char_conv.pop_utf32();
|
||||
ASSERT_TRUE(wch.has_value());
|
||||
ASSERT_EQ(static_cast<int>(wch.value()), 142);
|
||||
}
|
||||
Reference in New Issue
Block a user