[libc] utf8 to 32 CharacterConverter (#143973)

Implemented push and pop for utf8 to 32 conversion and tests.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
This commit is contained in:
sribee8
2025-06-16 15:20:08 -07:00
committed by GitHub
parent 2488f26d15
commit 98eee4b554
4 changed files with 263 additions and 3 deletions

View File

@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
# Requires access to uchar header which is not on macos
# Therefore, cannot currently build this on macos in overlay mode
# Requires access to uchar header which is not on MacOS
# Cannot currently build this on MacOS in overlay mode
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
endif()

View File

@@ -1,5 +1,15 @@
add_custom_target(libc-support-wchar-tests)
add_libc_test(
utf8_to_32_test
SUITE
libc-support-tests
SRCS
utf8_to_32_test.cpp
DEPENDS
libc.src.__support.wchar.character_converter
)
add_libc_test(
utf32_to_8_test
SUITE

View File

@@ -0,0 +1,196 @@
//===-- Unittests for character_converter utf8->utf32 ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
#include "test/UnitTest/Test.h"
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
char ch = 'A';
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch));
auto wch = char_conv.pop_utf32();
ASSERT_EQ(err, 0);
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 65);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[2] = {static_cast<char>(0xC2),
static_cast<char>(0x8E)}; // Ž car symbol
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
auto wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 142);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
static_cast<char>(0x91)}; // ∑ sigma symbol
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
char_conv.push(static_cast<char8_t>(ch[2]));
auto wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 8721);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
static_cast<char>(0xA4),
static_cast<char>(0xA1)}; // 🤡 clown emoji
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
char_conv.push(static_cast<char8_t>(ch[2]));
char_conv.push(static_cast<char8_t>(ch[3]));
auto wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 129313);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch = static_cast<char>(0x80); // invalid starting bit sequence
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch));
ASSERT_EQ(err, -1);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
static_cast<char>(0x00)}; // first and third bytes are invalid
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, -1);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
// Prev byte was single byte so trying to push another should error.
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, -1);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, 0);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
// Last byte is invalid since it does not have correct starting sequence.
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
static_cast<char>(0x80), static_cast<char>(0xC0)};
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, -1);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0x80)};
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
// Should produce an error on 3rd byte
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, -1);
// Should produce an error since mbstate was reset
auto wch = char_conv.pop_utf32();
ASSERT_FALSE(wch.has_value());
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0xC7), static_cast<char>(0x8C)};
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
auto wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 142);
// Second two byte character
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 460);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
auto wch = char_conv.pop_utf32();
ASSERT_FALSE(
wch.has_value()); // Should fail since we have not read enough bytes
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
ASSERT_TRUE(wch.has_value());
ASSERT_EQ(static_cast<int>(wch.value()), 142);
}