[libc] Implemented CharacterConverter push/pop for utf32->utf8 conversions (#143971)

Implemented the CharacterConverter methods for converting from UTF-32 to
UTF-8.
Added tests.

---------

Co-authored-by: Michael Jones <michaelrj@google.com>
This commit is contained in:
Uzair Nawaz
2025-06-16 20:06:46 +00:00
committed by GitHub
parent a637584fad
commit 8adccaee2a
8 changed files with 278 additions and 39 deletions

View File

@@ -15,12 +15,7 @@ add_object_library(
DEPENDS
libc.hdr.types.char8_t
libc.hdr.types.char32_t
libc.src.__support.error_or
libc.src.__support.math_extras
.mbstate
.utf_ret
)
add_header_library(
utf_ret
HDRS
utf_ret.h
)

View File

@@ -8,8 +8,10 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/math_extras.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
#include "character_converter.h"
@@ -18,17 +20,75 @@ namespace internal {
// Binds this converter to caller-owned conversion state. The state is stored
// as given and is not reset here; call clear() to start from a known state.
CharacterConverter::CharacterConverter(mbstate *mbstate) : state(mbstate) {}
// Resets the bound state: both byte counters and the buffered code point are
// zeroed, which also makes isComplete() report true.
void CharacterConverter::clear() {
  state->total_bytes = 0;
  state->bytes_processed = 0;
  state->partial = 0;
}
// Reports whether the current character has been fully handled, i.e. the
// number of bytes processed so far equals the total required for it.
bool CharacterConverter::isComplete() {
  const auto processed = state->bytes_processed;
  const auto required = state->total_bytes;
  return processed == required;
}
// NOTE(review): empty stub -- this non-void function has no return statement,
// so calling it is undefined behavior until it is implemented. Confirm
// whether this definition is still present in the real file.
int CharacterConverter::push(char8_t utf8_byte) {}
// Starts a UTF-32 -> UTF-8 conversion: stores `utf32` in the state, resets the
// processed-byte counter and records how many UTF-8 bytes the value needs.
// Returns 0 on success. Returns -1 if a previous conversion has not been fully
// popped yet (state is left untouched), or if `utf32` is greater than 0x10ffff
// (state is cleared).
// NOTE(review): the empty `... {}` definitions interleaved in this body look
// like leftover stub lines from the diff rendering rather than part of this
// function -- confirm against the real file.
// NOTE(review): only the upper bound of each range is checked, so surrogate
// code points (0xD800-0xDFFF) are accepted -- confirm whether that is intended.
int CharacterConverter::push(char32_t utf32) {
// we can't be partially through a conversion when pushing a utf32 value
if (!isComplete())
return -1;
int CharacterConverter::push(char32_t utf32) {}
state->partial = utf32;
state->bytes_processed = 0;
utf_ret<char8_t> CharacterConverter::pop_utf8() {}
// determine number of utf-8 bytes needed to represent this utf32 value
// MAX_VALUE_PER_UTF8_LEN[i] is the largest value that fits in i + 1 bytes
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
constexpr int NUM_RANGES = 4;
for (uint8_t i = 0; i < NUM_RANGES; i++) {
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
state->total_bytes = i + 1;
return 0;
}
}
utf_ret<char32_t> CharacterConverter::pop_utf32() {}
// `utf32` contains a value that is too large to actually represent a valid
// unicode character
clear();
return -1;
}
// Produces the next UTF-8 byte of the character stored by push(char32_t).
// Returns Error(-1) when there is nothing left to emit (isComplete() is true);
// otherwise returns the byte and advances the processed-byte counter.
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
  if (isComplete())
    return Error(-1);

  // Length markers for the leading byte, indexed by (sequence length - 1).
  constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
  // Marker carried by every byte after the first: 10xxxxxx.
  constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
  // Number of payload (non-header) bits carried by a continuation byte.
  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
  constexpr int MASK_ENCODED_BITS =
      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();

  const bool is_leading_byte = state->bytes_processed == 0;

  // Every byte still to come after this one carries 6 payload bits, so shift
  // those away to bring this byte's share of the code point to the bottom.
  const char32_t shift =
      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
  const char32_t payload = state->partial >> shift;

  state->bytes_processed++;

  // Leading byte: length marker plus the most significant bits of the value.
  if (is_leading_byte)
    return static_cast<char8_t>(FIRST_BYTE_HEADERS[state->total_bytes - 1] |
                                payload);

  // Continuation byte: 10xxxxxx with the next 6 bits of the value.
  return static_cast<char8_t>(CONTINUING_BYTE_HEADER |
                              (payload & MASK_ENCODED_BITS));
}
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -11,8 +11,9 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -24,13 +25,14 @@ private:
public:
CharacterConverter(mbstate *mbstate);
void clear();
bool isComplete();
int push(char8_t utf8_byte);
int push(char32_t utf32);
utf_ret<char8_t> pop_utf8();
utf_ret<char32_t> pop_utf32();
ErrorOr<char8_t> pop_utf8();
ErrorOr<char32_t> pop_utf32();
};
} // namespace internal

View File

@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
namespace internal {
// Conversion state that CharacterConverter reads and updates through the
// pointer it is constructed with.
struct mbstate {
// store a partial codepoint (in UTF-32)
// For utf32 -> utf8 this is the value passed to CharacterConverter::push.
char32_t partial;
/*
Progress towards a conversion
For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
For utf32 -> utf8, reset to 0 by CharacterConverter::push(utf32) and
increases with each CharacterConverter::pop_utf8()
*/
uint8_t bytes_processed;
// Total number of bytes that will be needed to represent this character.
// A conversion is complete once bytes_processed equals this value.
uint8_t total_bytes;
};

View File

@@ -1,24 +0,0 @@
//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
#include "src/__support/common.h"
namespace LIBC_NAMESPACE_DECL {
namespace internal {
// Pairs a value of type T with an integer error indicator.
// NOTE(review): which `error` values mean success or failure is not visible
// here -- confirm against the users of this type.
template <typename T> struct utf_ret {
T out;
int error;
};
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H

View File

@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
# Requires access to the uchar header, which is not available on macOS.
# Therefore, this cannot currently be built on macOS in overlay mode.
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
endif()

View File

@@ -0,0 +1,11 @@
# Umbrella target for the wchar support tests.
# NOTE(review): the test below is registered under the libc-support-tests
# suite, not under this target -- confirm that is intended.
add_custom_target(libc-support-wchar-tests)
# Unit test for the UTF-32 -> UTF-8 path of CharacterConverter.
add_libc_test(
utf32_to_8_test
SUITE
libc-support-tests
SRCS
utf32_to_8_test.cpp
DEPENDS
libc.src.__support.wchar.character_converter
)

View File

@@ -0,0 +1,180 @@
//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/__support/common.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
#include "test/UnitTest/Test.h"
// Code points up to 0x7f must come out as a single, unchanged byte.
TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` is declared without an initializer, so reset it before use
  cr.clear();

  // utf8 1-byte encodings are identical to their utf32 representations
  char32_t utf32_A = 0x41; // 'A'
  // a rejected push would otherwise only show up as a confusing pop failure
  ASSERT_EQ(cr.push(utf32_A), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<char>(popped.value()), 'A');
  ASSERT_TRUE(cr.isComplete());

  char32_t utf32_B = 0x42; // 'B'
  ASSERT_EQ(cr.push(utf32_B), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<char>(popped.value()), 'B');
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
// Code points in 0x80..0x7ff must come out as a 2-byte sequence.
TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` is declared without an initializer, so reset it before use
  cr.clear();

  // testing utf32: 0xff -> utf8: 0xc3 0xbf
  char32_t utf32 = 0xff;
  // a rejected push would otherwise only show up as a confusing pop failure
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
  ASSERT_TRUE(cr.isComplete());

  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
  utf32 = 0x58e;
  ASSERT_EQ(cr.push(utf32), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
// Code points in 0x800..0xffff must come out as a 3-byte sequence.
TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` is declared without an initializer, so reset it before use
  cr.clear();

  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
  char32_t utf32 = 0xac15;
  // a rejected push would otherwise only show up as a confusing pop failure
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
  ASSERT_TRUE(cr.isComplete());

  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
  utf32 = 0x267b;
  ASSERT_EQ(cr.push(utf32), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
// Code points in 0x10000..0x10ffff must come out as a 4-byte sequence.
TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` is declared without an initializer, so reset it before use
  cr.clear();

  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
  char32_t utf32 = 0x1f921;
  // a rejected push would otherwise only show up as a confusing pop failure
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
  ASSERT_TRUE(cr.isComplete());

  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
  utf32 = 0x12121;
  ASSERT_EQ(cr.push(utf32), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
// A push must be rejected while bytes of the previous character are still
// pending, and the rejected push must not disturb that pending conversion.
TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` is declared without an initializer, so reset it before use
  cr.clear();

  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
  char32_t utf32 = 0x12121;
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);

  // can't push a utf32 without finishing popping the utf8 bytes out
  ASSERT_EQ(cr.push(utf32), -1);
  ASSERT_FALSE(cr.isComplete());

  // the rejected push must leave the remaining bytes of the original
  // character intact
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
  ASSERT_TRUE(cr.isComplete());
}