[libc] Implemented CharacterConverter push/pop for utf32->utf8 conversions (#143971)
Implemented the CharacterConverter methods for utf32 -> utf8 conversion and added tests.

Co-authored-by: Michael Jones <michaelrj@google.com>
This commit is contained in:
@@ -15,12 +15,7 @@ add_object_library(
|
||||
DEPENDS
|
||||
libc.hdr.types.char8_t
|
||||
libc.hdr.types.char32_t
|
||||
libc.src.__support.error_or
|
||||
libc.src.__support.math_extras
|
||||
.mbstate
|
||||
.utf_ret
|
||||
)
|
||||
|
||||
# Header-only library target named utf_ret whose single header is utf_ret.h.
add_header_library(
|
||||
utf_ret
|
||||
HDRS
|
||||
utf_ret.h
|
||||
)
|
||||
|
||||
@@ -8,8 +8,10 @@
|
||||
|
||||
#include "hdr/types/char32_t.h"
|
||||
#include "hdr/types/char8_t.h"
|
||||
#include "src/__support/common.h"
|
||||
#include "src/__support/error_or.h"
|
||||
#include "src/__support/math_extras.h"
|
||||
#include "src/__support/wchar/mbstate.h"
|
||||
#include "src/__support/wchar/utf_ret.h"
|
||||
|
||||
#include "character_converter.h"
|
||||
|
||||
@@ -18,17 +20,75 @@ namespace internal {
|
||||
|
||||
// Stores the given conversion-state pointer for later use by the other
// members. The pointed-to state is not read or modified here.
CharacterConverter::CharacterConverter(mbstate *mbstate) : state(mbstate) {}
|
||||
|
||||
// Resets the conversion state: both byte counters and the stored code point
// are zeroed, so isComplete() reports true afterwards.
void CharacterConverter::clear() {
  state->total_bytes = 0;
  state->bytes_processed = 0;
  state->partial = 0;
}
|
||||
|
||||
bool CharacterConverter::isComplete() {
|
||||
return state->bytes_processed == state->total_bytes;
|
||||
}
|
||||
|
||||
int CharacterConverter::push(char8_t utf8_byte) {}
|
||||
int CharacterConverter::push(char32_t utf32) {
|
||||
// we can't be partially through a conversion when pushing a utf32 value
|
||||
if (!isComplete())
|
||||
return -1;
|
||||
|
||||
int CharacterConverter::push(char32_t utf32) {}
|
||||
state->partial = utf32;
|
||||
state->bytes_processed = 0;
|
||||
|
||||
utf_ret<char8_t> CharacterConverter::pop_utf8() {}
|
||||
// determine number of utf-8 bytes needed to represent this utf32 value
|
||||
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
|
||||
constexpr int NUM_RANGES = 4;
|
||||
for (uint8_t i = 0; i < NUM_RANGES; i++) {
|
||||
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
|
||||
state->total_bytes = i + 1;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
utf_ret<char32_t> CharacterConverter::pop_utf32() {}
|
||||
// `utf32` contains a value that is too large to actually represent a valid
|
||||
// unicode character
|
||||
clear();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Produces the next utf8 byte of the code point stored by push(char32_t).
//
// Returns Error(-1) when isComplete() is true, i.e. there is no byte left to
// emit. Otherwise returns the next byte, most significant part of the code
// point first, and advances bytes_processed by one.
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
  if (isComplete())
    return Error(-1);

  // Every utf8 byte after the header bits carries 6 bits of the code point.
  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
  constexpr int MASK_ENCODED_BITS =
      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();

  // Header bits: the first byte's header depends on the sequence length
  // (index = length - 1); every following byte is formatted as 10xxxxxx.
  constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
  constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};

  // Number of low bits still to be emitted by later bytes; shifting by it
  // brings this byte's share of the code point down to the bottom.
  const char32_t shift_amount =
      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
  const char32_t shifted = state->partial >> shift_amount;

  const bool is_first_byte = state->bytes_processed == 0;
  const char32_t output =
      is_first_byte
          ? (FIRST_BYTE_HEADERS[state->total_bytes - 1] | shifted)
          : (CONTINUING_BYTE_HEADER | (shifted & MASK_ENCODED_BITS));

  ++state->bytes_processed;
  return static_cast<char8_t>(output);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace LIBC_NAMESPACE_DECL
|
||||
|
||||
@@ -11,8 +11,9 @@
|
||||
|
||||
#include "hdr/types/char32_t.h"
|
||||
#include "hdr/types/char8_t.h"
|
||||
#include "src/__support/common.h"
|
||||
#include "src/__support/error_or.h"
|
||||
#include "src/__support/wchar/mbstate.h"
|
||||
#include "src/__support/wchar/utf_ret.h"
|
||||
|
||||
namespace LIBC_NAMESPACE_DECL {
|
||||
namespace internal {
|
||||
@@ -24,13 +25,14 @@ private:
|
||||
public:
|
||||
CharacterConverter(mbstate *mbstate);
|
||||
|
||||
void clear();
|
||||
bool isComplete();
|
||||
|
||||
int push(char8_t utf8_byte);
|
||||
int push(char32_t utf32);
|
||||
|
||||
utf_ret<char8_t> pop_utf8();
|
||||
utf_ret<char32_t> pop_utf32();
|
||||
ErrorOr<char8_t> pop_utf8();
|
||||
ErrorOr<char32_t> pop_utf32();
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
|
||||
namespace internal {
|
||||
|
||||
// Conversion state used by CharacterConverter: the code point being converted
// plus two counters that track how far the conversion has progressed.
struct mbstate {
|
||||
// Stores a partial code point (in UTF-32). For utf32 -> utf8 this is the
// value handed to CharacterConverter::push(char32_t).
|
||||
char32_t partial;
|
||||
|
||||
/*
|
||||
Progress towards a conversion
|
||||
For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
|
||||
For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
|
||||
The conversion is complete once this equals total_bytes.
*/
|
||||
uint8_t bytes_processed;
|
||||
|
||||
// Total number of bytes that will be needed to represent this character.
// For utf32 -> utf8 this is set by CharacterConverter::push(char32_t).
|
||||
uint8_t total_bytes;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
|
||||
#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
|
||||
|
||||
#include "src/__support/common.h"
|
||||
|
||||
namespace LIBC_NAMESPACE_DECL {
|
||||
namespace internal {
|
||||
// Result bundle holding a value of type T alongside an integer error code.
// NOTE(review): which `error` value denotes success is not defined here;
// confirm against the code that fills this struct.
template <typename T> struct utf_ret {
|
||||
// the produced value
T out;
|
||||
// error code accompanying `out`
int error;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace LIBC_NAMESPACE_DECL
|
||||
|
||||
#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
|
||||
@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
|
||||
add_subdirectory(HashTable)
|
||||
add_subdirectory(time)
|
||||
add_subdirectory(threads)
|
||||
|
||||
# The wchar tests require access to the uchar header, which is not available
# on macOS.
|
||||
# Therefore this subdirectory cannot currently be built on macOS in overlay
# mode, and it is skipped for Darwin targets.
|
||||
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
|
||||
add_subdirectory(wchar)
|
||||
endif()
|
||||
|
||||
11
libc/test/src/__support/wchar/CMakeLists.txt
Normal file
11
libc/test/src/__support/wchar/CMakeLists.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
# Umbrella target for the wchar support tests.
add_custom_target(libc-support-wchar-tests)
|
||||
|
||||
# Unit test for the utf32 -> utf8 path of CharacterConverter.
# NOTE(review): the test is registered under libc-support-tests rather than
# the libc-support-wchar-tests target declared above; confirm which suite is
# intended.
add_libc_test(
|
||||
utf32_to_8_test
|
||||
SUITE
|
||||
libc-support-tests
|
||||
SRCS
|
||||
utf32_to_8_test.cpp
|
||||
DEPENDS
|
||||
libc.src.__support.wchar.character_converter
|
||||
)
|
||||
180
libc/test/src/__support/wchar/utf32_to_8_test.cpp
Normal file
180
libc/test/src/__support/wchar/utf32_to_8_test.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/__support/common.h"
|
||||
#include "src/__support/wchar/character_converter.h"
|
||||
#include "src/__support/wchar/mbstate.h"
|
||||
|
||||
#include "test/UnitTest/Test.h"
|
||||
|
||||
// Code points that fit in one utf8 byte: the single popped byte must equal
// the utf32 value, and the converter must be complete right after it.
TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  cr.clear();

  // utf8 1-byte encodings are identical to their utf32 representations
  struct TestCase {
    char32_t utf32;
    char expected;
  };
  constexpr TestCase CASES[] = {{0x41, 'A'}, {0x42, 'B'}};

  for (const TestCase &tc : CASES) {
    // pushing onto an idle converter must succeed; checking it here makes a
    // rejected push fail at the push instead of at a later pop
    ASSERT_EQ(cr.push(tc.utf32), 0);

    auto popped = cr.pop_utf8();
    ASSERT_TRUE(popped.has_value());
    ASSERT_EQ(static_cast<char>(popped.value()), tc.expected);
    ASSERT_TRUE(cr.isComplete());
  }

  // should error if we try to pop another utf8 byte out
  auto extra = cr.pop_utf8();
  ASSERT_FALSE(extra.has_value());
}
|
||||
|
||||
// Code points that need two utf8 bytes: bytes must come out in order and the
// converter must only report completion after the second one.
TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  cr.clear();

  constexpr int NUM_BYTES = 2;
  struct TestCase {
    char32_t utf32;
    int utf8[NUM_BYTES];
  };
  constexpr TestCase CASES[] = {
      {0xff, {0xc3, 0xbf}},  // utf32: 0xff -> utf8: 0xc3 0xbf
      {0x58e, {0xd6, 0x8e}}, // utf32: 0x58e -> utf8: 0xd6 0x8e
  };

  for (const TestCase &tc : CASES) {
    // pushing onto an idle converter must succeed; checking it here makes a
    // rejected push fail at the push instead of at a later pop
    ASSERT_EQ(cr.push(tc.utf32), 0);

    for (int i = 0; i < NUM_BYTES; ++i) {
      auto popped = cr.pop_utf8();
      ASSERT_TRUE(popped.has_value());
      ASSERT_EQ(static_cast<int>(popped.value()), tc.utf8[i]);

      // complete only once the final byte has been popped
      const bool is_last = (i + 1 == NUM_BYTES);
      ASSERT_TRUE(cr.isComplete() == is_last);
    }
  }

  // should error if we try to pop another utf8 byte out
  auto extra = cr.pop_utf8();
  ASSERT_FALSE(extra.has_value());
}
|
||||
|
||||
// Code points that need three utf8 bytes: bytes must come out in order and
// the converter must only report completion after the third one.
TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  cr.clear();

  constexpr int NUM_BYTES = 3;
  struct TestCase {
    char32_t utf32;
    int utf8[NUM_BYTES];
  };
  constexpr TestCase CASES[] = {
      {0xac15, {0xea, 0xb0, 0x95}}, // utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
      {0x267b, {0xe2, 0x99, 0xbb}}, // utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
  };

  for (const TestCase &tc : CASES) {
    // pushing onto an idle converter must succeed; checking it here makes a
    // rejected push fail at the push instead of at a later pop
    ASSERT_EQ(cr.push(tc.utf32), 0);

    for (int i = 0; i < NUM_BYTES; ++i) {
      auto popped = cr.pop_utf8();
      ASSERT_TRUE(popped.has_value());
      ASSERT_EQ(static_cast<int>(popped.value()), tc.utf8[i]);

      // complete only once the final byte has been popped
      const bool is_last = (i + 1 == NUM_BYTES);
      ASSERT_TRUE(cr.isComplete() == is_last);
    }
  }

  // should error if we try to pop another utf8 byte out
  auto extra = cr.pop_utf8();
  ASSERT_FALSE(extra.has_value());
}
|
||||
|
||||
// Code points that need four utf8 bytes: bytes must come out in order and the
// converter must only report completion after the fourth one.
TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  cr.clear();

  constexpr int NUM_BYTES = 4;
  struct TestCase {
    char32_t utf32;
    int utf8[NUM_BYTES];
  };
  constexpr TestCase CASES[] = {
      // utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
      {0x1f921, {0xf0, 0x9f, 0xa4, 0xa1}},
      // utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
      {0x12121, {0xf0, 0x92, 0x84, 0xa1}},
  };

  for (const TestCase &tc : CASES) {
    // pushing onto an idle converter must succeed; checking it here makes a
    // rejected push fail at the push instead of at a later pop
    ASSERT_EQ(cr.push(tc.utf32), 0);

    for (int i = 0; i < NUM_BYTES; ++i) {
      auto popped = cr.pop_utf8();
      ASSERT_TRUE(popped.has_value());
      ASSERT_EQ(static_cast<int>(popped.value()), tc.utf8[i]);

      // complete only once the final byte has been popped
      const bool is_last = (i + 1 == NUM_BYTES);
      ASSERT_TRUE(cr.isComplete() == is_last);
    }
  }

  // should error if we try to pop another utf8 byte out
  auto extra = cr.pop_utf8();
  ASSERT_FALSE(extra.has_value());
}
|
||||
|
||||
// A second push must be rejected while bytes of the previous code point are
// still waiting to be popped.
TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
  using LIBC_NAMESPACE::internal::CharacterConverter;
  using LIBC_NAMESPACE::internal::mbstate;

  mbstate conv_state;
  CharacterConverter converter(&conv_state);
  converter.clear();

  // utf32 0x12121 encodes to four utf8 bytes (0xf0 0x92 0x84 0xa1), so a
  // single pop leaves the conversion unfinished
  char32_t four_byte_cp = 0x12121;
  ASSERT_EQ(converter.push(four_byte_cp), 0);
  auto first_byte = converter.pop_utf8();
  ASSERT_TRUE(first_byte.has_value());

  // pushing again before the remaining utf8 bytes are popped must fail
  int push_result = converter.push(four_byte_cp);
  ASSERT_EQ(push_result, -1);
}
|
||||
Reference in New Issue
Block a user