From 10d46cf0d57ad388c02e6197f52ada3cc4a95635 Mon Sep 17 00:00:00 2001
From: sribee8
Date: Mon, 23 Jun 2025 16:25:13 -0700
Subject: [PATCH] [libc] mbtowc implementation (#145405)

Implemented mbtowc and tests for the function.

---------

Co-authored-by: Sriya Pratipati
---
 libc/config/linux/x86_64/entrypoints.txt |   1 +
 libc/include/wchar.yaml                  |   8 ++
 libc/src/wchar/CMakeLists.txt            |  16 +++
 libc/src/wchar/mbtowc.cpp                |  40 ++++++
 libc/src/wchar/mbtowc.h                  |  22 ++++
 libc/test/src/wchar/CMakeLists.txt       |  13 ++
 libc/test/src/wchar/mbtowc_test.cpp      | 154 +++++++++++++++++++++++
 7 files changed, 254 insertions(+)
 create mode 100644 libc/src/wchar/mbtowc.cpp
 create mode 100644 libc/src/wchar/mbtowc.h
 create mode 100644 libc/test/src/wchar/mbtowc_test.cpp

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index e1a2a26479de..f0e17d6a2544 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1248,6 +1248,7 @@ if(LLVM_LIBC_FULL_BUILD)
 
     # wchar.h entrypoints
     libc.src.wchar.mbrtowc
+    libc.src.wchar.mbtowc
     libc.src.wchar.wcrtomb
   )
 endif()
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 0a6a75ebbbf9..d5044e1728f8 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -38,6 +38,14 @@ functions:
       - type: const char *__restrict
       - type: size_t
       - type: mbstate_t *__restrict
+  - name: mbtowc
+    standards:
+      - stdc
+    return_type: int
+    arguments:
+      - type: wchar_t *__restrict
+      - type: const char *__restrict
+      - type: size_t
   - name: wmemset
     standards:
       - stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 476cf38f4662..983e4269c57e 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -65,6 +65,22 @@ add_entrypoint_object(
     libc.src.__support.wchar.mbstate
 )
 
+add_entrypoint_object(
+  mbtowc
+  SRCS
+    mbtowc.cpp
+  HDRS
+    mbtowc.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.types.wchar_t
+    libc.src.__support.common
+    libc.src.__support.macros.config
+    libc.src.__support.libc_errno
+    libc.src.__support.wchar.mbrtowc
+    libc.src.__support.wchar.mbstate
+)
+
 add_entrypoint_object(
   wmemset
   SRCS
diff --git a/libc/src/wchar/mbtowc.cpp b/libc/src/wchar/mbtowc.cpp
new file mode 100644
index 000000000000..eae39ba6081f
--- /dev/null
+++ b/libc/src/wchar/mbtowc.cpp
@@ -0,0 +1,40 @@
+//===-- Implementation of mbtowc -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbtowc.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, mbtowc,
+                   (wchar_t *__restrict pwc, const char *__restrict s,
+                    size_t n)) {
+  // returns 0 since UTF-8 encoding is not state-dependent
+  if (s == nullptr)
+    return 0;
+  internal::mbstate internal_mbstate;
+  // temp ptr to use if pwc is nullptr
+  wchar_t buf[1];
+  auto ret =
+      internal::mbrtowc(pwc == nullptr ? buf : pwc, s, n, &internal_mbstate);
+  if (!ret.has_value() || static_cast<int>(ret.value()) == -2) {
+    // Encoding failure
+    libc_errno = EILSEQ;
+    return -1;
+  }
+  return static_cast<int>(ret.value());
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbtowc.h b/libc/src/wchar/mbtowc.h
new file mode 100644
index 000000000000..f974197f81b5
--- /dev/null
+++ b/libc/src/wchar/mbtowc.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for mbtowc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBTOWC_H
+#define LLVM_LIBC_SRC_WCHAR_MBTOWC_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int mbtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBTOWC_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index c932f3632c7a..ac5869380597 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -37,6 +37,19 @@ add_libc_test(
     libc.hdr.types.wchar_t
 )
 
+add_libc_test(
+  mbtowc_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    mbtowc_test.cpp
+  DEPENDS
+    libc.src.__support.libc_errno
+    libc.src.wchar.mbtowc
+    libc.hdr.types.wchar_t
+    libc.test.UnitTest.ErrnoCheckingTest
+)
+
 add_libc_test(
   wctob_test
   SUITE
diff --git a/libc/test/src/wchar/mbtowc_test.cpp b/libc/test/src/wchar/mbtowc_test.cpp
new file mode 100644
index 000000000000..b27b05cbd899
--- /dev/null
+++ b/libc/test/src/wchar/mbtowc_test.cpp
@@ -0,0 +1,154 @@
+//===-- Unittests for mbtowc ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/wchar/mbtowc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcMBToWCTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcMBToWCTest, OneByte) {
+  const char *ch = "A";
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(static_cast<char>(*dest), 'A');
+  ASSERT_EQ(n, 1);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 0);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBToWCTest, TwoByte) {
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; // Ž car symbol
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  ASSERT_EQ(n, 2);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(n, -1);
+  // Should fail after trying to read next byte too
+  n = LIBC_NAMESPACE::mbtowc(dest, ch + 1, 1);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBToWCTest, ThreeByte) {
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 3);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+  ASSERT_EQ(n, 3);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBToWCTest, FourByte) {
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 4);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+  ASSERT_EQ(n, 4);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBToWCTest, InvalidByte) {
+  const char ch[1] = {static_cast<char>(0x80)};
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBToWCTest, InvalidMultiByte) {
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  wchar_t dest[2];
+  // Trying to push all 4 should error
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 4);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+
+  // Trying to push the second and third should correspond to null wc
+  n = LIBC_NAMESPACE::mbtowc(dest, ch + 1, 2);
+  ASSERT_EQ(n, 0);
+  ASSERT_TRUE(*dest == L'\0');
+}
+
+TEST_F(LlvmLibcMBToWCTest, InvalidLastByte) {
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+  wchar_t dest[2];
+  // Trying to push all 4 should error
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 4);
+  ASSERT_EQ(n, -1);
+  ASSERT_ERRNO_EQ(EILSEQ);
+}
+
+TEST_F(LlvmLibcMBToWCTest, ValidTwoByteWithExtraRead) {
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+  wchar_t dest[2];
+  // Trying to push all 3 should return valid 2 byte
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 3);
+  ASSERT_EQ(n, 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+TEST_F(LlvmLibcMBToWCTest, TwoValidTwoBytes) {
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  wchar_t dest[2];
+  int n = LIBC_NAMESPACE::mbtowc(dest, ch, 2);
+  ASSERT_EQ(n, 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  n = LIBC_NAMESPACE::mbtowc(dest + 1, ch + 2, 2);
+  ASSERT_EQ(n, 2);
+  ASSERT_EQ(static_cast<int>(*(dest + 1)), 460);
+}
+
+TEST_F(LlvmLibcMBToWCTest, NullString) {
+  wchar_t dest[2] = {L'O', L'K'};
+  // reading on nullptr should return 0
+  int n = LIBC_NAMESPACE::mbtowc(dest, nullptr, 2);
+  ASSERT_EQ(n, 0);
+  ASSERT_TRUE(dest[0] == L'O');
+  // reading a null terminator should return 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mbtowc(dest, ch, 1);
+  ASSERT_EQ(n, 0);
+}
+
+TEST_F(LlvmLibcMBToWCTest, NullWCPtr) {
+  const char ch[2] = {
+      static_cast<char>(0xC2),
+      static_cast<char>(0x8E),
+  };
+  // a null destination should still return the number of read bytes
+  int n = LIBC_NAMESPACE::mbtowc(nullptr, ch, 2);
+  ASSERT_EQ(n, 2);
+}