// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// WARNING, this entire header is generated by
// utils/generate_indic_conjunct_break_table.py
// DO NOT MODIFY!

// UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
//
// See Terms of Use <https://www.unicode.org/copyright.html>
// for definitions of Unicode Inc.'s Data Files and Software.
//
// NOTICE TO USER: Carefully read the following legal agreement.
// BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
// DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
// YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
// TERMS AND CONDITIONS OF THIS AGREEMENT.
// IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
// THE DATA FILES OR SOFTWARE.
//
// COPYRIGHT AND PERMISSION NOTICE
//
// Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
// Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of the Unicode data files and any associated documentation
// (the "Data Files") or Unicode software and any associated documentation
// (the "Software") to deal in the Data Files or Software
// without restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, and/or sell copies of
// the Data Files or Software, and to permit persons to whom the Data Files
// or Software are furnished to do so, provided that either
// (a) this copyright and permission notice appear with all copies
// of the Data Files or Software, or
// (b) this copyright and permission notice appear in associated
// Documentation.
//
// THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
// NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
// DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
// DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THE DATA FILES OR SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder
// shall not be used in advertising or otherwise to promote the sale,
// use or other dealings in these Data Files or Software without prior
// written authorization of the copyright holder.

#ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
|
|
#define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
|
|
|
|
#include <__algorithm/ranges_upper_bound.h>
|
|
#include <__config>
|
|
#include <__cstddef/ptrdiff_t.h>
|
|
#include <__iterator/access.h>
|
|
#include <cstdint>
|
|
|
|
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
|
# pragma GCC system_header
|
|
#endif
|
|
|
|
_LIBCPP_BEGIN_NAMESPACE_STD
|
|
|
|
#if _LIBCPP_STD_VER >= 20
|
|
|
|
namespace __indic_conjunct_break {
|
|
|
|
/// The Indic conjunct break property of a code point.
///
/// The enumerators are declared in the order emitted by the generator, so
/// their implicit values are 0, 1, 2 and 3. \ref __get_property recovers a
/// property from the low two bits of a table entry, so these values must
/// continue to fit in two bits.
enum class __property : uint8_t {
  // Values generated from the data files.
  __Consonant,
  __Extend,
  __Linker,

  // The code point has none of above properties.
  __none
};

/// The entries of the indic conjunct break property table.
|
|
///
|
|
/// The data is generated from
|
|
/// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
|
|
///
|
|
/// The data has 3 values
|
|
/// - bits [0, 1] The property. One of the values generated from the datafiles
|
|
/// of \ref __property
|
|
/// - bits [2, 10] The size of the range.
|
|
/// - bits [11, 31] The lower bound code point of the range. The upper bound of
|
|
/// the range is lower bound + size.
|
|
///
|
|
/// The 9 bits for the size allow a maximum range of 512 elements. Some ranges
|
|
/// in the Unicode tables are larger. They are stored in multiple consecutive
|
|
/// ranges in the data table. An alternative would be to store the sizes in a
|
|
/// separate 16-bit value. The original MSVC STL code had such an approach, but
|
|
/// this approach uses less space for the data and is about 4% faster in the
|
|
/// following benchmark.
|
|
/// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
|
|
// clang-format off
|
|
_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[201] = {
|
|
0x00180139,
|
|
0x001a807d,
|
|
0x00241811,
|
|
0x002c88b1,
|
|
0x002df801,
|
|
0x002e0805,
|
|
0x002e2005,
|
|
0x002e3801,
|
|
0x00308029,
|
|
0x00325851,
|
|
0x00338001,
|
|
0x0036b019,
|
|
0x0036f815,
|
|
0x00373805,
|
|
0x0037500d,
|
|
0x00388801,
|
|
0x00398069,
|
|
0x003f5821,
|
|
0x003fe801,
|
|
0x0040b00d,
|
|
0x0040d821,
|
|
0x00412809,
|
|
0x00414811,
|
|
0x0042c809,
|
|
0x0044c01d,
|
|
0x0046505d,
|
|
0x00471871,
|
|
0x0048a890,
|
|
0x0049e001,
|
|
0x004a6802,
|
|
0x004a880d,
|
|
0x004ac01c,
|
|
0x004bc01c,
|
|
0x004ca84c,
|
|
0x004d5018,
|
|
0x004d9000,
|
|
0x004db00c,
|
|
0x004de001,
|
|
0x004e6802,
|
|
0x004ee004,
|
|
0x004ef800,
|
|
0x004f8004,
|
|
0x004ff001,
|
|
0x0051e001,
|
|
0x0054a84c,
|
|
0x00555018,
|
|
0x00559004,
|
|
0x0055a810,
|
|
0x0055e001,
|
|
0x00566802,
|
|
0x0057c800,
|
|
0x0058a84c,
|
|
0x00595018,
|
|
0x00599004,
|
|
0x0059a810,
|
|
0x0059e001,
|
|
0x005a6802,
|
|
0x005ae004,
|
|
0x005af800,
|
|
0x005b8800,
|
|
0x0060a84c,
|
|
0x0061503c,
|
|
0x0061e001,
|
|
0x00626802,
|
|
0x0062a805,
|
|
0x0062c008,
|
|
0x0065e001,
|
|
0x0068a894,
|
|
0x0069d805,
|
|
0x006a6802,
|
|
0x0071c009,
|
|
0x0072400d,
|
|
0x0075c009,
|
|
0x0076400d,
|
|
0x0078c005,
|
|
0x0079a801,
|
|
0x0079b801,
|
|
0x0079c801,
|
|
0x007b8805,
|
|
0x007ba001,
|
|
0x007bd00d,
|
|
0x007c0001,
|
|
0x007c1009,
|
|
0x007c3005,
|
|
0x007e3001,
|
|
0x0081b801,
|
|
0x0081c805,
|
|
0x00846801,
|
|
0x009ae809,
|
|
0x00b8a001,
|
|
0x00be9001,
|
|
0x00bee801,
|
|
0x00c54801,
|
|
0x00c9c809,
|
|
0x00d0b805,
|
|
0x00d30001,
|
|
0x00d3a81d,
|
|
0x00d3f801,
|
|
0x00d58035,
|
|
0x00d5f83d,
|
|
0x00d9a001,
|
|
0x00db5821,
|
|
0x00dd5801,
|
|
0x00df3001,
|
|
0x00e1b801,
|
|
0x00e68009,
|
|
0x00e6a031,
|
|
0x00e71019,
|
|
0x00e76801,
|
|
0x00e7a001,
|
|
0x00e7c005,
|
|
0x00ee00fd,
|
|
0x01006801,
|
|
0x01068031,
|
|
0x01070801,
|
|
0x0107282d,
|
|
0x01677809,
|
|
0x016bf801,
|
|
0x016f007d,
|
|
0x01815015,
|
|
0x0184c805,
|
|
0x05337801,
|
|
0x0533a025,
|
|
0x0534f005,
|
|
0x05378005,
|
|
0x05416001,
|
|
0x05470045,
|
|
0x05495809,
|
|
0x054d9801,
|
|
0x05558001,
|
|
0x05559009,
|
|
0x0555b805,
|
|
0x0555f005,
|
|
0x05560801,
|
|
0x0557b001,
|
|
0x055f6801,
|
|
0x07d8f001,
|
|
0x07f1003d,
|
|
0x080fe801,
|
|
0x08170001,
|
|
0x081bb011,
|
|
0x08506801,
|
|
0x08507801,
|
|
0x0851c009,
|
|
0x0851f801,
|
|
0x08572805,
|
|
0x0869200d,
|
|
0x08755805,
|
|
0x0877e809,
|
|
0x087a3029,
|
|
0x087c100d,
|
|
0x08838001,
|
|
0x0883f801,
|
|
0x0885d001,
|
|
0x08880009,
|
|
0x08899805,
|
|
0x088b9801,
|
|
0x088e5001,
|
|
0x0891b001,
|
|
0x08974805,
|
|
0x0899d805,
|
|
0x089b3019,
|
|
0x089b8011,
|
|
0x08a23001,
|
|
0x08a2f001,
|
|
0x08a61801,
|
|
0x08ae0001,
|
|
0x08b5b801,
|
|
0x08b95801,
|
|
0x08c1d001,
|
|
0x08c9f001,
|
|
0x08ca1801,
|
|
0x08d1a001,
|
|
0x08d23801,
|
|
0x08d4c801,
|
|
0x08ea1001,
|
|
0x08ea2005,
|
|
0x08ecb801,
|
|
0x08fa1001,
|
|
0x0b578011,
|
|
0x0b598019,
|
|
0x0de4f001,
|
|
0x0e8b2801,
|
|
0x0e8b3809,
|
|
0x0e8b7011,
|
|
0x0e8bd81d,
|
|
0x0e8c2819,
|
|
0x0e8d500d,
|
|
0x0e921009,
|
|
0x0f000019,
|
|
0x0f004041,
|
|
0x0f00d819,
|
|
0x0f011805,
|
|
0x0f013011,
|
|
0x0f047801,
|
|
0x0f098019,
|
|
0x0f157001,
|
|
0x0f17600d,
|
|
0x0f27600d,
|
|
0x0f468019,
|
|
0x0f4a2019};
|
|
// clang-format on
|
|
|
|
/// Returns the indic conjuct break property of a code point.
|
|
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {
|
|
// The algorithm searches for the upper bound of the range and, when found,
|
|
// steps back one entry. This algorithm is used since the code point can be
|
|
// anywhere in the range. After a lower bound is found the next step is to
|
|
// compare whether the code unit is indeed in the range.
|
|
//
|
|
// Since the entry contains a code unit, size, and property the code point
|
|
// being sought needs to be adjusted. Just shifting the code point to the
|
|
// proper position doesn't work; suppose an entry has property 0, size 1,
|
|
// and lower bound 3. This results in the entry 0x1810.
|
|
// When searching for code point 3 it will search for 0x1800, find 0x1810
|
|
// and moves to the previous entry. Thus the lower bound value will never
|
|
// be found.
|
|
// The simple solution is to set the bits belonging to the property and
|
|
// size. Then the upper bound for code point 3 will return the entry after
|
|
// 0x1810. After moving to the previous entry the algorithm arrives at the
|
|
// correct entry.
|
|
ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
|
|
if (__i == 0)
|
|
return __property::__none;
|
|
|
|
--__i;
|
|
uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 2) & 0b1'1111'1111);
|
|
if (__code_point <= __upper_bound)
|
|
return static_cast<__property>(__entries[__i] & 0b11);
|
|
|
|
return __property::__none;
|
|
}
|
|
|
|
} // namespace __indic_conjunct_break
|
|
|
|
#endif // _LIBCPP_STD_VER >= 20
|
|
|
|
_LIBCPP_END_NAMESPACE_STD
|
|
|
|
#endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
|