/src/hermes/include/hermes/Platform/Unicode/CharacterProperties.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #ifndef HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H |
9 | | #define HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H |
10 | | |
11 | | #include <cassert> |
12 | | #include <cstdint> |
13 | | #include <string> |
14 | | |
15 | | #include "llvh/ADT/ArrayRef.h" |
16 | | |
17 | | namespace hermes { |
18 | | |
/// Largest code point accepted by isValidCodePoint().
constexpr uint32_t UNICODE_MAX_VALUE = 0x10FFFF;
/// First code point of the surrogate range.
constexpr uint32_t UNICODE_SURROGATE_FIRST = 0xD800;
/// Final code point of the surrogate range (inclusive).
constexpr uint32_t UNICODE_SURROGATE_LAST = 0xDFFF;
/// First high (lead) surrogate; high surrogates end just before
/// UTF16_LOW_SURROGATE.
constexpr uint32_t UTF16_HIGH_SURROGATE = 0xD800;
/// First low (trail) surrogate; low surrogates run through
/// UNICODE_SURROGATE_LAST.
constexpr uint32_t UTF16_LOW_SURROGATE = 0xDC00;
/// U+FFFD REPLACEMENT CHARACTER.
constexpr uint32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
/// Final code point of the BMP.
constexpr uint32_t UNICODE_LAST_BMP = 0xFFFF;

/// U+2028 LINE SEPARATOR.
constexpr uint32_t UNICODE_LINE_SEPARATOR = 0x2028;
/// U+2029 PARAGRAPH SEPARATOR.
constexpr uint32_t UNICODE_PARAGRAPH_SEPARATOR = 0x2029;

/// U+200C ZERO WIDTH NON-JOINER.
constexpr uint32_t UNICODE_ZWNJ = 0x200C;
/// U+200D ZERO WIDTH JOINER.
constexpr uint32_t UNICODE_ZWJ = 0x200D;

/// Upper bound on how many precanonicalizations a single character can have.
/// "Precanonicalization" is not Unicode terminology; it refers to the RegExp
/// Canonicalize function given in ES5.1 15.10.2.8. Most characters are
/// canonicalized to only by themselves or by their lowercase variant; the
/// handful of exceptions are tracked here.
constexpr uint32_t UNICODE_MAX_PRECANONICALIZATIONS = 3;
42 | | |
43 | 0 | inline bool isValidCodePoint(uint32_t cp) { |
44 | 0 | return !( |
45 | 0 | (cp >= UNICODE_SURROGATE_FIRST && cp <= UNICODE_SURROGATE_LAST) || |
46 | 0 | cp > UNICODE_MAX_VALUE); |
47 | 0 | } |
48 | | |
49 | | /// \return whether \p cp is part of the Basic Multilingual Plane. |
50 | | /// Surrogate characters are considered part of the BMP. |
51 | 156 | inline bool isMemberOfBMP(uint32_t cp) { |
52 | 156 | return cp <= UNICODE_LAST_BMP; |
53 | 156 | } |
54 | | |
55 | | /// \return whether cp is a high surrogate. |
56 | 181 | inline bool isHighSurrogate(uint32_t cp) { |
57 | 181 | return UNICODE_SURROGATE_FIRST <= cp && cp < UTF16_LOW_SURROGATE; |
58 | 181 | } |
59 | | |
60 | | /// \return whether cp is a low surrogate. |
61 | 195 | inline bool isLowSurrogate(uint32_t cp) { |
62 | 195 | return UTF16_LOW_SURROGATE <= cp && cp <= UNICODE_SURROGATE_LAST; |
63 | 195 | } |
64 | | |
65 | | //===----------------------------------------------------------------------===// |
66 | | // ES14 11.1.3 |
67 | | |
68 | | /// Decode a surrogate pair [\p lead, \p trail] into a code point. |
69 | 14 | inline uint32_t utf16SurrogatePairToCodePoint(uint32_t lead, uint32_t trail) { |
70 | 14 | assert( |
71 | 14 | isHighSurrogate(lead) && isLowSurrogate(trail) && "Not a surrogate pair"); |
72 | 14 | return ((lead - UTF16_HIGH_SURROGATE) << 10) + (trail - UTF16_LOW_SURROGATE) + |
73 | 14 | 0x10000; |
74 | 14 | } |
75 | | |
76 | | /// \return true if \p cp is not ASCII and is a Unicode letter. |
77 | | bool isUnicodeOnlyLetter(uint32_t cp); |
78 | | /// \return true if \p cp is not ASCII and is a Unicode space. |
79 | | bool isUnicodeOnlySpace(uint32_t cp); |
80 | | /// \return true if \p cp is in the Non-Spacing Mark (Mn) or |
81 | | /// Combining-Spacing Mark (Mc) categories. |
82 | | bool isUnicodeCombiningMark(uint32_t cp); |
83 | | /// \return true if \p cp is in the Decimal Number (Nd) category. |
84 | | bool isUnicodeDigit(uint32_t cp); |
85 | | /// \return true if \p cp is in the Connector Punctuation (Pc) category. |
86 | | bool isUnicodeConnectorPunctuation(uint32_t cp); |
87 | | |
/// \return true if \p ch is an ASCII character that can start an identifier:
/// '_', '$', or a letter in [A-Za-z].
inline bool isASCIIIdentifierStart(uint32_t ch) {
  if (ch == '_' || ch == '$') {
    return true;
  }
  // Setting bit 5 folds 'A'..'Z' onto 'a'..'z', so one range test covers both
  // cases.
  const uint32_t folded = ch | 32;
  return 'a' <= folded && folded <= 'z';
}
92 | | |
93 | | /// \return true if the codepoint has the ID_Start property. |
94 | 13.3k | inline bool isUnicodeIDStart(uint32_t cp) { |
95 | 13.3k | return isASCIIIdentifierStart(cp) || isUnicodeOnlyLetter(cp); |
96 | 13.3k | } |
97 | | |
98 | | /// \return true if the codepoint has the ID_Continue property. |
99 | 13.3k | inline bool isUnicodeIDContinue(uint32_t cp) { |
100 | | // TODO: clearly this has to be optimized somehow |
101 | 13.3k | return isUnicodeIDStart(cp) || isUnicodeCombiningMark(cp) || |
102 | 13.3k | isUnicodeDigit(cp) || isUnicodeConnectorPunctuation(cp) || |
103 | 13.3k | cp == UNICODE_ZWNJ || cp == UNICODE_ZWJ; |
104 | 13.3k | } |
105 | | |
/// \return true if \p ch may appear in a Unicode property name, i.e. it is '_'
/// or an ASCII letter.
inline bool isUnicodePropertyName(uint32_t ch) {
  // OR-ing in bit 5 maps 'A'..'Z' onto 'a'..'z', so a single range test
  // accepts both cases.
  return ch == '_' || ((ch | 32) >= 'a' && (ch | 32) <= 'z');
}

/// \return true if \p ch may appear in a Unicode property value, i.e. it is a
/// valid property-name character or an ASCII decimal digit.
inline bool isUnicodePropertyValue(uint32_t ch) {
  // The ECMAScript grammar admits only DecimalDigit ('0'..'9') here. Do not
  // use isUnicodeDigit(): it matches the whole Decimal Number category and
  // would let non-ASCII digits such as U+0660 through.
  return isUnicodePropertyName(ch) || (ch >= '0' && ch <= '9');
}
115 | | |
116 | | /// \return the canonicalized value of \p cp, following ES9 21.2.2.8.2. NOTE(review): \p unicode presumably selects the spec's unicode-mode branch; confirm against the definition. |
117 | | uint32_t canonicalize(uint32_t cp, bool unicode); |
118 | | |
119 | | class CodePointSet; |
120 | | /// \return a set containing all characters which are canonically equivalent to |
121 | | /// any character in \p set, following ES9 21.2.2.8.2. NOTE(review): \p unicode is presumably interpreted as in canonicalize(); confirm. |
122 | | CodePointSet makeCanonicallyEquivalent(const CodePointSet &set, bool unicode); |
123 | | |
124 | | struct UnicodeRangePoolRef; |
125 | | |
126 | | /// Create a codepoint range array from a Unicode \p propertyName and \p |
127 | | /// propertyValue. |
128 | | llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges( |
129 | | std::string_view propertyName, |
130 | | std::string_view propertyValue); |
131 | | |
132 | | /// Add a codepoint range array of codepoints to \p receiver, typically used in |
133 | | /// conjunction with unicodePropertyRanges. |
134 | | void addRangeArrayPoolToBracket( |
135 | | CodePointSet *receiver, |
136 | | const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool, |
137 | | bool inverted); |
138 | | |
139 | | } // namespace hermes |
140 | | |
141 | | #endif // HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H |