/src/mozilla-central/intl/unicharutil/util/nsUnicodeProperties.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* vim:set ts=4 sw=4 sts=4 et cindent: */ |
3 | | /* This Source Code Form is subject to the terms of the Mozilla Public |
4 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
5 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
6 | | |
7 | | #include "nsUnicodeProperties.h" |
8 | | #include "nsUnicodePropertyData.cpp" |
9 | | |
10 | | #include "mozilla/ArrayUtils.h" |
11 | | #include "nsCharTraits.h" |
12 | | |
// First code point beyond the Basic Multilingual Plane (plane 0).
#define UNICODE_BMP_LIMIT 0x10000
// First value beyond the last Unicode code point (U+10FFFF).
#define UNICODE_LIMIT     0x110000
15 | | |
16 | | const nsCharProps2& |
17 | | GetCharProps2(uint32_t aCh) |
18 | 88.5k | { |
19 | 88.5k | if (aCh < UNICODE_BMP_LIMIT) { |
20 | 88.5k | return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]] |
21 | 88.5k | [aCh & ((1 << kCharProp2CharBits) - 1)]; |
22 | 88.5k | } |
23 | 34 | if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) { |
24 | 34 | return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]] |
25 | 34 | [(aCh & 0xffff) >> kCharProp2CharBits]] |
26 | 34 | [aCh & ((1 << kCharProp2CharBits) - 1)]; |
27 | 34 | } |
28 | 0 | |
29 | 0 | MOZ_ASSERT_UNREACHABLE("Getting CharProps for codepoint outside Unicode " |
30 | 0 | "range"); |
31 | 0 |
|
32 | 0 | // Default values for unassigned |
33 | 0 | using namespace mozilla::unicode; |
34 | 0 | static const nsCharProps2 undefined = { |
35 | 0 | VERTICAL_ORIENTATION_R, |
36 | 0 | 0 // IdentifierType |
37 | 0 | }; |
38 | 0 | return undefined; |
39 | 0 | } |
40 | | |
41 | | namespace mozilla { |
42 | | |
43 | | namespace unicode { |
44 | | |
45 | | /* |
46 | | To store properties for a million Unicode codepoints compactly, we use |
47 | | a three-level array structure, with the Unicode values considered as |
48 | | three elements: Plane, Page, and Char. |
49 | | |
50 | | Space optimization happens because multiple Planes can refer to the same |
51 | | Page array, and multiple Pages can refer to the same Char array holding |
52 | | the actual values. In practice, most of the higher planes are empty and |
53 | | thus share the same data; and within the BMP, there are also many pages |
54 | | that repeat the same data for any given property. |
55 | | |
56 | | Plane is usually zero, so we skip a lookup in this case, and require |
57 | | that the Plane 0 pages are always the first set of entries in the Page |
58 | | array. |
59 | | |
60 | | The division of the remaining 16 bits into Page and Char fields is |
61 | | adjusted for each property (by experiment using the generation tool) |
62 | | to provide the most compact storage, depending on the distribution |
63 | | of values. |
64 | | */ |
65 | | |
66 | | const nsUGenCategory sDetailedToGeneralCategory[] = { |
67 | | /* |
68 | | * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants |
69 | | * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h. |
70 | | */ |
71 | | /* CONTROL */ nsUGenCategory::kOther, |
72 | | /* FORMAT */ nsUGenCategory::kOther, |
73 | | /* UNASSIGNED */ nsUGenCategory::kOther, |
74 | | /* PRIVATE_USE */ nsUGenCategory::kOther, |
75 | | /* SURROGATE */ nsUGenCategory::kOther, |
76 | | /* LOWERCASE_LETTER */ nsUGenCategory::kLetter, |
77 | | /* MODIFIER_LETTER */ nsUGenCategory::kLetter, |
78 | | /* OTHER_LETTER */ nsUGenCategory::kLetter, |
79 | | /* TITLECASE_LETTER */ nsUGenCategory::kLetter, |
80 | | /* UPPERCASE_LETTER */ nsUGenCategory::kLetter, |
81 | | /* COMBINING_MARK */ nsUGenCategory::kMark, |
82 | | /* ENCLOSING_MARK */ nsUGenCategory::kMark, |
83 | | /* NON_SPACING_MARK */ nsUGenCategory::kMark, |
84 | | /* DECIMAL_NUMBER */ nsUGenCategory::kNumber, |
85 | | /* LETTER_NUMBER */ nsUGenCategory::kNumber, |
86 | | /* OTHER_NUMBER */ nsUGenCategory::kNumber, |
87 | | /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation, |
88 | | /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation, |
89 | | /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation, |
90 | | /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation, |
91 | | /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation, |
92 | | /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation, |
93 | | /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation, |
94 | | /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol, |
95 | | /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol, |
96 | | /* MATH_SYMBOL */ nsUGenCategory::kSymbol, |
97 | | /* OTHER_SYMBOL */ nsUGenCategory::kSymbol, |
98 | | /* LINE_SEPARATOR */ nsUGenCategory::kSeparator, |
99 | | /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator, |
100 | | /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator |
101 | | }; |
102 | | |
103 | | const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = { |
104 | | HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0, |
105 | | HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1, |
106 | | HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2, |
107 | | HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3, |
108 | | HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4, |
109 | | HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5, |
110 | | HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6, |
111 | | HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7, |
112 | | HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8, |
113 | | HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9, |
114 | | HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10, |
115 | | HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11, |
116 | | HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12, |
117 | | HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13, |
118 | | HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14, |
119 | | HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15, |
120 | | HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16, |
121 | | HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17, |
122 | | HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18, |
123 | | HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19, |
124 | | HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20, |
125 | | HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21, |
126 | | HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22, |
127 | | HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23, |
128 | | HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24, |
129 | | HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25, |
130 | | HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26, |
131 | | HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27, |
132 | | HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28, |
133 | | HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29, |
134 | | }; |
135 | | |
136 | | #define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \ |
137 | | uint32_t Get##prefix_(uint32_t aCh) \ |
138 | 0 | { \ |
139 | 0 | if (aCh >= UNICODE_BMP_LIMIT) { \ |
140 | 0 | return aCh; \ |
141 | 0 | } \ |
142 | 0 | auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \ |
143 | 0 | auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \ |
144 | 0 | uint32_t v = s##prefix_##Values[page][index]; \ |
145 | 0 | return v ? v : aCh; \ |
146 | 0 | } Unexecuted instantiation: mozilla::unicode::GetFullWidth(unsigned int) Unexecuted instantiation: mozilla::unicode::GetFullWidthInverse(unsigned int) |
147 | | |
148 | | // full-width mappings only exist for BMP characters; all others are |
149 | | // returned unchanged |
150 | | DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth) |
151 | | DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse) |
152 | | |
153 | | bool |
154 | | IsClusterExtender(uint32_t aCh, uint8_t aCategory) |
155 | 0 | { |
156 | 0 | return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && |
157 | 0 | aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || |
158 | 0 | (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ |
159 | 0 | (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks |
160 | 0 | (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers |
161 | 0 | (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters |
162 | 0 | } |
163 | | |
164 | | enum HSType { |
165 | | HST_NONE = U_HST_NOT_APPLICABLE, |
166 | | HST_L = U_HST_LEADING_JAMO, |
167 | | HST_V = U_HST_VOWEL_JAMO, |
168 | | HST_T = U_HST_TRAILING_JAMO, |
169 | | HST_LV = U_HST_LV_SYLLABLE, |
170 | | HST_LVT = U_HST_LVT_SYLLABLE |
171 | | }; |
172 | | |
173 | | static HSType |
174 | | GetHangulSyllableType(uint32_t aCh) |
175 | 0 | { |
176 | 0 | return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE)); |
177 | 0 | } |
178 | | |
179 | | void |
180 | | ClusterIterator::Next() |
181 | 0 | { |
182 | 0 | if (AtEnd()) { |
183 | 0 | NS_WARNING("ClusterIterator has already reached the end"); |
184 | 0 | return; |
185 | 0 | } |
186 | 0 |
|
187 | 0 | uint32_t ch = *mPos++; |
188 | 0 |
|
189 | 0 | if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit && |
190 | 0 | NS_IS_LOW_SURROGATE(*mPos)) { |
191 | 0 | ch = SURROGATE_TO_UCS4(ch, *mPos++); |
192 | 0 | } else if ((ch & ~0xff) == 0x1100 || |
193 | 0 | (ch >= 0xa960 && ch <= 0xa97f) || |
194 | 0 | (ch >= 0xac00 && ch <= 0xd7ff)) { |
195 | 0 | // Handle conjoining Jamo that make Hangul syllables |
196 | 0 | HSType hangulState = GetHangulSyllableType(ch); |
197 | 0 | while (mPos < mLimit) { |
198 | 0 | ch = *mPos; |
199 | 0 | HSType hangulType = GetHangulSyllableType(ch); |
200 | 0 | switch (hangulType) { |
201 | 0 | case HST_L: |
202 | 0 | case HST_LV: |
203 | 0 | case HST_LVT: |
204 | 0 | if (hangulState == HST_L) { |
205 | 0 | hangulState = hangulType; |
206 | 0 | mPos++; |
207 | 0 | continue; |
208 | 0 | } |
209 | 0 | break; |
210 | 0 | case HST_V: |
211 | 0 | if ((hangulState != HST_NONE) && (hangulState != HST_T) && |
212 | 0 | (hangulState != HST_LVT)) { |
213 | 0 | hangulState = hangulType; |
214 | 0 | mPos++; |
215 | 0 | continue; |
216 | 0 | } |
217 | 0 | break; |
218 | 0 | case HST_T: |
219 | 0 | if (hangulState != HST_NONE && hangulState != HST_L) { |
220 | 0 | hangulState = hangulType; |
221 | 0 | mPos++; |
222 | 0 | continue; |
223 | 0 | } |
224 | 0 | break; |
225 | 0 | default: |
226 | 0 | break; |
227 | 0 | } |
228 | 0 | break; |
229 | 0 | } |
230 | 0 | } |
231 | 0 |
|
232 | 0 | while (mPos < mLimit) { |
233 | 0 | ch = *mPos; |
234 | 0 |
|
235 | 0 | // Check for surrogate pairs; note that isolated surrogates will just |
236 | 0 | // be treated as generic (non-cluster-extending) characters here, |
237 | 0 | // which is fine for cluster-iterating purposes |
238 | 0 | if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 && |
239 | 0 | NS_IS_LOW_SURROGATE(*(mPos + 1))) { |
240 | 0 | ch = SURROGATE_TO_UCS4(ch, *(mPos + 1)); |
241 | 0 | } |
242 | 0 |
|
243 | 0 | if (!IsClusterExtender(ch)) { |
244 | 0 | break; |
245 | 0 | } |
246 | 0 | |
247 | 0 | mPos++; |
248 | 0 | if (!IS_IN_BMP(ch)) { |
249 | 0 | mPos++; |
250 | 0 | } |
251 | 0 | } |
252 | 0 |
|
253 | 0 | NS_ASSERTION(mText < mPos && mPos <= mLimit, |
254 | 0 | "ClusterIterator::Next has overshot the string!"); |
255 | 0 | } |
256 | | |
257 | | void |
258 | | ClusterReverseIterator::Next() |
259 | 0 | { |
260 | 0 | if (AtEnd()) { |
261 | 0 | NS_WARNING("ClusterReverseIterator has already reached the end"); |
262 | 0 | return; |
263 | 0 | } |
264 | 0 |
|
265 | 0 | uint32_t ch; |
266 | 0 | do { |
267 | 0 | ch = *--mPos; |
268 | 0 |
|
269 | 0 | if (NS_IS_LOW_SURROGATE(ch) && mPos > mLimit && |
270 | 0 | NS_IS_HIGH_SURROGATE(*(mPos - 1))) { |
271 | 0 | ch = SURROGATE_TO_UCS4(*--mPos, ch); |
272 | 0 | } |
273 | 0 |
|
274 | 0 | if (!IsClusterExtender(ch)) { |
275 | 0 | break; |
276 | 0 | } |
277 | 0 | } while (mPos > mLimit); |
278 | 0 |
|
279 | 0 | // XXX May need to handle conjoining Jamo |
280 | 0 |
|
281 | 0 | NS_ASSERTION(mPos >= mLimit, |
282 | 0 | "ClusterReverseIterator::Next has overshot the string!"); |
283 | 0 | } |
284 | | |
285 | | uint32_t |
286 | | CountGraphemeClusters(const char16_t* aText, uint32_t aLength) |
287 | 0 | { |
288 | 0 | ClusterIterator iter(aText, aLength); |
289 | 0 | uint32_t result = 0; |
290 | 0 | while (!iter.AtEnd()) { |
291 | 0 | ++result; |
292 | 0 | iter.Next(); |
293 | 0 | } |
294 | 0 | return result; |
295 | 0 | } |
296 | | |
297 | | } // end namespace unicode |
298 | | |
299 | | } // end namespace mozilla |