/src/icu/icu4c/source/common/uprops.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2002-2016, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: uprops.cpp |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2002feb24 |
16 | | * created by: Markus W. Scherer |
17 | | * |
18 | | * Implementations for mostly non-core Unicode character properties |
19 | | * stored in uprops.icu. |
20 | | * |
21 | | * With the APIs implemented here, almost all properties files and |
22 | | * their associated implementation files are used from this file, |
23 | | * including those for normalization and case mappings. |
24 | | */ |
25 | | |
26 | | #include "unicode/utypes.h" |
27 | | #include "unicode/uchar.h" |
28 | | #include "unicode/ucptrie.h" |
29 | | #include "unicode/udata.h" |
30 | | #include "unicode/unorm2.h" |
31 | | #include "unicode/uscript.h" |
32 | | #include "unicode/ustring.h" |
33 | | #include "unicode/utf16.h" |
34 | | #include "cstring.h" |
35 | | #include "emojiprops.h" |
36 | | #include "mutex.h" |
37 | | #include "normalizer2impl.h" |
38 | | #include "umutex.h" |
39 | | #include "ubidi_props.h" |
40 | | #include "uprops.h" |
41 | | #include "ucase.h" |
42 | | #include "ucln_cmn.h" |
43 | | #include "ulayout_props.h" |
44 | | #include "ustr_imp.h" |
45 | | |
46 | | U_NAMESPACE_USE |
47 | | |
48 | | // Unicode text layout properties data ----------------------------------------- |
49 | | |
50 | | namespace { |
51 | | |
52 | | icu::UInitOnce gLayoutInitOnce {}; |
53 | | UDataMemory *gLayoutMemory = nullptr; |
54 | | |
55 | | UCPTrie *gInpcTrie = nullptr; // Indic_Positional_Category |
56 | | UCPTrie *gInscTrie = nullptr; // Indic_Syllabic_Category |
57 | | UCPTrie *gVoTrie = nullptr; // Vertical_Orientation |
58 | | |
59 | | int32_t gMaxInpcValue = 0; |
60 | | int32_t gMaxInscValue = 0; |
61 | | int32_t gMaxVoValue = 0; |
62 | | |
63 | 0 | UBool U_CALLCONV uprops_cleanup() { |
64 | 0 | udata_close(gLayoutMemory); |
65 | 0 | gLayoutMemory = nullptr; |
66 | |
|
67 | 0 | ucptrie_close(gInpcTrie); |
68 | 0 | gInpcTrie = nullptr; |
69 | 0 | ucptrie_close(gInscTrie); |
70 | 0 | gInscTrie = nullptr; |
71 | 0 | ucptrie_close(gVoTrie); |
72 | 0 | gVoTrie = nullptr; |
73 | |
|
74 | 0 | gMaxInpcValue = 0; |
75 | 0 | gMaxInscValue = 0; |
76 | 0 | gMaxVoValue = 0; |
77 | |
|
78 | 0 | gLayoutInitOnce.reset(); |
79 | 0 | return true; |
80 | 0 | } |
81 | | |
82 | | UBool U_CALLCONV |
83 | | ulayout_isAcceptable(void * /*context*/, |
84 | | const char * /* type */, const char * /*name*/, |
85 | 0 | const UDataInfo *pInfo) { |
86 | 0 | return pInfo->size >= 20 && |
87 | 0 | pInfo->isBigEndian == U_IS_BIG_ENDIAN && |
88 | 0 | pInfo->charsetFamily == U_CHARSET_FAMILY && |
89 | 0 | pInfo->dataFormat[0] == ULAYOUT_FMT_0 && |
90 | 0 | pInfo->dataFormat[1] == ULAYOUT_FMT_1 && |
91 | 0 | pInfo->dataFormat[2] == ULAYOUT_FMT_2 && |
92 | 0 | pInfo->dataFormat[3] == ULAYOUT_FMT_3 && |
93 | 0 | pInfo->formatVersion[0] == 1; |
94 | 0 | } |
95 | | |
96 | | // UInitOnce singleton initialization function |
97 | 0 | void U_CALLCONV ulayout_load(UErrorCode &errorCode) { |
98 | 0 | gLayoutMemory = udata_openChoice( |
99 | 0 | nullptr, ULAYOUT_DATA_TYPE, ULAYOUT_DATA_NAME, |
100 | 0 | ulayout_isAcceptable, nullptr, &errorCode); |
101 | 0 | if (U_FAILURE(errorCode)) { return; } |
102 | | |
103 | 0 | const uint8_t* inBytes = static_cast<const uint8_t*>(udata_getMemory(gLayoutMemory)); |
104 | 0 | const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes); |
105 | 0 | int32_t indexesLength = inIndexes[ULAYOUT_IX_INDEXES_LENGTH]; |
106 | 0 | if (indexesLength < 12) { |
107 | 0 | errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. |
108 | 0 | return; |
109 | 0 | } |
110 | 0 | int32_t offset = indexesLength * 4; |
111 | 0 | int32_t top = inIndexes[ULAYOUT_IX_INPC_TRIE_TOP]; |
112 | 0 | int32_t trieSize = top - offset; |
113 | 0 | if (trieSize >= 16) { |
114 | 0 | gInpcTrie = ucptrie_openFromBinary( |
115 | 0 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, |
116 | 0 | inBytes + offset, trieSize, nullptr, &errorCode); |
117 | 0 | } |
118 | 0 | offset = top; |
119 | 0 | top = inIndexes[ULAYOUT_IX_INSC_TRIE_TOP]; |
120 | 0 | trieSize = top - offset; |
121 | 0 | if (trieSize >= 16) { |
122 | 0 | gInscTrie = ucptrie_openFromBinary( |
123 | 0 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, |
124 | 0 | inBytes + offset, trieSize, nullptr, &errorCode); |
125 | 0 | } |
126 | 0 | offset = top; |
127 | 0 | top = inIndexes[ULAYOUT_IX_VO_TRIE_TOP]; |
128 | 0 | trieSize = top - offset; |
129 | 0 | if (trieSize >= 16) { |
130 | 0 | gVoTrie = ucptrie_openFromBinary( |
131 | 0 | UCPTRIE_TYPE_ANY, UCPTRIE_VALUE_BITS_ANY, |
132 | 0 | inBytes + offset, trieSize, nullptr, &errorCode); |
133 | 0 | } |
134 | |
|
135 | 0 | uint32_t maxValues = inIndexes[ULAYOUT_IX_MAX_VALUES]; |
136 | 0 | gMaxInpcValue = maxValues >> ULAYOUT_MAX_INPC_SHIFT; |
137 | 0 | gMaxInscValue = (maxValues >> ULAYOUT_MAX_INSC_SHIFT) & 0xff; |
138 | 0 | gMaxVoValue = (maxValues >> ULAYOUT_MAX_VO_SHIFT) & 0xff; |
139 | |
|
140 | 0 | ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup); |
141 | 0 | } |
142 | | |
143 | 0 | UBool ulayout_ensureData(UErrorCode &errorCode) { |
144 | 0 | if (U_FAILURE(errorCode)) { return false; } |
145 | 0 | umtx_initOnce(gLayoutInitOnce, &ulayout_load, errorCode); |
146 | 0 | return U_SUCCESS(errorCode); |
147 | 0 | } |
148 | | |
149 | 0 | UBool ulayout_ensureData() { |
150 | 0 | UErrorCode errorCode = U_ZERO_ERROR; |
151 | 0 | return ulayout_ensureData(errorCode); |
152 | 0 | } |
153 | | |
154 | | } // namespace |
155 | | |
156 | | /* general properties API functions ----------------------------------------- */ |
157 | | |
158 | | struct BinaryProperty; |
159 | | |
160 | | typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which); |
161 | | |
162 | | struct BinaryProperty { |
163 | | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 |
164 | | uint32_t mask; |
165 | | BinaryPropertyContains *contains; |
166 | | }; |
167 | | |
168 | 0 | static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { |
169 | | /* systematic, directly stored properties */ |
170 | 0 | return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; |
171 | 0 | } |
172 | | |
173 | 0 | static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
174 | 0 | return ucase_hasBinaryProperty(c, which); |
175 | 0 | } |
176 | | |
177 | 0 | static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
178 | 0 | return ubidi_isBidiControl(c); |
179 | 0 | } |
180 | | |
181 | 0 | static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
182 | 0 | return ubidi_isMirrored(c); |
183 | 0 | } |
184 | | |
185 | 0 | static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
186 | 0 | return ubidi_isJoinControl(c); |
187 | 0 | } |
188 | | |
189 | | #if UCONFIG_NO_NORMALIZATION |
190 | | static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { |
191 | | return false; |
192 | | } |
193 | | #else |
194 | 0 | static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
195 | | // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. |
196 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
197 | 0 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
198 | 0 | return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); |
199 | 0 | } |
200 | | #endif |
201 | | |
202 | | // UCHAR_NF*_INERT properties |
203 | | #if UCONFIG_NO_NORMALIZATION |
204 | | static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { |
205 | | return false; |
206 | | } |
207 | | #else |
208 | 0 | static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
209 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
210 | 0 | const Normalizer2 *norm2=Normalizer2Factory::getInstance( |
211 | 0 | static_cast<UNormalizationMode>(which - UCHAR_NFD_INERT + UNORM_NFD), errorCode); |
212 | 0 | return U_SUCCESS(errorCode) && norm2->isInert(c); |
213 | 0 | } |
214 | | #endif |
215 | | |
216 | | #if UCONFIG_NO_NORMALIZATION |
217 | | static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) { |
218 | | return false; |
219 | | } |
220 | | #else |
221 | 0 | static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
222 | 0 | UnicodeString nfd; |
223 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
224 | 0 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
225 | 0 | if(U_FAILURE(errorCode)) { |
226 | 0 | return false; |
227 | 0 | } |
228 | 0 | if(nfcNorm2->getDecomposition(c, nfd)) { |
229 | | /* c has a decomposition */ |
230 | 0 | if(nfd.length()==1) { |
231 | 0 | c=nfd[0]; /* single BMP code point */ |
232 | 0 | } else if(nfd.length()<=U16_MAX_LENGTH && |
233 | 0 | nfd.length()==U16_LENGTH(c=nfd.char32At(0)) |
234 | 0 | ) { |
235 | | /* single supplementary code point */ |
236 | 0 | } else { |
237 | 0 | c=U_SENTINEL; |
238 | 0 | } |
239 | 0 | } else if(c<0) { |
240 | 0 | return false; /* protect against bad input */ |
241 | 0 | } |
242 | 0 | if(c>=0) { |
243 | | /* single code point */ |
244 | 0 | const char16_t *resultString; |
245 | 0 | return ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT) >= 0; |
246 | 0 | } else { |
247 | | /* guess some large but stack-friendly capacity */ |
248 | 0 | char16_t dest[2*UCASE_MAX_STRING_LENGTH]; |
249 | 0 | int32_t destLength; |
250 | 0 | destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest), |
251 | 0 | nfd.getBuffer(), nfd.length(), |
252 | 0 | U_FOLD_CASE_DEFAULT, &errorCode); |
253 | 0 | return U_SUCCESS(errorCode) && |
254 | 0 | 0!=u_strCompare(nfd.getBuffer(), nfd.length(), |
255 | 0 | dest, destLength, false); |
256 | 0 | } |
257 | 0 | } |
258 | | #endif |
259 | | |
260 | | #if UCONFIG_NO_NORMALIZATION |
261 | | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) { |
262 | | return false; |
263 | | } |
264 | | #else |
265 | 0 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
266 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
267 | 0 | const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); |
268 | 0 | if(U_FAILURE(errorCode)) { |
269 | 0 | return false; |
270 | 0 | } |
271 | 0 | UnicodeString src(c); |
272 | 0 | UnicodeString dest; |
273 | 0 | { |
274 | | // The ReorderingBuffer must be in a block because its destructor |
275 | | // needs to release dest's buffer before we look at its contents. |
276 | 0 | ReorderingBuffer buffer(*kcf, dest); |
277 | | // Small destCapacity for NFKC_CF(c). |
278 | 0 | if(buffer.init(5, errorCode)) { |
279 | 0 | const char16_t *srcArray=src.getBuffer(); |
280 | 0 | kcf->compose(srcArray, srcArray+src.length(), false, |
281 | 0 | true, buffer, errorCode); |
282 | 0 | } |
283 | 0 | } |
284 | 0 | return U_SUCCESS(errorCode) && dest!=src; |
285 | 0 | } |
286 | | #endif |
287 | | |
288 | | #if UCONFIG_NO_NORMALIZATION |
289 | | static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { |
290 | | return false; |
291 | | } |
292 | | #else |
293 | 0 | static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
294 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
295 | 0 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
296 | 0 | return |
297 | 0 | U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && |
298 | 0 | impl->isCanonSegmentStarter(c); |
299 | 0 | } |
300 | | #endif |
301 | | |
302 | 0 | static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
303 | 0 | return u_isalnumPOSIX(c); |
304 | 0 | } |
305 | | |
306 | 0 | static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
307 | 0 | return u_isblank(c); |
308 | 0 | } |
309 | | |
310 | 0 | static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
311 | 0 | return u_isgraphPOSIX(c); |
312 | 0 | } |
313 | | |
314 | 0 | static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
315 | 0 | return u_isprintPOSIX(c); |
316 | 0 | } |
317 | | |
318 | 0 | static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
319 | 0 | return u_isxdigit(c); |
320 | 0 | } |
321 | | |
322 | 0 | static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
323 | | // Property starts are a subset of lb=RI etc. |
324 | 0 | return 0x1F1E6<=c && c<=0x1F1FF; |
325 | 0 | } |
326 | | |
327 | 0 | static UBool hasEmojiProperty(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
328 | 0 | return EmojiProps::hasBinaryProperty(c, which); |
329 | 0 | } |
330 | | |
331 | 0 | static UBool isIDSUnaryOperator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
332 | | // New in Unicode 15.1 for just two characters. |
333 | 0 | return 0x2FFE<=c && c<=0x2FFF; |
334 | 0 | } |
335 | | |
336 | | /** Ranges (start/limit pairs) of ID_Compat_Math_Continue (only), from UCD PropList.txt. */ |
337 | | static constexpr UChar32 ID_COMPAT_MATH_CONTINUE[] = { |
338 | | 0x00B2, 0x00B3 + 1, |
339 | | 0x00B9, 0x00B9 + 1, |
340 | | 0x2070, 0x2070 + 1, |
341 | | 0x2074, 0x207E + 1, |
342 | | 0x2080, 0x208E + 1 |
343 | | }; |
344 | | |
345 | | /** ID_Compat_Math_Start characters, from UCD PropList.txt. */ |
346 | | static constexpr UChar32 ID_COMPAT_MATH_START[] = { |
347 | | 0x2202, |
348 | | 0x2207, |
349 | | 0x221E, |
350 | | 0x1D6C1, |
351 | | 0x1D6DB, |
352 | | 0x1D6FB, |
353 | | 0x1D715, |
354 | | 0x1D735, |
355 | | 0x1D74F, |
356 | | 0x1D76F, |
357 | | 0x1D789, |
358 | | 0x1D7A9, |
359 | | 0x1D7C3 |
360 | | }; |
361 | | |
362 | | /** Ranges (start/limit pairs) of Modifier_Combining_mark (only), from UCD PropList.txt. */ |
363 | | static constexpr UChar32 MODIFIER_COMBINING_MARK[] = { |
364 | | 0x0654, 0x0655 + 1, |
365 | | 0x0658, 0x0658 + 1, // U+0658 |
366 | | 0x06DC, 0x06DC + 1, // U+06DC |
367 | | 0x06E3, 0x06E3 + 1, // U+06E3 |
368 | | 0x06E7, 0x06E8 + 1, |
369 | | 0x08CA, 0x08CB + 1, |
370 | | 0x08CD, 0x08CF + 1, |
371 | | 0x08D3, 0x08D3 + 1, // U+08D3 |
372 | | 0x08F3, 0x08F3 + 1 // U+08F3 |
373 | | }; |
374 | | |
375 | 0 | static UBool isIDCompatMathStart(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
376 | 0 | if (c < ID_COMPAT_MATH_START[0]) { return false; } // fastpath for common scripts |
377 | 0 | for (UChar32 startChar : ID_COMPAT_MATH_START) { |
378 | 0 | if (c == startChar) { return true; } |
379 | 0 | } |
380 | 0 | return false; |
381 | 0 | } |
382 | | |
383 | 0 | static UBool isIDCompatMathContinue(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { |
384 | 0 | for (int32_t i = 0; i < UPRV_LENGTHOF(ID_COMPAT_MATH_CONTINUE); i += 2) { |
385 | 0 | if (c < ID_COMPAT_MATH_CONTINUE[i]) { return false; } // below range start |
386 | 0 | if (c < ID_COMPAT_MATH_CONTINUE[i + 1]) { return true; } // below range limit |
387 | 0 | } |
388 | 0 | return isIDCompatMathStart(prop, c, UCHAR_ID_COMPAT_MATH_START); |
389 | 0 | } |
390 | | |
391 | 0 | static UBool isModifierCombiningMark(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
392 | 0 | for (int32_t i = 0; i < UPRV_LENGTHOF(MODIFIER_COMBINING_MARK); i += 2) { |
393 | 0 | if (c < MODIFIER_COMBINING_MARK[i]) { return false; } // below range start |
394 | 0 | if (c < MODIFIER_COMBINING_MARK[i + 1]) { return true; } // below range limit |
395 | 0 | } |
396 | 0 | return false; |
397 | 0 | } |
398 | | |
399 | | static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={ |
400 | | /* |
401 | | * column and mask values for binary properties from u_getUnicodeProperties(). |
402 | | * Must be in order of corresponding UProperty, |
403 | | * and there must be exactly one entry per binary UProperty. |
404 | | * |
405 | | * Properties with mask==0 are handled in code. |
406 | | * For them, column is the UPropertySource value. |
407 | | */ |
408 | | { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains }, |
409 | | { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains }, |
410 | | { UPROPS_SRC_BIDI, 0, isBidiControl }, |
411 | | { UPROPS_SRC_BIDI, 0, isMirrored }, |
412 | | { 1, U_MASK(UPROPS_DASH), defaultContains }, |
413 | | { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains }, |
414 | | { 1, U_MASK(UPROPS_DEPRECATED), defaultContains }, |
415 | | { 1, U_MASK(UPROPS_DIACRITIC), defaultContains }, |
416 | | { 1, U_MASK(UPROPS_EXTENDER), defaultContains }, |
417 | | { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion }, |
418 | | { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains }, |
419 | | { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains }, |
420 | | { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains }, |
421 | | { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains }, |
422 | | { 1, U_MASK(UPROPS_HYPHEN), defaultContains }, |
423 | | { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains }, |
424 | | { 1, U_MASK(UPROPS_ID_START), defaultContains }, |
425 | | { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains }, |
426 | | { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains }, |
427 | | { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains }, |
428 | | { UPROPS_SRC_BIDI, 0, isJoinControl }, |
429 | | { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains }, |
430 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE |
431 | | { 1, U_MASK(UPROPS_MATH), defaultContains }, |
432 | | { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains }, |
433 | | { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains }, |
434 | | { 1, U_MASK(UPROPS_RADICAL), defaultContains }, |
435 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED |
436 | | { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains }, |
437 | | { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains }, |
438 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE |
439 | | { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains }, |
440 | | { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains }, |
441 | | { 1, U_MASK(UPROPS_XID_START), defaultContains }, |
442 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE |
443 | | { 1, U_MASK(UPROPS_S_TERM), defaultContains }, |
444 | | { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains }, |
445 | | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT |
446 | | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT |
447 | | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT |
448 | | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT |
449 | | { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter }, |
450 | | { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains }, |
451 | | { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains }, |
452 | | { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum }, |
453 | | { UPROPS_SRC_CHAR, 0, isPOSIX_blank }, |
454 | | { UPROPS_SRC_CHAR, 0, isPOSIX_graph }, |
455 | | { UPROPS_SRC_CHAR, 0, isPOSIX_print }, |
456 | | { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit }, |
457 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED |
458 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE |
459 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED |
460 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED |
461 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED |
462 | | { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded }, |
463 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED |
464 | | { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }, |
465 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI |
466 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_PRESENTATION |
467 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER |
468 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_MODIFIER_BASE |
469 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_COMPONENT |
470 | | { 2, 0, isRegionalIndicator }, |
471 | | { 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains }, |
472 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EXTENDED_PICTOGRAPHIC |
473 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_BASIC_EMOJI |
474 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_EMOJI_KEYCAP_SEQUENCE |
475 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE |
476 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE |
477 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_TAG_SEQUENCE |
478 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE |
479 | | { UPROPS_SRC_EMOJI, 0, hasEmojiProperty }, // UCHAR_RGI_EMOJI |
480 | | { UPROPS_SRC_IDSU, 0, isIDSUnaryOperator }, // UCHAR_IDS_UNARY_OPERATOR |
481 | | { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathStart }, // UCHAR_ID_COMPAT_MATH_START |
482 | | { UPROPS_SRC_ID_COMPAT_MATH, 0, isIDCompatMathContinue }, // UCHAR_ID_COMPAT_MATH_CONTINUE |
483 | | { UPROPS_SRC_MCM, 0 , isModifierCombiningMark }, // UCHAR_MODIFIER_COMBINING_MARK |
484 | | }; |
485 | | |
486 | | U_CAPI UBool U_EXPORT2 |
487 | 0 | u_hasBinaryProperty(UChar32 c, UProperty which) { |
488 | | /* c is range-checked in the functions that are called from here */ |
489 | 0 | if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { |
490 | | /* not a known binary property */ |
491 | 0 | return false; |
492 | 0 | } else { |
493 | 0 | const BinaryProperty &prop=binProps[which]; |
494 | 0 | return prop.contains(prop, c, which); |
495 | 0 | } |
496 | 0 | } |
497 | | |
498 | | /* Checks if the Unicode character can start a Unicode identifier.*/ |
499 | | U_CAPI UBool U_EXPORT2 |
500 | 0 | u_isIDStart(UChar32 c) { |
501 | 0 | return u_hasBinaryProperty(c, UCHAR_ID_START); |
502 | 0 | } |
503 | | |
504 | | /* Checks if the Unicode character can be a Unicode identifier part other than starting the |
505 | | identifier.*/ |
506 | | U_CAPI UBool U_EXPORT2 |
507 | 0 | u_isIDPart(UChar32 c) { |
508 | 0 | return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE); |
509 | 0 | } |
510 | | |
511 | | U_CAPI UBool U_EXPORT2 |
512 | 0 | u_stringHasBinaryProperty(const char16_t *s, int32_t length, UProperty which) { |
513 | 0 | if (s == nullptr && length != 0) { return false; } |
514 | 0 | if (length == 1) { |
515 | 0 | return u_hasBinaryProperty(s[0], which); // single code point |
516 | 0 | } else if (length == 2 || (length < 0 && *s != 0)) { // not empty string |
517 | | // first code point |
518 | 0 | int32_t i = 0; |
519 | 0 | UChar32 c; |
520 | 0 | U16_NEXT(s, i, length, c); |
521 | 0 | if (length > 0 ? i == length : s[i] == 0) { |
522 | 0 | return u_hasBinaryProperty(c, which); // single code point |
523 | 0 | } |
524 | 0 | } |
525 | | // Only call into EmojiProps for a relevant property, |
526 | | // so that we not unnecessarily try to load its data file. |
527 | 0 | return UCHAR_BASIC_EMOJI <= which && which <= UCHAR_RGI_EMOJI && |
528 | 0 | EmojiProps::hasBinaryProperty(s, length, which); |
529 | 0 | } |
530 | | |
531 | | struct IntProperty; |
532 | | |
533 | | typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which); |
534 | | typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which); |
535 | | |
536 | | struct IntProperty { |
537 | | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 |
538 | | uint32_t mask; |
539 | | int32_t shift; // =maxValue if getMaxValueFromShift() is used |
540 | | IntPropertyGetValue *getValue; |
541 | | IntPropertyGetMaxValue *getMaxValue; |
542 | | }; |
543 | | |
544 | 0 | static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { |
545 | | /* systematic, directly stored properties */ |
546 | 0 | return static_cast<int32_t>(u_getUnicodeProperties(c, prop.column) & prop.mask) >> prop.shift; |
547 | 0 | } |
548 | | |
549 | 0 | static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { |
550 | 0 | return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; |
551 | 0 | } |
552 | | |
553 | 0 | static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { |
554 | 0 | return prop.shift; |
555 | 0 | } |
556 | | |
557 | 0 | static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
558 | 0 | return static_cast<int32_t>(u_charDirection(c)); |
559 | 0 | } |
560 | | |
561 | 0 | static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
562 | 0 | return static_cast<int32_t>(ubidi_getPairedBracketType(c)); |
563 | 0 | } |
564 | | |
565 | 0 | static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
566 | 0 | return ubidi_getMaxValue(which); |
567 | 0 | } |
568 | | |
569 | 0 | static int32_t getBlock(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
570 | 0 | return static_cast<int32_t>(ublock_getCode(c)); |
571 | 0 | } |
572 | | |
573 | 0 | static int32_t blockGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) { |
574 | 0 | return uprv_getMaxValues(UPROPS_MAX_VALUES_OTHER_INDEX) & UPROPS_MAX_BLOCK; |
575 | 0 | } |
576 | | |
577 | | #if UCONFIG_NO_NORMALIZATION |
578 | | static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { |
579 | | return 0; |
580 | | } |
581 | | #else |
582 | 0 | static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
583 | 0 | return u_getCombiningClass(c); |
584 | 0 | } |
585 | | #endif |
586 | | |
587 | 0 | static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
588 | 0 | return static_cast<int32_t>(u_charType(c)); |
589 | 0 | } |
590 | | |
591 | 0 | static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
592 | 0 | return ubidi_getJoiningGroup(c); |
593 | 0 | } |
594 | | |
595 | 0 | static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
596 | 0 | return ubidi_getJoiningType(c); |
597 | 0 | } |
598 | | |
599 | 0 | static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
600 | 0 | int32_t ntv = static_cast<int32_t>(GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c))); |
601 | 0 | return UPROPS_NTV_GET_TYPE(ntv); |
602 | 0 | } |
603 | | |
604 | 0 | static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
605 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
606 | 0 | return static_cast<int32_t>(uscript_getScript(c, &errorCode)); |
607 | 0 | } |
608 | | |
609 | 0 | static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) { |
610 | 0 | return uprv_getMaxValues(0)&UPROPS_MAX_SCRIPT; |
611 | 0 | } |
612 | | |
613 | | /* |
614 | | * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. |
615 | | * Hangul_Syllable_Type used to be fully redundant with a subset of Grapheme_Cluster_Break. |
616 | | * |
617 | | * Starting with Unicode 16, this is no longer true for HST=V vs. GCB=V in some cases: |
618 | | * Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but |
619 | | * they are of course not related to Hangul syllables. |
620 | | */ |
621 | | static const UHangulSyllableType gcbToHst[]={ |
622 | | U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */ |
623 | | U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */ |
624 | | U_HST_NOT_APPLICABLE, /* U_GCB_CR */ |
625 | | U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */ |
626 | | U_HST_LEADING_JAMO, /* U_GCB_L */ |
627 | | U_HST_NOT_APPLICABLE, /* U_GCB_LF */ |
628 | | U_HST_LV_SYLLABLE, /* U_GCB_LV */ |
629 | | U_HST_LVT_SYLLABLE, /* U_GCB_LVT */ |
630 | | U_HST_TRAILING_JAMO, /* U_GCB_T */ |
631 | | U_HST_VOWEL_JAMO /* U_GCB_V */ |
632 | | /* |
633 | | * Omit GCB values beyond what we need for hst. |
634 | | * The code below checks for the array length. |
635 | | */ |
636 | | }; |
637 | | |
638 | 0 | static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
639 | | // Ignore supplementary code points: They all have HST=NA. |
640 | | // This is a simple way to handle the GCB!=hst cases since Unicode 16 (Kirat Rai vowels). |
641 | 0 | if(c>0xffff) { |
642 | 0 | return U_HST_NOT_APPLICABLE; |
643 | 0 | } |
644 | | /* see comments on gcbToHst[] above */ |
645 | 0 | int32_t gcb = static_cast<int32_t>(u_getUnicodeProperties(c, 2) & UPROPS_GCB_MASK) >> UPROPS_GCB_SHIFT; |
646 | 0 | if(gcb<UPRV_LENGTHOF(gcbToHst)) { |
647 | 0 | return gcbToHst[gcb]; |
648 | 0 | } else { |
649 | 0 | return U_HST_NOT_APPLICABLE; |
650 | 0 | } |
651 | 0 | } |
652 | | |
653 | | #if UCONFIG_NO_NORMALIZATION |
654 | | static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { |
655 | | return 0; |
656 | | } |
657 | | #else |
658 | 0 | static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { |
659 | 0 | return static_cast<int32_t>(unorm_getQuickCheck(c, static_cast<UNormalizationMode>(which - UCHAR_NFD_QUICK_CHECK + UNORM_NFD))); |
660 | 0 | } |
661 | | #endif |
662 | | |
663 | | #if UCONFIG_NO_NORMALIZATION |
664 | | static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { |
665 | | return 0; |
666 | | } |
667 | | #else |
668 | 0 | static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
669 | 0 | return unorm_getFCD16(c)>>8; |
670 | 0 | } |
671 | | #endif |
672 | | |
673 | | #if UCONFIG_NO_NORMALIZATION |
674 | | static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { |
675 | | return 0; |
676 | | } |
677 | | #else |
678 | 0 | static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
679 | 0 | return unorm_getFCD16(c)&0xff; |
680 | 0 | } |
681 | | #endif |
682 | | |
683 | 0 | static int32_t getInPC(const IntProperty &, UChar32 c, UProperty) { |
684 | 0 | return ulayout_ensureData() && gInpcTrie != nullptr ? ucptrie_get(gInpcTrie, c) : 0; |
685 | 0 | } |
686 | | |
687 | 0 | static int32_t getInSC(const IntProperty &, UChar32 c, UProperty) { |
688 | 0 | return ulayout_ensureData() && gInscTrie != nullptr ? ucptrie_get(gInscTrie, c) : 0; |
689 | 0 | } |
690 | | |
691 | 0 | static int32_t getVo(const IntProperty &, UChar32 c, UProperty) { |
692 | 0 | return ulayout_ensureData() && gVoTrie != nullptr ? ucptrie_get(gVoTrie, c) : 0; |
693 | 0 | } |
694 | | |
695 | 0 | static int32_t layoutGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
696 | 0 | if (!ulayout_ensureData()) { return 0; } |
697 | 0 | switch (which) { |
698 | 0 | case UCHAR_INDIC_POSITIONAL_CATEGORY: |
699 | 0 | return gMaxInpcValue; |
700 | 0 | case UCHAR_INDIC_SYLLABIC_CATEGORY: |
701 | 0 | return gMaxInscValue; |
702 | 0 | case UCHAR_VERTICAL_ORIENTATION: |
703 | 0 | return gMaxVoValue; |
704 | 0 | default: |
705 | 0 | return 0; |
706 | 0 | } |
707 | 0 | } |
708 | | |
709 | 0 | static int32_t getIDStatusValue(const IntProperty & /*prop*/, UChar32 c, UProperty /*which*/) { |
710 | 0 | uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT; |
711 | 0 | return value >= UPROPS_ID_TYPE_ALLOWED_MIN ? U_ID_STATUS_ALLOWED : U_ID_STATUS_RESTRICTED; |
712 | 0 | } |
713 | | |
// Dispatch table for the int-valued (enumerated) properties, indexed by
// (which - UCHAR_INT_START). Each entry supplies the value getter and the
// max-value getter used by u_getIntPropertyValue() and
// u_getIntPropertyMaxValue(); uprops_getSource() reads column and mask.
// For entries using getMaxValueFromShift, the third field holds the
// maximum value itself rather than a shift (see the quick-check comments).
static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
    /*
     * column, mask and shift values for int-value properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per int UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },  // UCHAR_BIDI_CLASS
    { UPROPS_SRC_BLOCK, 0, 0, getBlock, blockGetMaxValue },  // UCHAR_BLOCK
    { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },  // UCHAR_CANONICAL_COMBINING_CLASS
    { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },  // UCHAR_DECOMPOSITION_TYPE
    { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },  // UCHAR_EAST_ASIAN_WIDTH
    { UPROPS_SRC_CHAR, 0, static_cast<int32_t>(U_CHAR_CATEGORY_COUNT) - 1, getGeneralCategory, getMaxValueFromShift },  // UCHAR_GENERAL_CATEGORY
    { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue },  // UCHAR_JOINING_GROUP
    { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue },  // UCHAR_JOINING_TYPE
    { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue },  // UCHAR_LINE_BREAK
    { UPROPS_SRC_CHAR, 0, static_cast<int32_t>(U_NT_COUNT) - 1, getNumericType, getMaxValueFromShift },  // UCHAR_NUMERIC_TYPE
    { UPROPS_SRC_PROPSVEC, 0, 0, getScript, scriptGetMaxValue },  // UCHAR_SCRIPT
    { UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_HST_COUNT) - 1, getHangulSyllableType, getMaxValueFromShift },  // UCHAR_HANGUL_SYLLABLE_TYPE
    // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
    { UPROPS_SRC_NFC, 0, static_cast<int32_t>(UNORM_YES), getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
    { UPROPS_SRC_NFKC, 0, static_cast<int32_t>(UNORM_YES), getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE
    { UPROPS_SRC_NFC, 0, static_cast<int32_t>(UNORM_MAYBE), getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE
    { UPROPS_SRC_NFKC, 0, static_cast<int32_t>(UNORM_MAYBE), getNormQuickCheck, getMaxValueFromShift },
    { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift },  // UCHAR_LEAD_CANONICAL_COMBINING_CLASS
    { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift },  // UCHAR_TRAIL_CANONICAL_COMBINING_CLASS
    { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue },  // UCHAR_GRAPHEME_CLUSTER_BREAK
    { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue },  // UCHAR_SENTENCE_BREAK
    { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue },  // UCHAR_WORD_BREAK
    { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue },  // UCHAR_BIDI_PAIRED_BRACKET_TYPE
    { UPROPS_SRC_INPC, 0, 0, getInPC, layoutGetMaxValue },  // UCHAR_INDIC_POSITIONAL_CATEGORY
    { UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },  // UCHAR_INDIC_SYLLABIC_CATEGORY
    { UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },  // UCHAR_VERTICAL_ORIENTATION
    { UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift },  // UCHAR_IDENTIFIER_STATUS
    { 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue },  // UCHAR_INDIC_CONJUNCT_BREAK
};
755 | | |
756 | | U_CAPI int32_t U_EXPORT2 |
757 | 0 | u_getIntPropertyValue(UChar32 c, UProperty which) { |
758 | 0 | if(which<UCHAR_INT_START) { |
759 | 0 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
760 | 0 | const BinaryProperty &prop=binProps[which]; |
761 | 0 | return prop.contains(prop, c, which); |
762 | 0 | } |
763 | 0 | } else if(which<UCHAR_INT_LIMIT) { |
764 | 0 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
765 | 0 | return prop.getValue(prop, c, which); |
766 | 0 | } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { |
767 | 0 | return U_MASK(u_charType(c)); |
768 | 0 | } |
769 | 0 | return 0; // undefined |
770 | 0 | } |
771 | | |
772 | | U_CAPI int32_t U_EXPORT2 |
773 | 0 | u_getIntPropertyMinValue(UProperty /*which*/) { |
774 | 0 | return 0; /* all binary/enum/int properties have a minimum value of 0 */ |
775 | 0 | } |
776 | | |
777 | | U_CAPI int32_t U_EXPORT2 |
778 | 0 | u_getIntPropertyMaxValue(UProperty which) { |
779 | 0 | if(which<UCHAR_INT_START) { |
780 | 0 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
781 | 0 | return 1; // maximum true for all binary properties |
782 | 0 | } |
783 | 0 | } else if(which<UCHAR_INT_LIMIT) { |
784 | 0 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
785 | 0 | return prop.getMaxValue(prop, which); |
786 | 0 | } |
787 | 0 | return -1; // undefined |
788 | 0 | } |
789 | | |
790 | | U_CFUNC UPropertySource U_EXPORT2 |
791 | 0 | uprops_getSource(UProperty which) { |
792 | 0 | if(which<UCHAR_BINARY_START) { |
793 | 0 | return UPROPS_SRC_NONE; /* undefined */ |
794 | 0 | } else if(which<UCHAR_BINARY_LIMIT) { |
795 | 0 | const BinaryProperty &prop=binProps[which]; |
796 | 0 | if(prop.mask!=0) { |
797 | 0 | return UPROPS_SRC_PROPSVEC; |
798 | 0 | } else { |
799 | 0 | return (UPropertySource)prop.column; |
800 | 0 | } |
801 | 0 | } else if(which<UCHAR_INT_START) { |
802 | 0 | return UPROPS_SRC_NONE; /* undefined */ |
803 | 0 | } else if(which<UCHAR_INT_LIMIT) { |
804 | 0 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
805 | 0 | if(prop.mask!=0) { |
806 | 0 | return UPROPS_SRC_PROPSVEC; |
807 | 0 | } else { |
808 | 0 | return (UPropertySource)prop.column; |
809 | 0 | } |
810 | 0 | } else if(which<UCHAR_STRING_START) { |
811 | 0 | switch(which) { |
812 | 0 | case UCHAR_GENERAL_CATEGORY_MASK: |
813 | 0 | case UCHAR_NUMERIC_VALUE: |
814 | 0 | return UPROPS_SRC_CHAR; |
815 | | |
816 | 0 | default: |
817 | 0 | return UPROPS_SRC_NONE; |
818 | 0 | } |
819 | 0 | } else if(which<UCHAR_STRING_LIMIT) { |
820 | 0 | switch(which) { |
821 | 0 | case UCHAR_AGE: |
822 | 0 | return UPROPS_SRC_PROPSVEC; |
823 | | |
824 | 0 | case UCHAR_BIDI_MIRRORING_GLYPH: |
825 | 0 | return UPROPS_SRC_BIDI; |
826 | | |
827 | 0 | case UCHAR_CASE_FOLDING: |
828 | 0 | case UCHAR_LOWERCASE_MAPPING: |
829 | 0 | case UCHAR_SIMPLE_CASE_FOLDING: |
830 | 0 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: |
831 | 0 | case UCHAR_SIMPLE_TITLECASE_MAPPING: |
832 | 0 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: |
833 | 0 | case UCHAR_TITLECASE_MAPPING: |
834 | 0 | case UCHAR_UPPERCASE_MAPPING: |
835 | 0 | return UPROPS_SRC_CASE; |
836 | | |
837 | 0 | case UCHAR_ISO_COMMENT: |
838 | 0 | case UCHAR_NAME: |
839 | 0 | case UCHAR_UNICODE_1_NAME: |
840 | 0 | return UPROPS_SRC_NAMES; |
841 | | |
842 | 0 | default: |
843 | 0 | return UPROPS_SRC_NONE; |
844 | 0 | } |
845 | 0 | } else { |
846 | 0 | switch(which) { |
847 | 0 | case UCHAR_SCRIPT_EXTENSIONS: |
848 | 0 | case UCHAR_IDENTIFIER_TYPE: |
849 | 0 | return UPROPS_SRC_PROPSVEC; |
850 | 0 | default: |
851 | 0 | return UPROPS_SRC_NONE; /* undefined */ |
852 | 0 | } |
853 | 0 | } |
854 | 0 | } |
855 | | |
856 | | U_CFUNC void U_EXPORT2 |
857 | 0 | uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode) { |
858 | 0 | if (U_FAILURE(*pErrorCode)) { return; } |
859 | 0 | if (src == UPROPS_SRC_ID_COMPAT_MATH) { |
860 | | // range limits |
861 | 0 | for (UChar32 c : ID_COMPAT_MATH_CONTINUE) { |
862 | 0 | sa->add(sa->set, c); |
863 | 0 | } |
864 | | // single characters |
865 | 0 | for (UChar32 c : ID_COMPAT_MATH_START) { |
866 | 0 | sa->add(sa->set, c); |
867 | 0 | sa->add(sa->set, c + 1); |
868 | 0 | } |
869 | 0 | return; |
870 | 0 | } |
871 | 0 | if (src == UPROPS_SRC_MCM) { |
872 | | // range limits |
873 | 0 | for (UChar32 c : MODIFIER_COMBINING_MARK) { |
874 | 0 | sa->add(sa->set, c); |
875 | 0 | } |
876 | 0 | return; |
877 | 0 | } |
878 | 0 | if (!ulayout_ensureData(*pErrorCode)) { return; } |
879 | 0 | const UCPTrie *trie; |
880 | 0 | switch (src) { |
881 | 0 | case UPROPS_SRC_INPC: |
882 | 0 | trie = gInpcTrie; |
883 | 0 | break; |
884 | 0 | case UPROPS_SRC_INSC: |
885 | 0 | trie = gInscTrie; |
886 | 0 | break; |
887 | 0 | case UPROPS_SRC_VO: |
888 | 0 | trie = gVoTrie; |
889 | 0 | break; |
890 | 0 | default: |
891 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
892 | 0 | return; |
893 | 0 | } |
894 | | |
895 | 0 | if (trie == nullptr) { |
896 | 0 | *pErrorCode = U_MISSING_RESOURCE_ERROR; |
897 | 0 | return; |
898 | 0 | } |
899 | | |
900 | | // Add the start code point of each same-value range of the trie. |
901 | 0 | UChar32 start = 0, end; |
902 | 0 | while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, |
903 | 0 | nullptr, nullptr, nullptr)) >= 0) { |
904 | 0 | sa->add(sa->set, start); |
905 | 0 | start = end + 1; |
906 | 0 | } |
907 | 0 | } |
908 | | |
909 | | U_CAPI bool U_EXPORT2 |
910 | 0 | u_hasIDType(UChar32 c, UIdentifierType type) { |
911 | 0 | uint32_t typeIndex = type; // also guards against negative type integers |
912 | 0 | if (typeIndex >= UPRV_LENGTHOF(uprops_idTypeToEncoded)) { |
913 | 0 | return false; |
914 | 0 | } |
915 | 0 | uint32_t encodedType = uprops_idTypeToEncoded[typeIndex]; |
916 | 0 | uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT; |
917 | 0 | if ((encodedType & UPROPS_ID_TYPE_BIT) != 0) { |
918 | 0 | return value < UPROPS_ID_TYPE_FORBIDDEN && (value & encodedType) != 0; |
919 | 0 | } else { |
920 | 0 | return value == encodedType; |
921 | 0 | } |
922 | 0 | } |
923 | | |
924 | | namespace { |
925 | | |
926 | | void maybeAppendType(uint32_t value, uint32_t bit, UIdentifierType t, |
927 | 0 | UIdentifierType *types, int32_t &length, int32_t capacity) { |
928 | 0 | if ((value & bit) != 0) { |
929 | 0 | if (length < capacity) { |
930 | 0 | types[length] = t; |
931 | 0 | } |
932 | 0 | ++length; |
933 | 0 | } |
934 | 0 | } |
935 | | |
936 | | } // namespace |
937 | | |
938 | | U_CAPI int32_t U_EXPORT2 |
939 | 0 | u_getIDTypes(UChar32 c, UIdentifierType *types, int32_t capacity, UErrorCode *pErrorCode) { |
940 | 0 | if (U_FAILURE(*pErrorCode)) { return 0; } |
941 | 0 | if (capacity < 0 || (capacity > 0 && types == nullptr)) { |
942 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
943 | 0 | return 0; |
944 | 0 | } |
945 | 0 | uint32_t value = u_getUnicodeProperties(c, 2) >> UPROPS_2_ID_TYPE_SHIFT; |
946 | 0 | if ((value & UPROPS_ID_TYPE_FORBIDDEN) == UPROPS_ID_TYPE_FORBIDDEN || |
947 | 0 | value == UPROPS_ID_TYPE_NOT_CHARACTER) { |
948 | | // single value |
949 | 0 | if (capacity > 0) { |
950 | 0 | UIdentifierType t; |
951 | 0 | switch (value) { |
952 | 0 | case UPROPS_ID_TYPE_NOT_CHARACTER: t = U_ID_TYPE_NOT_CHARACTER; break; |
953 | 0 | case UPROPS_ID_TYPE_DEPRECATED: t = U_ID_TYPE_DEPRECATED; break; |
954 | 0 | case UPROPS_ID_TYPE_DEFAULT_IGNORABLE: t = U_ID_TYPE_DEFAULT_IGNORABLE; break; |
955 | 0 | case UPROPS_ID_TYPE_NOT_NFKC: t = U_ID_TYPE_NOT_NFKC; break; |
956 | 0 | case UPROPS_ID_TYPE_INCLUSION: t = U_ID_TYPE_INCLUSION; break; |
957 | 0 | case UPROPS_ID_TYPE_RECOMMENDED: t = U_ID_TYPE_RECOMMENDED; break; |
958 | 0 | default: |
959 | 0 | *pErrorCode = U_INVALID_FORMAT_ERROR; |
960 | 0 | return 0; |
961 | 0 | } |
962 | 0 | types[0] = t; |
963 | 0 | } else { |
964 | 0 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
965 | 0 | } |
966 | 0 | return 1; |
967 | 0 | } else { |
968 | | // one or more combinable bits |
969 | 0 | int32_t length = 0; |
970 | 0 | maybeAppendType(value, UPROPS_ID_TYPE_NOT_XID, U_ID_TYPE_NOT_XID, |
971 | 0 | types, length, capacity); |
972 | 0 | maybeAppendType(value, UPROPS_ID_TYPE_EXCLUSION, U_ID_TYPE_EXCLUSION, |
973 | 0 | types, length, capacity); |
974 | 0 | maybeAppendType(value, UPROPS_ID_TYPE_OBSOLETE, U_ID_TYPE_OBSOLETE, |
975 | 0 | types, length, capacity); |
976 | 0 | maybeAppendType(value, UPROPS_ID_TYPE_TECHNICAL, U_ID_TYPE_TECHNICAL, |
977 | 0 | types, length, capacity); |
978 | 0 | maybeAppendType(value, UPROPS_ID_TYPE_UNCOMMON_USE, U_ID_TYPE_UNCOMMON_USE, |
979 | 0 | types, length, capacity); |
980 | 0 | maybeAppendType(value, UPROPS_ID_TYPE_LIMITED_USE, U_ID_TYPE_LIMITED_USE, |
981 | 0 | types, length, capacity); |
982 | 0 | if (length >= capacity) { |
983 | 0 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
984 | 0 | } |
985 | 0 | return length; |
986 | 0 | } |
987 | 0 | } |
988 | | |
989 | | #if !UCONFIG_NO_NORMALIZATION |
990 | | |
991 | | U_CAPI int32_t U_EXPORT2 |
992 | 0 | u_getFC_NFKC_Closure(UChar32 c, char16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) { |
993 | 0 | if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { |
994 | 0 | return 0; |
995 | 0 | } |
996 | 0 | if(destCapacity<0 || (dest==nullptr && destCapacity>0)) { |
997 | 0 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
998 | 0 | return 0; |
999 | 0 | } |
1000 | | // Compute the FC_NFKC_Closure on the fly: |
1001 | | // We have the API for complete coverage of Unicode properties, although |
1002 | | // this value by itself is not useful via API. |
1003 | | // (What could be useful is a custom normalization table that combines |
1004 | | // case folding and NFKC.) |
1005 | | // For the derivation, see Unicode's DerivedNormalizationProps.txt. |
1006 | 0 | const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode); |
1007 | 0 | if(U_FAILURE(*pErrorCode)) { |
1008 | 0 | return 0; |
1009 | 0 | } |
1010 | | // first: b = NFKC(Fold(a)) |
1011 | 0 | UnicodeString folded1String; |
1012 | 0 | const char16_t *folded1; |
1013 | 0 | int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT); |
1014 | 0 | if(folded1Length<0) { |
1015 | 0 | const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc); |
1016 | 0 | if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) { |
1017 | 0 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC |
1018 | 0 | } |
1019 | 0 | folded1String.setTo(c); |
1020 | 0 | } else { |
1021 | 0 | if(folded1Length>UCASE_MAX_STRING_LENGTH) { |
1022 | 0 | folded1String.setTo(folded1Length); |
1023 | 0 | } else { |
1024 | 0 | folded1String.setTo(false, folded1, folded1Length); |
1025 | 0 | } |
1026 | 0 | } |
1027 | 0 | UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode); |
1028 | | // second: c = NFKC(Fold(b)) |
1029 | 0 | UnicodeString folded2String(kc1); |
1030 | 0 | UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode); |
1031 | | // if (c != b) add the mapping from a to c |
1032 | 0 | if(U_FAILURE(*pErrorCode) || kc1==kc2) { |
1033 | 0 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); |
1034 | 0 | } else { |
1035 | 0 | return kc2.extract(dest, destCapacity, *pErrorCode); |
1036 | 0 | } |
1037 | 0 | } |
1038 | | |
1039 | | #endif |