/src/mozilla-central/intl/icu/source/common/uprops.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2002-2016, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: uprops.cpp |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2002feb24 |
16 | | * created by: Markus W. Scherer |
17 | | * |
18 | | * Implementations for mostly non-core Unicode character properties |
19 | | * stored in uprops.icu. |
20 | | * |
21 | | * With the APIs implemented here, almost all properties files and |
22 | | * their associated implementation files are used from this file, |
23 | | * including those for normalization and case mappings. |
24 | | */ |
25 | | |
26 | | #include "unicode/utypes.h" |
27 | | #include "unicode/uchar.h" |
28 | | #include "unicode/unorm2.h" |
29 | | #include "unicode/uscript.h" |
30 | | #include "unicode/ustring.h" |
31 | | #include "cstring.h" |
32 | | #include "normalizer2impl.h" |
33 | | #include "umutex.h" |
34 | | #include "ubidi_props.h" |
35 | | #include "uprops.h" |
36 | | #include "ucase.h" |
37 | | #include "ustr_imp.h" |
38 | | |
39 | | U_NAMESPACE_USE |
40 | | |
41 | | /* general properties API functions ----------------------------------------- */ |
42 | | |
43 | | struct BinaryProperty; |
44 | | |
45 | | typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which); |
46 | | |
47 | | struct BinaryProperty { |
48 | | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 |
49 | | uint32_t mask; |
50 | | BinaryPropertyContains *contains; |
51 | | }; |
52 | | |
53 | 0 | static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { |
54 | 0 | /* systematic, directly stored properties */ |
55 | 0 | return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; |
56 | 0 | } |
57 | | |
58 | 0 | static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
59 | 0 | return ucase_hasBinaryProperty(c, which); |
60 | 0 | } |
61 | | |
62 | 0 | static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
63 | 0 | return ubidi_isBidiControl(c); |
64 | 0 | } |
65 | | |
66 | 0 | static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
67 | 0 | return ubidi_isMirrored(c); |
68 | 0 | } |
69 | | |
70 | 0 | static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
71 | 0 | return ubidi_isJoinControl(c); |
72 | 0 | } |
73 | | |
74 | | #if UCONFIG_NO_NORMALIZATION |
75 | | static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { |
76 | | return FALSE; |
77 | | } |
78 | | #else |
79 | 0 | static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
80 | 0 | // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. |
81 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
82 | 0 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
83 | 0 | return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); |
84 | 0 | } |
85 | | #endif |
86 | | |
87 | | // UCHAR_NF*_INERT properties |
88 | | #if UCONFIG_NO_NORMALIZATION |
89 | | static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { |
90 | | return FALSE; |
91 | | } |
92 | | #else |
93 | 0 | static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
94 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
95 | 0 | const Normalizer2 *norm2=Normalizer2Factory::getInstance( |
96 | 0 | (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); |
97 | 0 | return U_SUCCESS(errorCode) && norm2->isInert(c); |
98 | 0 | } |
99 | | #endif |
100 | | |
101 | | #if UCONFIG_NO_NORMALIZATION |
102 | | static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) { |
103 | | return FALSE; |
104 | | } |
105 | | #else |
106 | 0 | static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
107 | 0 | UnicodeString nfd; |
108 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
109 | 0 | const Normalizer2 *nfcNorm2=Normalizer2::getNFCInstance(errorCode); |
110 | 0 | if(U_FAILURE(errorCode)) { |
111 | 0 | return FALSE; |
112 | 0 | } |
113 | 0 | if(nfcNorm2->getDecomposition(c, nfd)) { |
114 | 0 | /* c has a decomposition */ |
115 | 0 | if(nfd.length()==1) { |
116 | 0 | c=nfd[0]; /* single BMP code point */ |
117 | 0 | } else if(nfd.length()<=U16_MAX_LENGTH && |
118 | 0 | nfd.length()==U16_LENGTH(c=nfd.char32At(0)) |
119 | 0 | ) { |
120 | 0 | /* single supplementary code point */ |
121 | 0 | } else { |
122 | 0 | c=U_SENTINEL; |
123 | 0 | } |
124 | 0 | } else if(c<0) { |
125 | 0 | return FALSE; /* protect against bad input */ |
126 | 0 | } |
127 | 0 | if(c>=0) { |
128 | 0 | /* single code point */ |
129 | 0 | const UChar *resultString; |
130 | 0 | return (UBool)(ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT)>=0); |
131 | 0 | } else { |
132 | 0 | /* guess some large but stack-friendly capacity */ |
133 | 0 | UChar dest[2*UCASE_MAX_STRING_LENGTH]; |
134 | 0 | int32_t destLength; |
135 | 0 | destLength=u_strFoldCase(dest, UPRV_LENGTHOF(dest), |
136 | 0 | nfd.getBuffer(), nfd.length(), |
137 | 0 | U_FOLD_CASE_DEFAULT, &errorCode); |
138 | 0 | return (UBool)(U_SUCCESS(errorCode) && |
139 | 0 | 0!=u_strCompare(nfd.getBuffer(), nfd.length(), |
140 | 0 | dest, destLength, FALSE)); |
141 | 0 | } |
142 | 0 | } |
143 | | #endif |
144 | | |
145 | | #if UCONFIG_NO_NORMALIZATION |
146 | | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) { |
147 | | return FALSE; |
148 | | } |
149 | | #else |
150 | 0 | static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
151 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
152 | 0 | const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode); |
153 | 0 | if(U_FAILURE(errorCode)) { |
154 | 0 | return FALSE; |
155 | 0 | } |
156 | 0 | UnicodeString src(c); |
157 | 0 | UnicodeString dest; |
158 | 0 | { |
159 | 0 | // The ReorderingBuffer must be in a block because its destructor |
160 | 0 | // needs to release dest's buffer before we look at its contents. |
161 | 0 | ReorderingBuffer buffer(*kcf, dest); |
162 | 0 | // Small destCapacity for NFKC_CF(c). |
163 | 0 | if(buffer.init(5, errorCode)) { |
164 | 0 | const UChar *srcArray=src.getBuffer(); |
165 | 0 | kcf->compose(srcArray, srcArray+src.length(), FALSE, |
166 | 0 | TRUE, buffer, errorCode); |
167 | 0 | } |
168 | 0 | } |
169 | 0 | return U_SUCCESS(errorCode) && dest!=src; |
170 | 0 | } |
171 | | #endif |
172 | | |
173 | | #if UCONFIG_NO_NORMALIZATION |
174 | | static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { |
175 | | return FALSE; |
176 | | } |
177 | | #else |
178 | 0 | static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
179 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
180 | 0 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
181 | 0 | return |
182 | 0 | U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && |
183 | 0 | impl->isCanonSegmentStarter(c); |
184 | 0 | } |
185 | | #endif |
186 | | |
187 | 0 | static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
188 | 0 | return u_isalnumPOSIX(c); |
189 | 0 | } |
190 | | |
191 | 0 | static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
192 | 0 | return u_isblank(c); |
193 | 0 | } |
194 | | |
195 | 0 | static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
196 | 0 | return u_isgraphPOSIX(c); |
197 | 0 | } |
198 | | |
199 | 0 | static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
200 | 0 | return u_isprintPOSIX(c); |
201 | 0 | } |
202 | | |
203 | 0 | static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
204 | 0 | return u_isxdigit(c); |
205 | 0 | } |
206 | | |
207 | 0 | static UBool isRegionalIndicator(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
208 | 0 | // Property starts are a subset of lb=RI etc. |
209 | 0 | return 0x1F1E6<=c && c<=0x1F1FF; |
210 | 0 | } |
211 | | |
212 | | static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={ |
213 | | /* |
214 | | * column and mask values for binary properties from u_getUnicodeProperties(). |
215 | | * Must be in order of corresponding UProperty, |
216 | | * and there must be exactly one entry per binary UProperty. |
217 | | * |
218 | | * Properties with mask==0 are handled in code. |
219 | | * For them, column is the UPropertySource value. |
220 | | */ |
221 | | { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains }, |
222 | | { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains }, |
223 | | { UPROPS_SRC_BIDI, 0, isBidiControl }, |
224 | | { UPROPS_SRC_BIDI, 0, isMirrored }, |
225 | | { 1, U_MASK(UPROPS_DASH), defaultContains }, |
226 | | { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains }, |
227 | | { 1, U_MASK(UPROPS_DEPRECATED), defaultContains }, |
228 | | { 1, U_MASK(UPROPS_DIACRITIC), defaultContains }, |
229 | | { 1, U_MASK(UPROPS_EXTENDER), defaultContains }, |
230 | | { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion }, |
231 | | { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains }, |
232 | | { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains }, |
233 | | { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains }, |
234 | | { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains }, |
235 | | { 1, U_MASK(UPROPS_HYPHEN), defaultContains }, |
236 | | { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains }, |
237 | | { 1, U_MASK(UPROPS_ID_START), defaultContains }, |
238 | | { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains }, |
239 | | { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains }, |
240 | | { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains }, |
241 | | { UPROPS_SRC_BIDI, 0, isJoinControl }, |
242 | | { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains }, |
243 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_LOWERCASE |
244 | | { 1, U_MASK(UPROPS_MATH), defaultContains }, |
245 | | { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains }, |
246 | | { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains }, |
247 | | { 1, U_MASK(UPROPS_RADICAL), defaultContains }, |
248 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_SOFT_DOTTED |
249 | | { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains }, |
250 | | { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains }, |
251 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_UPPERCASE |
252 | | { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains }, |
253 | | { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains }, |
254 | | { 1, U_MASK(UPROPS_XID_START), defaultContains }, |
255 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_SENSITIVE |
256 | | { 1, U_MASK(UPROPS_S_TERM), defaultContains }, |
257 | | { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains }, |
258 | | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFD_INERT |
259 | | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKD_INERT |
260 | | { UPROPS_SRC_NFC, 0, isNormInert }, // UCHAR_NFC_INERT |
261 | | { UPROPS_SRC_NFKC, 0, isNormInert }, // UCHAR_NFKC_INERT |
262 | | { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter }, |
263 | | { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains }, |
264 | | { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains }, |
265 | | { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum }, |
266 | | { UPROPS_SRC_CHAR, 0, isPOSIX_blank }, |
267 | | { UPROPS_SRC_CHAR, 0, isPOSIX_graph }, |
268 | | { UPROPS_SRC_CHAR, 0, isPOSIX_print }, |
269 | | { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit }, |
270 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASED |
271 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CASE_IGNORABLE |
272 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_LOWERCASED |
273 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_UPPERCASED |
274 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_TITLECASED |
275 | | { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded }, |
276 | | { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains }, // UCHAR_CHANGES_WHEN_CASEMAPPED |
277 | | { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }, |
278 | | { 2, U_MASK(UPROPS_2_EMOJI), defaultContains }, |
279 | | { 2, U_MASK(UPROPS_2_EMOJI_PRESENTATION), defaultContains }, |
280 | | { 2, U_MASK(UPROPS_2_EMOJI_MODIFIER), defaultContains }, |
281 | | { 2, U_MASK(UPROPS_2_EMOJI_MODIFIER_BASE), defaultContains }, |
282 | | { 2, U_MASK(UPROPS_2_EMOJI_COMPONENT), defaultContains }, |
283 | | { 2, 0, isRegionalIndicator }, |
284 | | { 1, U_MASK(UPROPS_PREPENDED_CONCATENATION_MARK), defaultContains }, |
285 | | { 2, U_MASK(UPROPS_2_EXTENDED_PICTOGRAPHIC), defaultContains }, |
286 | | }; |
287 | | |
288 | | U_CAPI UBool U_EXPORT2 |
289 | 0 | u_hasBinaryProperty(UChar32 c, UProperty which) { |
290 | 0 | /* c is range-checked in the functions that are called from here */ |
291 | 0 | if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { |
292 | 0 | /* not a known binary property */ |
293 | 0 | return FALSE; |
294 | 0 | } else { |
295 | 0 | const BinaryProperty &prop=binProps[which]; |
296 | 0 | return prop.contains(prop, c, which); |
297 | 0 | } |
298 | 0 | } |
299 | | |
300 | | struct IntProperty; |
301 | | |
302 | | typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which); |
303 | | typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which); |
304 | | |
305 | | struct IntProperty { |
306 | | int32_t column; // SRC_PROPSVEC column, or "source" if mask==0 |
307 | | uint32_t mask; |
308 | | int32_t shift; // =maxValue if getMaxValueFromShift() is used |
309 | | IntPropertyGetValue *getValue; |
310 | | IntPropertyGetMaxValue *getMaxValue; |
311 | | }; |
312 | | |
313 | 0 | static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { |
314 | 0 | /* systematic, directly stored properties */ |
315 | 0 | return (int32_t)(u_getUnicodeProperties(c, prop.column)&prop.mask)>>prop.shift; |
316 | 0 | } |
317 | | |
318 | 0 | static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { |
319 | 0 | return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; |
320 | 0 | } |
321 | | |
322 | 0 | static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { |
323 | 0 | return prop.shift; |
324 | 0 | } |
325 | | |
326 | 0 | static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
327 | 0 | return (int32_t)u_charDirection(c); |
328 | 0 | } |
329 | | |
330 | 0 | static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
331 | 0 | return (int32_t)ubidi_getPairedBracketType(c); |
332 | 0 | } |
333 | | |
334 | 0 | static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
335 | 0 | return ubidi_getMaxValue(which); |
336 | 0 | } |
337 | | |
338 | | #if UCONFIG_NO_NORMALIZATION |
339 | | static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { |
340 | | return 0; |
341 | | } |
342 | | #else |
343 | 0 | static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
344 | 0 | return u_getCombiningClass(c); |
345 | 0 | } |
346 | | #endif |
347 | | |
348 | 0 | static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
349 | 0 | return (int32_t)u_charType(c); |
350 | 0 | } |
351 | | |
352 | 0 | static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
353 | 0 | return ubidi_getJoiningGroup(c); |
354 | 0 | } |
355 | | |
356 | 0 | static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
357 | 0 | return ubidi_getJoiningType(c); |
358 | 0 | } |
359 | | |
360 | 17.0k | static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
361 | 17.0k | int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)); |
362 | 17.0k | return UPROPS_NTV_GET_TYPE(ntv); |
363 | 17.0k | } |
364 | | |
365 | 0 | static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
366 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
367 | 0 | return (int32_t)uscript_getScript(c, &errorCode); |
368 | 0 | } |
369 | | |
370 | | /* |
371 | | * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. |
372 | | * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. |
373 | | */ |
374 | | static const UHangulSyllableType gcbToHst[]={ |
375 | | U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */ |
376 | | U_HST_NOT_APPLICABLE, /* U_GCB_CONTROL */ |
377 | | U_HST_NOT_APPLICABLE, /* U_GCB_CR */ |
378 | | U_HST_NOT_APPLICABLE, /* U_GCB_EXTEND */ |
379 | | U_HST_LEADING_JAMO, /* U_GCB_L */ |
380 | | U_HST_NOT_APPLICABLE, /* U_GCB_LF */ |
381 | | U_HST_LV_SYLLABLE, /* U_GCB_LV */ |
382 | | U_HST_LVT_SYLLABLE, /* U_GCB_LVT */ |
383 | | U_HST_TRAILING_JAMO, /* U_GCB_T */ |
384 | | U_HST_VOWEL_JAMO /* U_GCB_V */ |
385 | | /* |
386 | | * Omit GCB values beyond what we need for hst. |
387 | | * The code below checks for the array length. |
388 | | */ |
389 | | }; |
390 | | |
391 | 0 | static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
392 | 0 | /* see comments on gcbToHst[] above */ |
393 | 0 | int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT; |
394 | 0 | if(gcb<UPRV_LENGTHOF(gcbToHst)) { |
395 | 0 | return gcbToHst[gcb]; |
396 | 0 | } else { |
397 | 0 | return U_HST_NOT_APPLICABLE; |
398 | 0 | } |
399 | 0 | } |
400 | | |
401 | | #if UCONFIG_NO_NORMALIZATION |
402 | | static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { |
403 | | return 0; |
404 | | } |
405 | | #else |
406 | 0 | static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { |
407 | 0 | return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD)); |
408 | 0 | } |
409 | | #endif |
410 | | |
411 | | #if UCONFIG_NO_NORMALIZATION |
412 | | static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { |
413 | | return 0; |
414 | | } |
415 | | #else |
416 | 0 | static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
417 | 0 | return unorm_getFCD16(c)>>8; |
418 | 0 | } |
419 | | #endif |
420 | | |
421 | | #if UCONFIG_NO_NORMALIZATION |
422 | | static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { |
423 | | return 0; |
424 | | } |
425 | | #else |
426 | 0 | static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
427 | 0 | return unorm_getFCD16(c)&0xff; |
428 | 0 | } |
429 | | #endif |
430 | | |
431 | | static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={ |
432 | | /* |
433 | | * column, mask and shift values for int-value properties from u_getUnicodeProperties(). |
434 | | * Must be in order of corresponding UProperty, |
435 | | * and there must be exactly one entry per int UProperty. |
436 | | * |
437 | | * Properties with mask==0 are handled in code. |
438 | | * For them, column is the UPropertySource value. |
439 | | */ |
440 | | { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue }, |
441 | | { 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue }, |
442 | | { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift }, |
443 | | { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue }, |
444 | | { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue }, |
445 | | { UPROPS_SRC_CHAR, 0, (int32_t)U_CHAR_CATEGORY_COUNT-1,getGeneralCategory, getMaxValueFromShift }, |
446 | | { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue }, |
447 | | { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue }, |
448 | | { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue }, |
449 | | { UPROPS_SRC_CHAR, 0, (int32_t)U_NT_COUNT-1, getNumericType, getMaxValueFromShift }, |
450 | | { 0, UPROPS_SCRIPT_MASK, 0, getScript, defaultGetMaxValue }, |
451 | | { UPROPS_SRC_PROPSVEC, 0, (int32_t)U_HST_COUNT-1, getHangulSyllableType, getMaxValueFromShift }, |
452 | | // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" |
453 | | { UPROPS_SRC_NFC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift }, |
454 | | // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes" |
455 | | { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift }, |
456 | | // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE |
457 | | { UPROPS_SRC_NFC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift }, |
458 | | // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE |
459 | | { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift }, |
460 | | { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift }, |
461 | | { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift }, |
462 | | { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue }, |
463 | | { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue }, |
464 | | { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue }, |
465 | | { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue }, |
466 | | }; |
467 | | |
468 | | U_CAPI int32_t U_EXPORT2 |
469 | 17.0k | u_getIntPropertyValue(UChar32 c, UProperty which) { |
470 | 17.0k | if(which<UCHAR_INT_START) { |
471 | 0 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
472 | 0 | const BinaryProperty &prop=binProps[which]; |
473 | 0 | return prop.contains(prop, c, which); |
474 | 0 | } |
475 | 17.0k | } else if(which<UCHAR_INT_LIMIT) { |
476 | 17.0k | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
477 | 17.0k | return prop.getValue(prop, c, which); |
478 | 17.0k | } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { |
479 | 0 | return U_MASK(u_charType(c)); |
480 | 0 | } |
481 | 0 | return 0; // undefined |
482 | 0 | } |
483 | | |
484 | | U_CAPI int32_t U_EXPORT2 |
485 | 0 | u_getIntPropertyMinValue(UProperty /*which*/) { |
486 | 0 | return 0; /* all binary/enum/int properties have a minimum value of 0 */ |
487 | 0 | } |
488 | | |
489 | | U_CAPI int32_t U_EXPORT2 |
490 | 0 | u_getIntPropertyMaxValue(UProperty which) { |
491 | 0 | if(which<UCHAR_INT_START) { |
492 | 0 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
493 | 0 | return 1; // maximum TRUE for all binary properties |
494 | 0 | } |
495 | 0 | } else if(which<UCHAR_INT_LIMIT) { |
496 | 0 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
497 | 0 | return prop.getMaxValue(prop, which); |
498 | 0 | } |
499 | 0 | return -1; // undefined |
500 | 0 | } |
501 | | |
502 | | U_CFUNC UPropertySource U_EXPORT2 |
503 | 0 | uprops_getSource(UProperty which) { |
504 | 0 | if(which<UCHAR_BINARY_START) { |
505 | 0 | return UPROPS_SRC_NONE; /* undefined */ |
506 | 0 | } else if(which<UCHAR_BINARY_LIMIT) { |
507 | 0 | const BinaryProperty &prop=binProps[which]; |
508 | 0 | if(prop.mask!=0) { |
509 | 0 | return UPROPS_SRC_PROPSVEC; |
510 | 0 | } else { |
511 | 0 | return (UPropertySource)prop.column; |
512 | 0 | } |
513 | 0 | } else if(which<UCHAR_INT_START) { |
514 | 0 | return UPROPS_SRC_NONE; /* undefined */ |
515 | 0 | } else if(which<UCHAR_INT_LIMIT) { |
516 | 0 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
517 | 0 | if(prop.mask!=0) { |
518 | 0 | return UPROPS_SRC_PROPSVEC; |
519 | 0 | } else { |
520 | 0 | return (UPropertySource)prop.column; |
521 | 0 | } |
522 | 0 | } else if(which<UCHAR_STRING_START) { |
523 | 0 | switch(which) { |
524 | 0 | case UCHAR_GENERAL_CATEGORY_MASK: |
525 | 0 | case UCHAR_NUMERIC_VALUE: |
526 | 0 | return UPROPS_SRC_CHAR; |
527 | 0 |
|
528 | 0 | default: |
529 | 0 | return UPROPS_SRC_NONE; |
530 | 0 | } |
531 | 0 | } else if(which<UCHAR_STRING_LIMIT) { |
532 | 0 | switch(which) { |
533 | 0 | case UCHAR_AGE: |
534 | 0 | return UPROPS_SRC_PROPSVEC; |
535 | 0 |
|
536 | 0 | case UCHAR_BIDI_MIRRORING_GLYPH: |
537 | 0 | return UPROPS_SRC_BIDI; |
538 | 0 |
|
539 | 0 | case UCHAR_CASE_FOLDING: |
540 | 0 | case UCHAR_LOWERCASE_MAPPING: |
541 | 0 | case UCHAR_SIMPLE_CASE_FOLDING: |
542 | 0 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: |
543 | 0 | case UCHAR_SIMPLE_TITLECASE_MAPPING: |
544 | 0 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: |
545 | 0 | case UCHAR_TITLECASE_MAPPING: |
546 | 0 | case UCHAR_UPPERCASE_MAPPING: |
547 | 0 | return UPROPS_SRC_CASE; |
548 | 0 |
|
549 | 0 | case UCHAR_ISO_COMMENT: |
550 | 0 | case UCHAR_NAME: |
551 | 0 | case UCHAR_UNICODE_1_NAME: |
552 | 0 | return UPROPS_SRC_NAMES; |
553 | 0 |
|
554 | 0 | default: |
555 | 0 | return UPROPS_SRC_NONE; |
556 | 0 | } |
557 | 0 | } else { |
558 | 0 | switch(which) { |
559 | 0 | case UCHAR_SCRIPT_EXTENSIONS: |
560 | 0 | return UPROPS_SRC_PROPSVEC; |
561 | 0 | default: |
562 | 0 | return UPROPS_SRC_NONE; /* undefined */ |
563 | 0 | } |
564 | 0 | } |
565 | 0 | } |
566 | | |
567 | | #if !UCONFIG_NO_NORMALIZATION |
568 | | |
569 | | U_CAPI int32_t U_EXPORT2 |
570 | 0 | u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) { |
571 | 0 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
572 | 0 | return 0; |
573 | 0 | } |
574 | 0 | if(destCapacity<0 || (dest==NULL && destCapacity>0)) { |
575 | 0 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
576 | 0 | return 0; |
577 | 0 | } |
578 | 0 | // Compute the FC_NFKC_Closure on the fly: |
579 | 0 | // We have the API for complete coverage of Unicode properties, although |
580 | 0 | // this value by itself is not useful via API. |
581 | 0 | // (What could be useful is a custom normalization table that combines |
582 | 0 | // case folding and NFKC.) |
583 | 0 | // For the derivation, see Unicode's DerivedNormalizationProps.txt. |
584 | 0 | const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode); |
585 | 0 | if(U_FAILURE(*pErrorCode)) { |
586 | 0 | return 0; |
587 | 0 | } |
588 | 0 | // first: b = NFKC(Fold(a)) |
589 | 0 | UnicodeString folded1String; |
590 | 0 | const UChar *folded1; |
591 | 0 | int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT); |
592 | 0 | if(folded1Length<0) { |
593 | 0 | const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc); |
594 | 0 | if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) { |
595 | 0 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); // c does not change at all under CaseFolding+NFKC |
596 | 0 | } |
597 | 0 | folded1String.setTo(c); |
598 | 0 | } else { |
599 | 0 | if(folded1Length>UCASE_MAX_STRING_LENGTH) { |
600 | 0 | folded1String.setTo(folded1Length); |
601 | 0 | } else { |
602 | 0 | folded1String.setTo(FALSE, folded1, folded1Length); |
603 | 0 | } |
604 | 0 | } |
605 | 0 | UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode); |
606 | 0 | // second: c = NFKC(Fold(b)) |
607 | 0 | UnicodeString folded2String(kc1); |
608 | 0 | UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode); |
609 | 0 | // if (c != b) add the mapping from a to c |
610 | 0 | if(U_FAILURE(*pErrorCode) || kc1==kc2) { |
611 | 0 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); |
612 | 0 | } else { |
613 | 0 | return kc2.extract(dest, destCapacity, *pErrorCode); |
614 | 0 | } |
615 | 0 | } |
616 | | |
617 | | #endif |