/src/icu/source/common/unicode/normalizer2.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2009-2013, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: normalizer2.h |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2009nov22 |
16 | | * created by: Markus W. Scherer |
17 | | */ |
18 | | |
19 | | #ifndef __NORMALIZER2_H__ |
20 | | #define __NORMALIZER2_H__ |
21 | | |
22 | | /** |
23 | | * \file |
24 | | * \brief C++ API: New API for Unicode Normalization. |
25 | | */ |
26 | | |
27 | | #include "unicode/utypes.h" |
28 | | |
29 | | #if !UCONFIG_NO_NORMALIZATION |
30 | | |
31 | | #include "unicode/uniset.h" |
32 | | #include "unicode/unistr.h" |
33 | | #include "unicode/unorm2.h" |
34 | | |
35 | | U_NAMESPACE_BEGIN |
36 | | |
37 | | /** |
38 | | * Unicode normalization functionality for standard Unicode normalization or |
39 | | * for using custom mapping tables. |
40 | | * All instances of this class are unmodifiable/immutable. |
41 | | * Instances returned by getInstance() are singletons that must not be deleted by the caller. |
42 | | * The Normalizer2 class is not intended for public subclassing. |
43 | | * |
44 | | * The primary functions are to produce a normalized string and to detect whether |
45 | | * a string is already normalized. |
46 | | * The most commonly used normalization forms are those defined in |
47 | | * Unicode Standard Annex #15 (UAX #15), https://www.unicode.org/reports/tr15/ |
48 | | * However, this API supports additional normalization forms for specialized purposes. |
49 | | * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode) |
50 | | * and can be used in implementations of UTS #46. |
51 | | * |
52 | | * Not only are the standard compose and decompose modes supplied, |
53 | | * but additional modes are provided as documented in the UNormalization2Mode enum. |
54 | | * |
55 | | * Some of the functions in this class identify normalization boundaries. |
56 | | * At a normalization boundary, the portions of the string |
57 | | * before it and starting from it do not interact and can be handled independently. |
58 | | * |
59 | | * The spanQuickCheckYes() function stops at a normalization boundary. |
60 | | * When the goal is a normalized string, then the text before the boundary |
61 | | * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). |
62 | | * |
63 | | * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether |
64 | | * a character is guaranteed to be at a normalization boundary, |
65 | | * regardless of context. |
66 | | * This is used for moving from one normalization boundary to the next |
67 | | * or preceding boundary, and for performing iterative normalization. |
68 | | * |
69 | | * Iterative normalization is useful when only a small portion of a |
70 | | * longer string needs to be processed. |
71 | | * For example, in ICU, iterative normalization is used by the NormalizationTransliterator |
72 | | * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() |
73 | | * (to process only the substring for which sort key bytes are computed). |
74 | | * |
75 | | * The set of normalization boundaries returned by these functions may not be |
76 | | * complete: There may be more boundaries that could be returned. |
77 | | * Different functions may return different boundaries. |
78 | | * @stable ICU 4.4 |
79 | | */ |
80 | | class U_COMMON_API Normalizer2 : public UObject { |
81 | | public: |
82 | | /** |
83 | | * Destructor. |
84 | | * @stable ICU 4.4 |
85 | | */ |
86 | | ~Normalizer2(); |
87 | | |
88 | | /** |
89 | | * Returns a Normalizer2 instance for Unicode NFC normalization. |
90 | | * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). |
91 | | * Returns an unmodifiable singleton instance. Do not delete it. |
92 | | * @param errorCode Standard ICU error code. Its input value must |
93 | | * pass the U_SUCCESS() test, or else the function returns |
94 | | * immediately. Check for U_FAILURE() on output or use with |
95 | | * function chaining. (See User Guide for details.) |
96 | | * @return the requested Normalizer2, if successful |
97 | | * @stable ICU 49 |
98 | | */ |
99 | | static const Normalizer2 * |
100 | | getNFCInstance(UErrorCode &errorCode); |
101 | | |
102 | | /** |
103 | | * Returns a Normalizer2 instance for Unicode NFD normalization. |
104 | | * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). |
105 | | * Returns an unmodifiable singleton instance. Do not delete it. |
106 | | * @param errorCode Standard ICU error code. Its input value must |
107 | | * pass the U_SUCCESS() test, or else the function returns |
108 | | * immediately. Check for U_FAILURE() on output or use with |
109 | | * function chaining. (See User Guide for details.) |
110 | | * @return the requested Normalizer2, if successful |
111 | | * @stable ICU 49 |
112 | | */ |
113 | | static const Normalizer2 * |
114 | | getNFDInstance(UErrorCode &errorCode); |
115 | | |
116 | | /** |
117 | | * Returns a Normalizer2 instance for Unicode NFKC normalization. |
118 | | * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). |
119 | | * Returns an unmodifiable singleton instance. Do not delete it. |
120 | | * @param errorCode Standard ICU error code. Its input value must |
121 | | * pass the U_SUCCESS() test, or else the function returns |
122 | | * immediately. Check for U_FAILURE() on output or use with |
123 | | * function chaining. (See User Guide for details.) |
124 | | * @return the requested Normalizer2, if successful |
125 | | * @stable ICU 49 |
126 | | */ |
127 | | static const Normalizer2 * |
128 | | getNFKCInstance(UErrorCode &errorCode); |
129 | | |
130 | | /** |
131 | | * Returns a Normalizer2 instance for Unicode NFKD normalization. |
132 | | * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). |
133 | | * Returns an unmodifiable singleton instance. Do not delete it. |
134 | | * @param errorCode Standard ICU error code. Its input value must |
135 | | * pass the U_SUCCESS() test, or else the function returns |
136 | | * immediately. Check for U_FAILURE() on output or use with |
137 | | * function chaining. (See User Guide for details.) |
138 | | * @return the requested Normalizer2, if successful |
139 | | * @stable ICU 49 |
140 | | */ |
141 | | static const Normalizer2 * |
142 | | getNFKDInstance(UErrorCode &errorCode); |
143 | | |
144 | | /** |
145 | | * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. |
146 | | * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). |
147 | | * Returns an unmodifiable singleton instance. Do not delete it. |
148 | | * @param errorCode Standard ICU error code. Its input value must |
149 | | * pass the U_SUCCESS() test, or else the function returns |
150 | | * immediately. Check for U_FAILURE() on output or use with |
151 | | * function chaining. (See User Guide for details.) |
152 | | * @return the requested Normalizer2, if successful |
153 | | * @stable ICU 49 |
154 | | */ |
155 | | static const Normalizer2 * |
156 | | getNFKCCasefoldInstance(UErrorCode &errorCode); |
157 | | |
158 | | /** |
159 | | * Returns a Normalizer2 instance which uses the specified data file |
160 | | * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) |
161 | | * and which composes or decomposes text according to the specified mode. |
162 | | * Returns an unmodifiable singleton instance. Do not delete it. |
163 | | * |
164 | | * Use packageName=NULL for data files that are part of ICU's own data. |
165 | | * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. |
166 | | * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. |
167 | | * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. |
168 | | * |
169 | | * @param packageName NULL for ICU built-in data, otherwise application data package name |
170 | | * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file |
171 | | * @param mode normalization mode (compose or decompose etc.) |
172 | | * @param errorCode Standard ICU error code. Its input value must |
173 | | * pass the U_SUCCESS() test, or else the function returns |
174 | | * immediately. Check for U_FAILURE() on output or use with |
175 | | * function chaining. (See User Guide for details.) |
176 | | * @return the requested Normalizer2, if successful |
177 | | * @stable ICU 4.4 |
178 | | */ |
179 | | static const Normalizer2 * |
180 | | getInstance(const char *packageName, |
181 | | const char *name, |
182 | | UNormalization2Mode mode, |
183 | | UErrorCode &errorCode); |
184 | | |
185 | | /** |
186 | | * Returns the normalized form of the source string in a new UnicodeString (convenience overload that passes a local result string as dest to normalize(src, dest, errorCode)). |
187 | | * @param src source string |
188 | | * @param errorCode Standard ICU error code. Its input value must |
189 | | * pass the U_SUCCESS() test, or else the function returns |
190 | | * immediately. Check for U_FAILURE() on output or use with |
191 | | * function chaining. (See User Guide for details.) |
192 | | * @return normalized src, returned by value |
193 | | * @stable ICU 4.4 |
194 | | */ |
195 | | UnicodeString |
196 | 0 | normalize(const UnicodeString &src, UErrorCode &errorCode) const { |
197 | 0 | UnicodeString result; |
198 | 0 | normalize(src, result, errorCode); |
199 | 0 | return result; |
200 | 0 | } |
201 | | /** |
202 | | * Writes the normalized form of the source string to the destination string |
203 | | * (replacing its contents) and returns the destination string. |
204 | | * The source and destination strings must be different objects. |
205 | | * @param src source string |
206 | | * @param dest destination string; its contents are replaced with normalized src |
207 | | * @param errorCode Standard ICU error code. Its input value must |
208 | | * pass the U_SUCCESS() test, or else the function returns |
209 | | * immediately. Check for U_FAILURE() on output or use with |
210 | | * function chaining. (See User Guide for details.) |
211 | | * @return dest |
212 | | * @stable ICU 4.4 |
213 | | */ |
214 | | virtual UnicodeString & |
215 | | normalize(const UnicodeString &src, |
216 | | UnicodeString &dest, |
217 | | UErrorCode &errorCode) const = 0; |
218 | | /** |
219 | | * Appends the normalized form of the second string to the first string |
220 | | * (merging them at the boundary) and returns the first string. |
221 | | * The result is normalized if the first string was normalized. |
222 | | * The first and second strings must be different objects. |
223 | | * @param first string, should be normalized |
224 | | * @param second string, will be normalized |
225 | | * @param errorCode Standard ICU error code. Its input value must |
226 | | * pass the U_SUCCESS() test, or else the function returns |
227 | | * immediately. Check for U_FAILURE() on output or use with |
228 | | * function chaining. (See User Guide for details.) |
229 | | * @return first |
230 | | * @stable ICU 4.4 |
231 | | */ |
232 | | virtual UnicodeString & |
233 | | normalizeSecondAndAppend(UnicodeString &first, |
234 | | const UnicodeString &second, |
235 | | UErrorCode &errorCode) const = 0; |
236 | | /** |
237 | | * Appends the second string to the first string |
238 | | * (merging them at the boundary) and returns the first string. |
239 | | * The result is normalized if both the strings were normalized. |
240 | | * The first and second strings must be different objects. |
241 | | * @param first string, should be normalized |
242 | | * @param second string, should be normalized |
243 | | * @param errorCode Standard ICU error code. Its input value must |
244 | | * pass the U_SUCCESS() test, or else the function returns |
245 | | * immediately. Check for U_FAILURE() on output or use with |
246 | | * function chaining. (See User Guide for details.) |
247 | | * @return first |
248 | | * @stable ICU 4.4 |
249 | | */ |
250 | | virtual UnicodeString & |
251 | | append(UnicodeString &first, |
252 | | const UnicodeString &second, |
253 | | UErrorCode &errorCode) const = 0; |
254 | | |
255 | | /** |
256 | | * Gets the decomposition mapping of c. |
257 | | * Roughly equivalent to normalizing the String form of c |
258 | | * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, except that this function |
259 | | * returns FALSE and does not write a string |
260 | | * if c does not have a decomposition mapping in this instance's data. |
261 | | * This function is independent of the mode of the Normalizer2. |
262 | | * @param c code point |
263 | | * @param decomposition String object which will be set to c's |
264 | | * decomposition mapping, if there is one. |
265 | | * @return TRUE if c has a decomposition, otherwise FALSE |
266 | | * @stable ICU 4.6 |
267 | | */ |
268 | | virtual UBool |
269 | | getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; |
270 | | |
271 | | /** |
272 | | * Gets the raw decomposition mapping of c. |
273 | | * |
274 | | * This is similar to the getDecomposition() method but returns the |
275 | | * raw decomposition mapping as specified in UnicodeData.txt or |
276 | | * (for custom data) in the mapping files processed by the gennorm2 tool. |
277 | | * By contrast, getDecomposition() returns the processed, |
278 | | * recursively-decomposed version of this mapping. |
279 | | * |
280 | | * When used on a standard NFKC Normalizer2 instance, |
281 | | * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. |
282 | | * |
283 | | * When used on a standard NFC Normalizer2 instance, |
284 | | * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); |
285 | | * in this case, the result contains either one or two code points (=1..4 char16_ts). |
286 | | * |
287 | | * This function is independent of the mode of the Normalizer2. |
288 | | * The default implementation returns FALSE. |
289 | | * @param c code point |
290 | | * @param decomposition String object which will be set to c's |
291 | | * raw decomposition mapping, if there is one. |
292 | | * @return TRUE if c has a decomposition, otherwise FALSE |
293 | | * @stable ICU 49 |
294 | | */ |
295 | | virtual UBool |
296 | | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; |
297 | | |
298 | | /** |
299 | | * Performs pairwise composition of a & b and returns the composite if there is one. |
300 | | * |
301 | | * Returns a composite code point c only if c has a two-way mapping to a+b. |
302 | | * In standard Unicode normalization, this means that |
303 | | * c has a canonical decomposition to a+b |
304 | | * and c does not have the Full_Composition_Exclusion property. |
305 | | * |
306 | | * This function is independent of the mode of the Normalizer2. |
307 | | * The default implementation returns a negative value. |
308 | | * @param a A (normalization starter) code point. |
309 | | * @param b Another code point. |
310 | | * @return The non-negative composite code point if there is one; otherwise a negative value. |
311 | | * @stable ICU 49 |
312 | | */ |
313 | | virtual UChar32 |
314 | | composePair(UChar32 a, UChar32 b) const; |
315 | | |
316 | | /** |
317 | | * Gets the combining class of c. |
318 | | * The default implementation returns 0 |
319 | | * but all standard implementations return the Unicode Canonical_Combining_Class value. |
320 | | * @param c code point |
321 | | * @return c's combining class |
322 | | * @stable ICU 49 |
323 | | */ |
324 | | virtual uint8_t |
325 | | getCombiningClass(UChar32 c) const; |
326 | | |
327 | | /** |
328 | | * Tests if the string is normalized. |
329 | | * Internally, in cases where the quickCheck() method would return "maybe" |
330 | | * (which is only possible for the two COMPOSE modes) this method |
331 | | * resolves to "yes" or "no" to provide a definitive result, |
332 | | * at the cost of doing more work in those cases. |
333 | | * @param s input string |
334 | | * @param errorCode Standard ICU error code. Its input value must |
335 | | * pass the U_SUCCESS() test, or else the function returns |
336 | | * immediately. Check for U_FAILURE() on output or use with |
337 | | * function chaining. (See User Guide for details.) |
338 | | * @return TRUE if s is normalized |
339 | | * @stable ICU 4.4 |
340 | | */ |
341 | | virtual UBool |
342 | | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
343 | | |
344 | | /** |
345 | | * Tests if the string is normalized. |
346 | | * For the two COMPOSE modes, the result could be "maybe" in cases that |
347 | | * would take a little more work to resolve definitively. |
348 | | * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster |
349 | | * combination of quick check + normalization, to avoid |
350 | | * re-checking the "yes" prefix. |
351 | | * @param s input string |
352 | | * @param errorCode Standard ICU error code. Its input value must |
353 | | * pass the U_SUCCESS() test, or else the function returns |
354 | | * immediately. Check for U_FAILURE() on output or use with |
355 | | * function chaining. (See User Guide for details.) |
356 | | * @return UNormalizationCheckResult |
357 | | * @stable ICU 4.4 |
358 | | */ |
359 | | virtual UNormalizationCheckResult |
360 | | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
361 | | |
362 | | /** |
363 | | * Returns the end of the normalized substring of the input string. |
364 | | * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> |
365 | | * the substring <code>UnicodeString(s, 0, end)</code> |
366 | | * will pass the quick check with a "yes" result. |
367 | | * |
368 | | * The returned end index is usually one or more characters before the |
369 | | * "no" or "maybe" character: The end index is at a normalization boundary. |
370 | | * (See the class documentation for more about normalization boundaries.) |
371 | | * |
372 | | * When the goal is a normalized string and most input strings are expected |
373 | | * to be normalized already, then call this method, |
374 | | * and if it returns a prefix shorter than the input string, |
375 | | * copy that prefix and use normalizeSecondAndAppend() for the remainder. |
376 | | * @param s input string |
377 | | * @param errorCode Standard ICU error code. Its input value must |
378 | | * pass the U_SUCCESS() test, or else the function returns |
379 | | * immediately. Check for U_FAILURE() on output or use with |
380 | | * function chaining. (See User Guide for details.) |
381 | | * @return "yes" span end index |
382 | | * @stable ICU 4.4 |
383 | | */ |
384 | | virtual int32_t |
385 | | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
386 | | |
387 | | /** |
388 | | * Tests if the character always has a normalization boundary before it, |
389 | | * regardless of context. |
390 | | * If true, then the character does not normalization-interact with |
391 | | * preceding characters. |
392 | | * In other words, a string containing this character can be normalized |
393 | | * by processing portions before this character and starting from this |
394 | | * character independently. |
395 | | * This is used for iterative normalization. See the class documentation for details. |
396 | | * @param c character to test |
397 | | * @return TRUE if c has a normalization boundary before it |
398 | | * @stable ICU 4.4 |
399 | | */ |
400 | | virtual UBool hasBoundaryBefore(UChar32 c) const = 0; |
401 | | |
402 | | /** |
403 | | * Tests if the character always has a normalization boundary after it, |
404 | | * regardless of context. |
405 | | * If true, then the character does not normalization-interact with |
406 | | * following characters. |
407 | | * In other words, a string containing this character can be normalized |
408 | | * by processing portions up to this character and after this |
409 | | * character independently. |
410 | | * This is used for iterative normalization. See the class documentation for details. |
411 | | * Note that this operation may be significantly slower than hasBoundaryBefore(). |
412 | | * @param c character to test |
413 | | * @return TRUE if c has a normalization boundary after it |
414 | | * @stable ICU 4.4 |
415 | | */ |
416 | | virtual UBool hasBoundaryAfter(UChar32 c) const = 0; |
417 | | |
418 | | /** |
419 | | * Tests if the character is normalization-inert. |
420 | | * If true, then the character does not change, nor normalization-interact with |
421 | | * preceding or following characters. |
422 | | * In other words, a string containing this character can be normalized |
423 | | * by processing portions before this character and after this |
424 | | * character independently. |
425 | | * This is used for iterative normalization. See the class documentation for details. |
426 | | * Note that this operation may be significantly slower than hasBoundaryBefore(). |
427 | | * @param c character to test |
428 | | * @return TRUE if c is normalization-inert |
429 | | * @stable ICU 4.4 |
430 | | */ |
431 | | virtual UBool isInert(UChar32 c) const = 0; |
432 | | }; |
433 | | |
434 | | /** |
435 | | * Normalization filtered by a UnicodeSet. |
436 | | * Normalizes portions of the text contained in the filter set and leaves |
437 | | * portions not contained in the filter set unchanged. |
438 | | * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). |
439 | | * Not-in-the-filter text is treated as "is normalized" and "quick check yes". |
440 | | * This class implements all of (and only) the Normalizer2 API. |
441 | | * An instance of this class is unmodifiable/immutable but is constructed and |
442 | | * must be destructed by the owner. |
443 | | * @stable ICU 4.4 |
444 | | */ |
445 | | class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { |
446 | | public: |
447 | | /** |
448 | | * Constructs a filtered normalizer wrapping any Normalizer2 instance |
449 | | * and a filter set. |
450 | | * Both are aliased (bound to the reference members norm2 and set, not copied) and must not be modified or deleted while this object |
451 | | * is used. |
452 | | * The filter set should be frozen; otherwise the performance will suffer greatly. |
453 | | * @param n2 wrapped Normalizer2 instance |
454 | | * @param filterSet UnicodeSet which determines the characters to be normalized |
455 | | * @stable ICU 4.4 |
456 | | */ |
457 | | FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : |
458 | 0 | norm2(n2), set(filterSet) {} |
459 | | |
460 | | /** |
461 | | * Destructor. |
462 | | * @stable ICU 4.4 |
463 | | */ |
464 | | ~FilteredNormalizer2(); |
465 | | |
466 | | /** |
467 | | * Writes the normalized form of the source string to the destination string |
468 | | * (replacing its contents) and returns the destination string. |
469 | | * The source and destination strings must be different objects. |
470 | | * @param src source string |
471 | | * @param dest destination string; its contents are replaced with normalized src |
472 | | * @param errorCode Standard ICU error code. Its input value must |
473 | | * pass the U_SUCCESS() test, or else the function returns |
474 | | * immediately. Check for U_FAILURE() on output or use with |
475 | | * function chaining. (See User Guide for details.) |
476 | | * @return dest |
477 | | * @stable ICU 4.4 |
478 | | */ |
479 | | virtual UnicodeString & |
480 | | normalize(const UnicodeString &src, |
481 | | UnicodeString &dest, |
482 | | UErrorCode &errorCode) const; |
483 | | /** |
484 | | * Appends the normalized form of the second string to the first string |
485 | | * (merging them at the boundary) and returns the first string. |
486 | | * The result is normalized if the first string was normalized. |
487 | | * The first and second strings must be different objects. |
488 | | * @param first string, should be normalized |
489 | | * @param second string, will be normalized |
490 | | * @param errorCode Standard ICU error code. Its input value must |
491 | | * pass the U_SUCCESS() test, or else the function returns |
492 | | * immediately. Check for U_FAILURE() on output or use with |
493 | | * function chaining. (See User Guide for details.) |
494 | | * @return first |
495 | | * @stable ICU 4.4 |
496 | | */ |
497 | | virtual UnicodeString & |
498 | | normalizeSecondAndAppend(UnicodeString &first, |
499 | | const UnicodeString &second, |
500 | | UErrorCode &errorCode) const; |
501 | | /** |
502 | | * Appends the second string to the first string |
503 | | * (merging them at the boundary) and returns the first string. |
504 | | * The result is normalized if both the strings were normalized. |
505 | | * The first and second strings must be different objects. |
506 | | * @param first string, should be normalized |
507 | | * @param second string, should be normalized |
508 | | * @param errorCode Standard ICU error code. Its input value must |
509 | | * pass the U_SUCCESS() test, or else the function returns |
510 | | * immediately. Check for U_FAILURE() on output or use with |
511 | | * function chaining. (See User Guide for details.) |
512 | | * @return first |
513 | | * @stable ICU 4.4 |
514 | | */ |
515 | | virtual UnicodeString & |
516 | | append(UnicodeString &first, |
517 | | const UnicodeString &second, |
518 | | UErrorCode &errorCode) const; |
519 | | |
520 | | /** |
521 | | * Gets the decomposition mapping of c. |
522 | | * For details see the base class documentation. |
523 | | * |
524 | | * This function is independent of the mode of the Normalizer2. |
525 | | * @param c code point |
526 | | * @param decomposition String object which will be set to c's |
527 | | * decomposition mapping, if there is one. |
528 | | * @return TRUE if c has a decomposition, otherwise FALSE |
529 | | * @stable ICU 4.6 |
530 | | */ |
531 | | virtual UBool |
532 | | getDecomposition(UChar32 c, UnicodeString &decomposition) const; |
533 | | |
534 | | /** |
535 | | * Gets the raw decomposition mapping of c. |
536 | | * For details see the base class documentation. |
537 | | * |
538 | | * This function is independent of the mode of the Normalizer2. |
539 | | * @param c code point |
540 | | * @param decomposition String object which will be set to c's |
541 | | * raw decomposition mapping, if there is one. |
542 | | * @return TRUE if c has a decomposition, otherwise FALSE |
543 | | * @stable ICU 49 |
544 | | */ |
545 | | virtual UBool |
546 | | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; |
547 | | |
548 | | /** |
549 | | * Performs pairwise composition of a & b and returns the composite if there is one. |
550 | | * For details see the base class documentation. |
551 | | * |
552 | | * This function is independent of the mode of the Normalizer2. |
553 | | * @param a A (normalization starter) code point. |
554 | | * @param b Another code point. |
555 | | * @return The non-negative composite code point if there is one; otherwise a negative value. |
556 | | * @stable ICU 49 |
557 | | */ |
558 | | virtual UChar32 |
559 | | composePair(UChar32 a, UChar32 b) const; |
560 | | |
561 | | /** |
562 | | * Gets the combining class of c. |
563 | | * For details see the base class documentation, where the default implementation returns 0 |
564 | | * but all standard implementations return the Unicode Canonical_Combining_Class value. |
565 | | * @param c code point |
566 | | * @return c's combining class |
567 | | * @stable ICU 49 |
568 | | */ |
569 | | virtual uint8_t |
570 | | getCombiningClass(UChar32 c) const; |
571 | | |
572 | | /** |
573 | | * Tests if the string is normalized. |
574 | | * For details see the Normalizer2 base class documentation. |
575 | | * @param s input string |
576 | | * @param errorCode Standard ICU error code. Its input value must |
577 | | * pass the U_SUCCESS() test, or else the function returns |
578 | | * immediately. Check for U_FAILURE() on output or use with |
579 | | * function chaining. (See User Guide for details.) |
580 | | * @return TRUE if s is normalized |
581 | | * @stable ICU 4.4 |
582 | | */ |
583 | | virtual UBool |
584 | | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const; |
585 | | /** |
586 | | * Tests if the string is normalized. |
587 | | * For details see the Normalizer2 base class documentation. |
588 | | * @param s input string |
589 | | * @param errorCode Standard ICU error code. Its input value must |
590 | | * pass the U_SUCCESS() test, or else the function returns |
591 | | * immediately. Check for U_FAILURE() on output or use with |
592 | | * function chaining. (See User Guide for details.) |
593 | | * @return UNormalizationCheckResult |
594 | | * @stable ICU 4.4 |
595 | | */ |
596 | | virtual UNormalizationCheckResult |
597 | | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const; |
598 | | /** |
599 | | * Returns the end of the normalized substring of the input string. |
600 | | * For details see the Normalizer2 base class documentation. |
601 | | * @param s input string |
602 | | * @param errorCode Standard ICU error code. Its input value must |
603 | | * pass the U_SUCCESS() test, or else the function returns |
604 | | * immediately. Check for U_FAILURE() on output or use with |
605 | | * function chaining. (See User Guide for details.) |
606 | | * @return "yes" span end index |
607 | | * @stable ICU 4.4 |
608 | | */ |
609 | | virtual int32_t |
610 | | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const; |
611 | | |
612 | | /** |
613 | | * Tests if the character always has a normalization boundary before it, |
614 | | * regardless of context. |
615 | | * For details see the Normalizer2 base class documentation. |
616 | | * @param c character to test |
617 | | * @return TRUE if c has a normalization boundary before it |
618 | | * @stable ICU 4.4 |
619 | | */ |
620 | | virtual UBool hasBoundaryBefore(UChar32 c) const; |
621 | | |
622 | | /** |
623 | | * Tests if the character always has a normalization boundary after it, |
624 | | * regardless of context. |
625 | | * For details see the Normalizer2 base class documentation. |
626 | | * @param c character to test |
627 | | * @return TRUE if c has a normalization boundary after it |
628 | | * @stable ICU 4.4 |
629 | | */ |
630 | | virtual UBool hasBoundaryAfter(UChar32 c) const; |
631 | | |
632 | | /** |
633 | | * Tests if the character is normalization-inert. |
634 | | * For details see the Normalizer2 base class documentation. |
635 | | * @param c character to test |
636 | | * @return TRUE if c is normalization-inert |
637 | | * @stable ICU 4.4 |
638 | | */ |
639 | | virtual UBool isInert(UChar32 c) const; |
640 | | private: |
641 | | UnicodeString & |
642 | | normalize(const UnicodeString &src, |
643 | | UnicodeString &dest, |
644 | | USetSpanCondition spanCondition, |
645 | | UErrorCode &errorCode) const; |
646 | | |
647 | | UnicodeString & |
648 | | normalizeSecondAndAppend(UnicodeString &first, |
649 | | const UnicodeString &second, |
650 | | UBool doNormalize, |
651 | | UErrorCode &errorCode) const; |
652 | | |
653 | | const Normalizer2 &norm2; |
654 | | const UnicodeSet &set; |
655 | | }; |
656 | | |
657 | | U_NAMESPACE_END |
658 | | |
659 | | #endif // !UCONFIG_NO_NORMALIZATION |
660 | | #endif // __NORMALIZER2_H__ |