/src/icu/icu4c/source/common/unicode/normalizer2.h
Line | Count | Source |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2009-2013, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: normalizer2.h |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2009nov22 |
16 | | * created by: Markus W. Scherer |
17 | | */ |
18 | | |
19 | | #ifndef __NORMALIZER2_H__ |
20 | | #define __NORMALIZER2_H__ |
21 | | |
22 | | /** |
23 | | * \file |
24 | | * \brief C++ API: New API for Unicode Normalization. |
25 | | */ |
26 | | |
27 | | #include "unicode/utypes.h" |
28 | | |
29 | | #if U_SHOW_CPLUSPLUS_API |
30 | | |
31 | | #if !UCONFIG_NO_NORMALIZATION |
32 | | |
33 | | #include "unicode/stringpiece.h" |
34 | | #include "unicode/uniset.h" |
35 | | #include "unicode/unistr.h" |
36 | | #include "unicode/unorm2.h" |
37 | | |
38 | | U_NAMESPACE_BEGIN |
39 | | |
40 | | class ByteSink; |
41 | | |
42 | | /** |
43 | | * Unicode normalization functionality for standard Unicode normalization or |
44 | | * for using custom mapping tables. |
45 | | * All instances of this class are unmodifiable/immutable. |
46 | | * Instances returned by getInstance() are singletons that must not be deleted by the caller. |
47 | | * The Normalizer2 class is not intended for public subclassing. |
48 | | * |
49 | | * The primary functions are to produce a normalized string and to detect whether |
50 | | * a string is already normalized. |
51 | | * The most commonly used normalization forms are those defined in |
52 | | * http://www.unicode.org/unicode/reports/tr15/ |
53 | | * However, this API supports additional normalization forms for specialized purposes. |
54 | | * For example, NFKC_Casefold is provided via getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode) |
55 | | * and can be used in implementations of UTS #46. |
56 | | * |
57 | | * Not only are the standard compose and decompose modes supplied, |
58 | | * but additional modes are provided as documented in the Mode enum. |
59 | | * |
60 | | * Some of the functions in this class identify normalization boundaries. |
61 | | * At a normalization boundary, the portions of the string |
62 | | * before it and starting from it do not interact and can be handled independently. |
63 | | * |
64 | | * The spanQuickCheckYes() stops at a normalization boundary. |
65 | | * When the goal is a normalized string, then the text before the boundary |
66 | | * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). |
67 | | * |
68 | | * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether |
69 | | * a character is guaranteed to be at a normalization boundary, |
70 | | * regardless of context. |
71 | | * This is used for moving from one normalization boundary to the next |
72 | | * or preceding boundary, and for performing iterative normalization. |
73 | | * |
74 | | * Iterative normalization is useful when only a small portion of a |
75 | | * longer string needs to be processed. |
76 | | * For example, in ICU, iterative normalization is used by the NormalizationTransliterator |
77 | | * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() |
78 | | * (to process only the substring for which sort key bytes are computed). |
79 | | * |
80 | | * The set of normalization boundaries returned by these functions may not be |
81 | | * complete: There may be more boundaries that could be returned. |
82 | | * Different functions may return different boundaries. |
83 | | * @stable ICU 4.4 |
84 | | */ |
85 | | class U_COMMON_API Normalizer2 : public UObject { |
86 | | public: |
87 | | /** |
88 | | * Destructor. |
89 | | * @stable ICU 4.4 |
90 | | */ |
91 | | ~Normalizer2(); |
92 | | |
93 | | /** |
94 | | * Returns a Normalizer2 instance for Unicode NFC normalization. |
95 | | * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode). |
96 | | * Returns an unmodifiable singleton instance. Do not delete it. |
97 | | * @param errorCode Standard ICU error code. Its input value must |
98 | | * pass the U_SUCCESS() test, or else the function returns |
99 | | * immediately. Check for U_FAILURE() on output or use with |
100 | | * function chaining. (See User Guide for details.) |
101 | | * @return the requested Normalizer2, if successful |
102 | | * @stable ICU 49 |
103 | | */ |
104 | | static const Normalizer2 * |
105 | | getNFCInstance(UErrorCode &errorCode); |
106 | | |
107 | | /** |
108 | | * Returns a Normalizer2 instance for Unicode NFD normalization. |
109 | | * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode). |
110 | | * Returns an unmodifiable singleton instance. Do not delete it. |
111 | | * @param errorCode Standard ICU error code. Its input value must |
112 | | * pass the U_SUCCESS() test, or else the function returns |
113 | | * immediately. Check for U_FAILURE() on output or use with |
114 | | * function chaining. (See User Guide for details.) |
115 | | * @return the requested Normalizer2, if successful |
116 | | * @stable ICU 49 |
117 | | */ |
118 | | static const Normalizer2 * |
119 | | getNFDInstance(UErrorCode &errorCode); |
120 | | |
121 | | /** |
122 | | * Returns a Normalizer2 instance for Unicode NFKC normalization. |
123 | | * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode). |
124 | | * Returns an unmodifiable singleton instance. Do not delete it. |
125 | | * @param errorCode Standard ICU error code. Its input value must |
126 | | * pass the U_SUCCESS() test, or else the function returns |
127 | | * immediately. Check for U_FAILURE() on output or use with |
128 | | * function chaining. (See User Guide for details.) |
129 | | * @return the requested Normalizer2, if successful |
130 | | * @stable ICU 49 |
131 | | */ |
132 | | static const Normalizer2 * |
133 | | getNFKCInstance(UErrorCode &errorCode); |
134 | | |
135 | | /** |
136 | | * Returns a Normalizer2 instance for Unicode NFKD normalization. |
137 | | * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode). |
138 | | * Returns an unmodifiable singleton instance. Do not delete it. |
139 | | * @param errorCode Standard ICU error code. Its input value must |
140 | | * pass the U_SUCCESS() test, or else the function returns |
141 | | * immediately. Check for U_FAILURE() on output or use with |
142 | | * function chaining. (See User Guide for details.) |
143 | | * @return the requested Normalizer2, if successful |
144 | | * @stable ICU 49 |
145 | | */ |
146 | | static const Normalizer2 * |
147 | | getNFKDInstance(UErrorCode &errorCode); |
148 | | |
149 | | /** |
150 | | * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization |
151 | | * which is equivalent to applying the NFKC_Casefold mappings and then NFC. |
152 | | * See https://www.unicode.org/reports/tr44/#NFKC_Casefold |
153 | | * |
154 | | * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode). |
155 | | * Returns an unmodifiable singleton instance. Do not delete it. |
156 | | * @param errorCode Standard ICU error code. Its input value must |
157 | | * pass the U_SUCCESS() test, or else the function returns |
158 | | * immediately. Check for U_FAILURE() on output or use with |
159 | | * function chaining. (See User Guide for details.) |
160 | | * @return the requested Normalizer2, if successful |
161 | | * @stable ICU 49 |
162 | | */ |
163 | | static const Normalizer2 * |
164 | | getNFKCCasefoldInstance(UErrorCode &errorCode); |
165 | | |
166 | | /** |
167 | | * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization |
168 | | * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC. |
169 | | * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold |
170 | | * |
171 | | * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode). |
172 | | * Returns an unmodifiable singleton instance. Do not delete it. |
173 | | * @param errorCode Standard ICU error code. Its input value must |
174 | | * pass the U_SUCCESS() test, or else the function returns |
175 | | * immediately. Check for U_FAILURE() on output or use with |
176 | | * function chaining. (See User Guide for details.) |
177 | | * @return the requested Normalizer2, if successful |
178 | | * @stable ICU 74 |
179 | | */ |
180 | | static const Normalizer2 * |
181 | | getNFKCSimpleCasefoldInstance(UErrorCode &errorCode); |
182 | | |
183 | | /** |
184 | | * Returns a Normalizer2 instance which uses the specified data file |
185 | | * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) |
186 | | * and which composes or decomposes text according to the specified mode. |
187 | | * Returns an unmodifiable singleton instance. Do not delete it. |
188 | | * |
189 | | * Use packageName=nullptr for data files that are part of ICU's own data. |
190 | | * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. |
191 | | * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. |
192 | | * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold, or name="nfkc_scf" and UNORM2_COMPOSE for NFKC_Simple_Casefold. |
193 | | * |
194 | | * @param packageName nullptr for ICU built-in data, otherwise application data package name |
195 | | * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file |
196 | | * @param mode normalization mode (compose or decompose etc.) |
197 | | * @param errorCode Standard ICU error code. Its input value must |
198 | | * pass the U_SUCCESS() test, or else the function returns |
199 | | * immediately. Check for U_FAILURE() on output or use with |
200 | | * function chaining. (See User Guide for details.) |
201 | | * @return the requested Normalizer2, if successful |
202 | | * @stable ICU 4.4 |
203 | | */ |
204 | | static const Normalizer2 * |
205 | | getInstance(const char *packageName, |
206 | | const char *name, |
207 | | UNormalization2Mode mode, |
208 | | UErrorCode &errorCode); |
209 | | |
210 | | /** |
211 | | * Returns the normalized form of the source string. |
212 | | * @param src source string |
213 | | * @param errorCode Standard ICU error code. Its input value must |
214 | | * pass the U_SUCCESS() test, or else the function returns |
215 | | * immediately. Check for U_FAILURE() on output or use with |
216 | | * function chaining. (See User Guide for details.) |
217 | | * @return normalized src |
218 | | * @stable ICU 4.4 |
219 | | */ |
220 | | UnicodeString |
221 | 5.82M | normalize(const UnicodeString &src, UErrorCode &errorCode) const { |
222 | 5.82M | UnicodeString result; |
223 | 5.82M | normalize(src, result, errorCode); |
224 | 5.82M | return result; |
225 | 5.82M | } |
226 | | /** |
227 | | * Writes the normalized form of the source string to the destination string |
228 | | * (replacing its contents) and returns the destination string. |
229 | | * The source and destination strings must be different objects. |
230 | | * @param src source string |
231 | | * @param dest destination string; its contents are replaced with normalized src |
232 | | * @param errorCode Standard ICU error code. Its input value must |
233 | | * pass the U_SUCCESS() test, or else the function returns |
234 | | * immediately. Check for U_FAILURE() on output or use with |
235 | | * function chaining. (See User Guide for details.) |
236 | | * @return dest |
237 | | * @stable ICU 4.4 |
238 | | */ |
239 | | virtual UnicodeString & |
240 | | normalize(const UnicodeString &src, |
241 | | UnicodeString &dest, |
242 | | UErrorCode &errorCode) const = 0; |
243 | | |
244 | | /** |
245 | | * Normalizes a UTF-8 string and optionally records how source substrings |
246 | | * relate to changed and unchanged result substrings. |
247 | | * |
248 | | * Implemented completely for all built-in modes except for FCD. |
249 | | * The base class implementation converts to & from UTF-16 and does not support edits. |
250 | | * |
251 | | * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. |
252 | | * @param src Source UTF-8 string. |
253 | | * @param sink A ByteSink to which the normalized UTF-8 result string is written. |
254 | | * sink.Flush() is called at the end. |
255 | | * @param edits Records edits for index mapping, working with styled text, |
256 | | * and getting only changes (if any). |
257 | | * The contents of the Edits object are undefined if any error occurs. |
258 | | * This function calls edits->reset() first unless |
259 | | * options includes U_EDITS_NO_RESET. edits can be nullptr. |
260 | | * @param errorCode Standard ICU error code. Its input value must |
261 | | * pass the U_SUCCESS() test, or else the function returns |
262 | | * immediately. Check for U_FAILURE() on output or use with |
263 | | * function chaining. (See User Guide for details.) |
264 | | * @stable ICU 60 |
265 | | */ |
266 | | virtual void |
267 | | normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
268 | | Edits *edits, UErrorCode &errorCode) const; |
269 | | |
270 | | /** |
271 | | * Appends the normalized form of the second string to the first string |
272 | | * (merging them at the boundary) and returns the first string. |
273 | | * The result is normalized if the first string was normalized. |
274 | | * The first and second strings must be different objects. |
275 | | * @param first string, should be normalized |
276 | | * @param second string, will be normalized |
277 | | * @param errorCode Standard ICU error code. Its input value must |
278 | | * pass the U_SUCCESS() test, or else the function returns |
279 | | * immediately. Check for U_FAILURE() on output or use with |
280 | | * function chaining. (See User Guide for details.) |
281 | | * @return first |
282 | | * @stable ICU 4.4 |
283 | | */ |
284 | | virtual UnicodeString & |
285 | | normalizeSecondAndAppend(UnicodeString &first, |
286 | | const UnicodeString &second, |
287 | | UErrorCode &errorCode) const = 0; |
288 | | /** |
289 | | * Appends the second string to the first string |
290 | | * (merging them at the boundary) and returns the first string. |
291 | | * The result is normalized if both strings were normalized. |
292 | | * The first and second strings must be different objects. |
293 | | * @param first string, should be normalized |
294 | | * @param second string, should be normalized |
295 | | * @param errorCode Standard ICU error code. Its input value must |
296 | | * pass the U_SUCCESS() test, or else the function returns |
297 | | * immediately. Check for U_FAILURE() on output or use with |
298 | | * function chaining. (See User Guide for details.) |
299 | | * @return first |
300 | | * @stable ICU 4.4 |
301 | | */ |
302 | | virtual UnicodeString & |
303 | | append(UnicodeString &first, |
304 | | const UnicodeString &second, |
305 | | UErrorCode &errorCode) const = 0; |
306 | | |
307 | | /** |
308 | | * Gets the decomposition mapping of c. |
309 | | * Roughly equivalent to normalizing the String form of c |
310 | | * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, except that this function |
311 | | * returns false and does not write a string |
312 | | * if c does not have a decomposition mapping in this instance's data. |
313 | | * This function is independent of the mode of the Normalizer2. |
314 | | * @param c code point |
315 | | * @param decomposition String object which will be set to c's |
316 | | * decomposition mapping, if there is one. |
317 | | * @return true if c has a decomposition, otherwise false |
318 | | * @stable ICU 4.6 |
319 | | */ |
320 | | virtual UBool |
321 | | getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; |
322 | | |
323 | | /** |
324 | | * Gets the raw decomposition mapping of c. |
325 | | * |
326 | | * This is similar to the getDecomposition() method but returns the |
327 | | * raw decomposition mapping as specified in UnicodeData.txt or |
328 | | * (for custom data) in the mapping files processed by the gennorm2 tool. |
329 | | * By contrast, getDecomposition() returns the processed, |
330 | | * recursively-decomposed version of this mapping. |
331 | | * |
332 | | * When used on a standard NFKC Normalizer2 instance, |
333 | | * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. |
334 | | * |
335 | | * When used on a standard NFC Normalizer2 instance, |
336 | | * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); |
337 | | * in this case, the result contains either one or two code points (=1..4 char16_ts). |
338 | | * |
339 | | * This function is independent of the mode of the Normalizer2. |
340 | | * The default implementation returns false. |
341 | | * @param c code point |
342 | | * @param decomposition String object which will be set to c's |
343 | | * raw decomposition mapping, if there is one. |
344 | | * @return true if c has a decomposition, otherwise false |
345 | | * @stable ICU 49 |
346 | | */ |
347 | | virtual UBool |
348 | | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; |
349 | | |
350 | | /** |
351 | | * Performs pairwise composition of a & b and returns the composite if there is one. |
352 | | * |
353 | | * Returns a composite code point c only if c has a two-way mapping to a+b. |
354 | | * In standard Unicode normalization, this means that |
355 | | * c has a canonical decomposition to a+b |
356 | | * and c does not have the Full_Composition_Exclusion property. |
357 | | * |
358 | | * This function is independent of the mode of the Normalizer2. |
359 | | * The default implementation returns a negative value. |
360 | | * @param a A (normalization starter) code point. |
361 | | * @param b Another code point. |
362 | | * @return The non-negative composite code point if there is one; otherwise a negative value. |
363 | | * @stable ICU 49 |
364 | | */ |
365 | | virtual UChar32 |
366 | | composePair(UChar32 a, UChar32 b) const; |
367 | | |
368 | | /** |
369 | | * Gets the combining class of c. |
370 | | * The default implementation returns 0 |
371 | | * but all standard implementations return the Unicode Canonical_Combining_Class value. |
372 | | * @param c code point |
373 | | * @return c's combining class |
374 | | * @stable ICU 49 |
375 | | */ |
376 | | virtual uint8_t |
377 | | getCombiningClass(UChar32 c) const; |
378 | | |
379 | | /** |
380 | | * Tests if the string is normalized. |
381 | | * Internally, in cases where the quickCheck() method would return "maybe" |
382 | | * (which is only possible for the two COMPOSE modes) this method |
383 | | * resolves to "yes" or "no" to provide a definitive result, |
384 | | * at the cost of doing more work in those cases. |
385 | | * @param s input string |
386 | | * @param errorCode Standard ICU error code. Its input value must |
387 | | * pass the U_SUCCESS() test, or else the function returns |
388 | | * immediately. Check for U_FAILURE() on output or use with |
389 | | * function chaining. (See User Guide for details.) |
390 | | * @return true if s is normalized |
391 | | * @stable ICU 4.4 |
392 | | */ |
393 | | virtual UBool |
394 | | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
395 | | /** |
396 | | * Tests if the UTF-8 string is normalized. |
397 | | * Internally, in cases where the quickCheck() method would return "maybe" |
398 | | * (which is only possible for the two COMPOSE modes) this method |
399 | | * resolves to "yes" or "no" to provide a definitive result, |
400 | | * at the cost of doing more work in those cases. |
401 | | * |
402 | | * This works for all normalization modes. |
403 | | * It is optimized for UTF-8 for all built-in modes except for FCD. |
404 | | * The base class implementation converts to UTF-16 and calls isNormalized(). |
405 | | * |
406 | | * @param s UTF-8 input string |
407 | | * @param errorCode Standard ICU error code. Its input value must |
408 | | * pass the U_SUCCESS() test, or else the function returns |
409 | | * immediately. Check for U_FAILURE() on output or use with |
410 | | * function chaining. (See User Guide for details.) |
411 | | * @return true if s is normalized |
412 | | * @stable ICU 60 |
413 | | */ |
414 | | virtual UBool |
415 | | isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; |
416 | | |
417 | | |
418 | | /** |
419 | | * Tests if the string is normalized. |
420 | | * For the two COMPOSE modes, the result could be "maybe" in cases that |
421 | | * would take a little more work to resolve definitively. |
422 | | * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster |
423 | | * combination of quick check + normalization, to avoid |
424 | | * re-checking the "yes" prefix. |
425 | | * @param s input string |
426 | | * @param errorCode Standard ICU error code. Its input value must |
427 | | * pass the U_SUCCESS() test, or else the function returns |
428 | | * immediately. Check for U_FAILURE() on output or use with |
429 | | * function chaining. (See User Guide for details.) |
430 | | * @return the quick check result as a UNormalizationCheckResult ("yes", "no" or "maybe") |
431 | | * @stable ICU 4.4 |
432 | | */ |
433 | | virtual UNormalizationCheckResult |
434 | | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
435 | | |
436 | | /** |
437 | | * Returns the end of the normalized substring of the input string. |
438 | | * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> |
439 | | * the substring <code>UnicodeString(s, 0, end)</code> |
440 | | * will pass the quick check with a "yes" result. |
441 | | * |
442 | | * The returned end index is usually one or more characters before the |
443 | | * "no" or "maybe" character: The end index is at a normalization boundary. |
444 | | * (See the class documentation for more about normalization boundaries.) |
445 | | * |
446 | | * When the goal is a normalized string and most input strings are expected |
447 | | * to be normalized already, then call this method, |
448 | | * and if it returns a prefix shorter than the input string, |
449 | | * copy that prefix and use normalizeSecondAndAppend() for the remainder. |
450 | | * @param s input string |
451 | | * @param errorCode Standard ICU error code. Its input value must |
452 | | * pass the U_SUCCESS() test, or else the function returns |
453 | | * immediately. Check for U_FAILURE() on output or use with |
454 | | * function chaining. (See User Guide for details.) |
455 | | * @return "yes" span end index |
456 | | * @stable ICU 4.4 |
457 | | */ |
458 | | virtual int32_t |
459 | | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
460 | | |
461 | | /** |
462 | | * Tests if the character always has a normalization boundary before it, |
463 | | * regardless of context. |
464 | | * If true, then the character does not normalization-interact with |
465 | | * preceding characters. |
466 | | * In other words, a string containing this character can be normalized |
467 | | * by processing portions before this character and starting from this |
468 | | * character independently. |
469 | | * This is used for iterative normalization. See the class documentation for details. |
470 | | * @param c character to test |
471 | | * @return true if c has a normalization boundary before it |
472 | | * @stable ICU 4.4 |
473 | | */ |
474 | | virtual UBool hasBoundaryBefore(UChar32 c) const = 0; |
475 | | |
476 | | /** |
477 | | * Tests if the character always has a normalization boundary after it, |
478 | | * regardless of context. |
479 | | * If true, then the character does not normalization-interact with |
480 | | * following characters. |
481 | | * In other words, a string containing this character can be normalized |
482 | | * by processing portions up to this character and after this |
483 | | * character independently. |
484 | | * This is used for iterative normalization. See the class documentation for details. |
485 | | * Note that this operation may be significantly slower than hasBoundaryBefore(). |
486 | | * @param c character to test |
487 | | * @return true if c has a normalization boundary after it |
488 | | * @stable ICU 4.4 |
489 | | */ |
490 | | virtual UBool hasBoundaryAfter(UChar32 c) const = 0; |
491 | | |
492 | | /** |
493 | | * Tests if the character is normalization-inert. |
494 | | * If true, then the character does not change, nor normalization-interact with |
495 | | * preceding or following characters. |
496 | | * In other words, a string containing this character can be normalized |
497 | | * by processing portions before this character and after this |
498 | | * character independently. |
499 | | * This is used for iterative normalization. See the class documentation for details. |
500 | | * Note that this operation may be significantly slower than hasBoundaryBefore(). |
501 | | * @param c character to test |
502 | | * @return true if c is normalization-inert |
503 | | * @stable ICU 4.4 |
504 | | */ |
505 | | virtual UBool isInert(UChar32 c) const = 0; |
506 | | }; |
507 | | |
508 | | /** |
509 | | * Normalization filtered by a UnicodeSet. |
510 | | * Normalizes portions of the text contained in the filter set and leaves |
511 | | * portions not contained in the filter set unchanged. |
512 | | * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). |
513 | | * Not-in-the-filter text is treated as "is normalized" and "quick check yes". |
514 | | * This class implements all of (and only) the Normalizer2 API. |
515 | | * An instance of this class is unmodifiable/immutable but is constructed and |
516 | | * must be destructed by the owner. |
517 | | * @stable ICU 4.4 |
518 | | */ |
519 | | class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { |
520 | | public: |
521 | | /** |
522 | | * Constructs a filtered normalizer wrapping any Normalizer2 instance |
523 | | * and a filter set. |
524 | | * Both are aliased and must not be modified or deleted while this object |
525 | | * is used. |
526 | | * The filter set should be frozen; otherwise the performance will suffer greatly. |
527 | | * @param n2 wrapped Normalizer2 instance |
528 | | * @param filterSet UnicodeSet which determines the characters to be normalized |
529 | | * @stable ICU 4.4 |
530 | | */ |
531 | | FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : |
532 | 0 | norm2(n2), set(filterSet) {} |
533 | | |
534 | | /** |
535 | | * Destructor. |
536 | | * @stable ICU 4.4 |
537 | | */ |
538 | | ~FilteredNormalizer2(); |
539 | | |
540 | | /** |
541 | | * Writes the normalized form of the source string to the destination string |
542 | | * (replacing its contents) and returns the destination string. |
543 | | * The source and destination strings must be different objects. |
544 | | * @param src source string |
545 | | * @param dest destination string; its contents are replaced with normalized src |
546 | | * @param errorCode Standard ICU error code. Its input value must |
547 | | * pass the U_SUCCESS() test, or else the function returns |
548 | | * immediately. Check for U_FAILURE() on output or use with |
549 | | * function chaining. (See User Guide for details.) |
550 | | * @return dest |
551 | | * @stable ICU 4.4 |
552 | | */ |
553 | | virtual UnicodeString & |
554 | | normalize(const UnicodeString &src, |
555 | | UnicodeString &dest, |
556 | | UErrorCode &errorCode) const override; |
557 | | |
558 | | /** |
559 | | * Normalizes a UTF-8 string and optionally records how source substrings |
560 | | * relate to changed and unchanged result substrings. |
561 | | * |
562 | | * Implemented completely for most built-in modes except for FCD. |
563 | | * The base class implementation converts to & from UTF-16 and does not support edits. |
564 | | * |
565 | | * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. |
566 | | * @param src Source UTF-8 string. |
567 | | * @param sink A ByteSink to which the normalized UTF-8 result string is written. |
568 | | * sink.Flush() is called at the end. |
569 | | * @param edits Records edits for index mapping, working with styled text, |
570 | | * and getting only changes (if any). |
571 | | * The contents of the Edits object are undefined if any error occurs. |
572 | | * This function calls edits->reset() first unless |
573 | | * options includes U_EDITS_NO_RESET. edits can be nullptr. |
574 | | * @param errorCode Standard ICU error code. Its input value must |
575 | | * pass the U_SUCCESS() test, or else the function returns |
576 | | * immediately. Check for U_FAILURE() on output or use with |
577 | | * function chaining. (See User Guide for details.) |
578 | | * @stable ICU 60 |
579 | | */ |
580 | | virtual void |
581 | | normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
582 | | Edits *edits, UErrorCode &errorCode) const override; |
583 | | |
584 | | /** |
585 | | * Appends the normalized form of the second string to the first string |
586 | | * (merging them at the boundary) and returns the first string. |
587 | | * The result is normalized if the first string was normalized. |
588 | | * The first and second strings must be different objects. |
589 | | * @param first string, should be normalized |
590 | | * @param second string, will be normalized |
591 | | * @param errorCode Standard ICU error code. Its input value must |
592 | | * pass the U_SUCCESS() test, or else the function returns |
593 | | * immediately. Check for U_FAILURE() on output or use with |
594 | | * function chaining. (See User Guide for details.) |
595 | | * @return first |
596 | | * @stable ICU 4.4 |
597 | | */ |
598 | | virtual UnicodeString & |
599 | | normalizeSecondAndAppend(UnicodeString &first, |
600 | | const UnicodeString &second, |
601 | | UErrorCode &errorCode) const override; |
602 | | /** |
603 | | * Appends the second string to the first string |
604 | | * (merging them at the boundary) and returns the first string. |
605 | | * The result is normalized if both strings were normalized. |
606 | | * The first and second strings must be different objects. |
607 | | * @param first string, should be normalized |
608 | | * @param second string, should be normalized |
609 | | * @param errorCode Standard ICU error code. Its input value must |
610 | | * pass the U_SUCCESS() test, or else the function returns |
611 | | * immediately. Check for U_FAILURE() on output or use with |
612 | | * function chaining. (See User Guide for details.) |
613 | | * @return first |
614 | | * @stable ICU 4.4 |
615 | | */ |
616 | | virtual UnicodeString & |
617 | | append(UnicodeString &first, |
618 | | const UnicodeString &second, |
619 | | UErrorCode &errorCode) const override; |
620 | | |
621 | | /** |
622 | | * Gets the decomposition mapping of c. |
623 | | * For details see the base class documentation. |
624 | | * |
625 | | * This function is independent of the mode of the Normalizer2. |
626 | | * @param c code point |
627 | | * @param decomposition String object which will be set to c's |
628 | | * decomposition mapping, if there is one. |
629 | | * @return true if c has a decomposition, otherwise false |
630 | | * @stable ICU 4.6 |
631 | | */ |
632 | | virtual UBool |
633 | | getDecomposition(UChar32 c, UnicodeString &decomposition) const override; |
634 | | |
635 | | /** |
636 | | * Gets the raw decomposition mapping of c. |
637 | | * For details see the Normalizer2 base class documentation. |
638 | | * |
639 | | * This function is independent of the mode of the Normalizer2. |
640 | | * @param c code point whose raw decomposition mapping is requested |
641 | | * @param decomposition String object which will be set to c's |
642 | | * raw decomposition mapping, if there is one. |
643 | | * @return true if c has a decomposition, otherwise false |
644 | | * @stable ICU 49 |
645 | | */ |
646 | | virtual UBool |
647 | | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override; |
648 | | |
649 | | /** |
650 | | * Performs pairwise composition of a and b and returns the composite if there is one. |
651 | | * For details see the Normalizer2 base class documentation. |
652 | | * |
653 | | * This function is independent of the mode of the Normalizer2. |
654 | | * @param a A (normalization starter) code point. |
655 | | * @param b Another code point. |
656 | | * @return The non-negative composite code point if there is one; otherwise a negative value. |
657 | | * @stable ICU 49 |
658 | | */ |
659 | | virtual UChar32 |
660 | | composePair(UChar32 a, UChar32 b) const override; |
661 | | |
662 | | /** |
663 | | * Gets the combining class of c. Overrides the base class method. |
664 | | * The default implementation returns 0 |
665 | | * but all standard implementations return the Unicode Canonical_Combining_Class value. |
666 | | * @param c code point |
667 | | * @return c's combining class, as an unsigned 8-bit value |
668 | | * @stable ICU 49 |
669 | | */ |
670 | | virtual uint8_t |
671 | | getCombiningClass(UChar32 c) const override; |
672 | | |
673 | | /** |
674 | | * Tests if the string is normalized. |
675 | | * For details see the Normalizer2 base class documentation. |
676 | | * @param s input string; not modified |
677 | | * @param errorCode Standard ICU error code. Its input value must |
678 | | * pass the U_SUCCESS() test, or else the function returns |
679 | | * immediately. Check for U_FAILURE() on output or use with |
680 | | * function chaining. (See User Guide for details.) |
681 | | * @return true if s is normalized, otherwise false |
682 | | * @stable ICU 4.4 |
683 | | */ |
684 | | virtual UBool |
685 | | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override; |
686 | | /** |
687 | | * Tests if the UTF-8 string is normalized. |
688 | | * Internally, in cases where the quickCheck() method would return "maybe" |
689 | | * (which is only possible for the two COMPOSE modes) this method |
690 | | * resolves to "yes" or "no" to provide a definitive result, |
691 | | * at the cost of doing more work in those cases. |
692 | | * |
693 | | * This works for all normalization modes. |
694 | | * It is optimized for UTF-8 for all built-in modes except for FCD. |
695 | | * The base class implementation converts to UTF-16 and calls isNormalized(). |
696 | | * |
697 | | * @param s UTF-8 input string, passed by value as a StringPiece |
698 | | * @param errorCode Standard ICU error code. Its input value must |
699 | | * pass the U_SUCCESS() test, or else the function returns |
700 | | * immediately. Check for U_FAILURE() on output or use with |
701 | | * function chaining. (See User Guide for details.) |
702 | | * @return true if s is normalized, otherwise false (never "maybe") |
703 | | * @stable ICU 60 |
704 | | */ |
705 | | virtual UBool |
706 | | isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override; |
707 | | /** |
708 | | * Performs a quick check of whether the string is normalized. |
709 | | * For details see the Normalizer2 base class documentation. |
710 | | * @param s input string; not modified |
711 | | * @param errorCode Standard ICU error code. Its input value must |
712 | | * pass the U_SUCCESS() test, or else the function returns |
713 | | * immediately. Check for U_FAILURE() on output or use with |
714 | | * function chaining. (See User Guide for details.) |
715 | | * @return UNormalizationCheckResult ("maybe" is only possible for the two COMPOSE modes) |
716 | | * @stable ICU 4.4 |
717 | | */ |
718 | | virtual UNormalizationCheckResult |
719 | | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override; |
720 | | /** |
721 | | * Returns the end of the normalized substring of the input string. |
722 | | * For details see the Normalizer2 base class documentation. |
723 | | * @param s input string; not modified |
724 | | * @param errorCode Standard ICU error code. Its input value must |
725 | | * pass the U_SUCCESS() test, or else the function returns |
726 | | * immediately. Check for U_FAILURE() on output or use with |
727 | | * function chaining. (See User Guide for details.) |
728 | | * @return "yes" span end index (an index into s) |
729 | | * @stable ICU 4.4 |
730 | | */ |
731 | | virtual int32_t |
732 | | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override; |
733 | | |
734 | | /** |
735 | | * Tests if the character always has a normalization boundary before it, |
736 | | * regardless of context. |
737 | | * For details see the Normalizer2 base class documentation. |
738 | | * @param c character (code point) to test |
739 | | * @return true if c has a normalization boundary before it, otherwise false |
740 | | * @stable ICU 4.4 |
741 | | */ |
742 | | virtual UBool hasBoundaryBefore(UChar32 c) const override; |
743 | | |
744 | | /** |
745 | | * Tests if the character always has a normalization boundary after it, |
746 | | * regardless of context. |
747 | | * For details see the Normalizer2 base class documentation. |
748 | | * @param c character (code point) to test |
749 | | * @return true if c has a normalization boundary after it, otherwise false |
750 | | * @stable ICU 4.4 |
751 | | */ |
752 | | virtual UBool hasBoundaryAfter(UChar32 c) const override; |
753 | | |
754 | | /** |
755 | | * Tests if the character is normalization-inert. |
756 | | * For details see the Normalizer2 base class documentation. |
757 | | * @param c character (code point) to test |
758 | | * @return true if c is normalization-inert, otherwise false |
759 | | * @stable ICU 4.4 |
760 | | */ |
761 | | virtual UBool isInert(UChar32 c) const override; |
762 | | private: |
763 | | /** Private, non-virtual helper declared without a body here: a normalize() variant that also takes a USetSpanCondition. NOTE(review): spanCondition presumably controls how spans are matched against the `set` member - confirm in the implementation. */ UnicodeString & |
764 | | normalize(const UnicodeString &src, |
765 | | UnicodeString &dest, |
766 | | USetSpanCondition spanCondition, |
767 | | UErrorCode &errorCode) const; |
768 | | |
769 | | /** Private, non-virtual helper declared without a body here: takes a raw (src, length) char buffer plus a ByteSink, an Edits pointer and a USetSpanCondition. NOTE(review): output presumably goes to sink, and edits presumably may be nullptr - confirm in the implementation. */ void |
770 | | normalizeUTF8(uint32_t options, const char *src, int32_t length, |
771 | | ByteSink &sink, Edits *edits, |
772 | | USetSpanCondition spanCondition, |
773 | | UErrorCode &errorCode) const; |
774 | | |
775 | | /** Private, non-virtual helper declared without a body here: takes first/second strings plus a doNormalize flag. NOTE(review): doNormalize presumably selects between normalizing `second` before appending and the plain append() behavior - confirm in the implementation. */ UnicodeString & |
776 | | normalizeSecondAndAppend(UnicodeString &first, |
777 | | const UnicodeString &second, |
778 | | UBool doNormalize, |
779 | | UErrorCode &errorCode) const; |
780 | | |
781 | | const Normalizer2 &norm2; /* Normalizer2 held by const reference (not copied). NOTE(review): presumably the normalizer this class delegates to - confirm. */ |
782 | | const UnicodeSet &set; /* UnicodeSet held by const reference (not copied). NOTE(review): presumably the filter set; referenced objects must stay alive while this object is used - confirm against the constructor docs. */ |
783 | | }; |
784 | | |
785 | | U_NAMESPACE_END |
786 | | |
787 | | #endif // !UCONFIG_NO_NORMALIZATION |
788 | | |
789 | | #endif /* U_SHOW_CPLUSPLUS_API */ |
790 | | |
791 | | #endif // __NORMALIZER2_H__ |