/src/icu/source/common/unicode/normalizer2.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2009-2013, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: normalizer2.h |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2009nov22 |
16 | | * created by: Markus W. Scherer |
17 | | */ |
18 | | |
19 | | #ifndef __NORMALIZER2_H__ |
20 | | #define __NORMALIZER2_H__ |
21 | | |
22 | | /** |
23 | | * \file |
24 | | * \brief C++ API: New API for Unicode Normalization. |
25 | | */ |
26 | | |
27 | | #include "unicode/utypes.h" |
28 | | |
29 | | #if U_SHOW_CPLUSPLUS_API |
30 | | |
31 | | #if !UCONFIG_NO_NORMALIZATION |
32 | | |
33 | | #include "unicode/stringpiece.h" |
34 | | #include "unicode/uniset.h" |
35 | | #include "unicode/unistr.h" |
36 | | #include "unicode/unorm2.h" |
37 | | |
38 | | U_NAMESPACE_BEGIN |
39 | | |
40 | | class ByteSink; |
41 | | |
42 | | /** |
43 | | * Unicode normalization functionality for standard Unicode normalization or |
44 | | * for using custom mapping tables. |
45 | | * All instances of this class are unmodifiable/immutable. |
46 | | * Instances returned by getInstance() are singletons that must not be deleted by the caller. |
47 | | * The Normalizer2 class is not intended for public subclassing. |
48 | | * |
49 | | * The primary functions are to produce a normalized string and to detect whether |
50 | | * a string is already normalized. |
51 | | * The most commonly used normalization forms are those defined in |
52 | | * http://www.unicode.org/unicode/reports/tr15/ |
53 | | * However, this API supports additional normalization forms for specialized purposes. |
54 | | * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode) |
55 | | * and can be used in implementations of UTS #46. |
56 | | * |
57 | | * Not only are the standard compose and decompose modes supplied, |
58 | | * but additional modes are provided as documented in the Mode enum. |
59 | | * |
60 | | * Some of the functions in this class identify normalization boundaries. |
61 | | * At a normalization boundary, the portions of the string |
62 | | * before it and starting from it do not interact and can be handled independently. |
63 | | * |
64 | | * The spanQuickCheckYes() stops at a normalization boundary. |
65 | | * When the goal is a normalized string, then the text before the boundary |
66 | | * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). |
67 | | * |
68 | | * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether |
69 | | * a character is guaranteed to be at a normalization boundary, |
70 | | * regardless of context. |
71 | | * This is used for moving from one normalization boundary to the next |
72 | | * or preceding boundary, and for performing iterative normalization. |
73 | | * |
74 | | * Iterative normalization is useful when only a small portion of a |
75 | | * longer string needs to be processed. |
76 | | * For example, in ICU, iterative normalization is used by the NormalizationTransliterator |
77 | | * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() |
78 | | * (to process only the substring for which sort key bytes are computed). |
79 | | * |
80 | | * The set of normalization boundaries returned by these functions may not be |
81 | | * complete: There may be more boundaries that could be returned. |
82 | | * Different functions may return different boundaries. |
83 | | * @stable ICU 4.4 |
84 | | */ |
85 | | class U_COMMON_API Normalizer2 : public UObject { |
86 | | public: |
87 | | /** |
88 | | * Destructor. |
89 | | * @stable ICU 4.4 |
90 | | */ |
91 | | ~Normalizer2(); |
92 | | |
93 | | /** |
94 | | * Returns a Normalizer2 instance for Unicode NFC normalization. |
95 | | * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). |
96 | | * Returns an unmodifiable singleton instance. Do not delete it. |
97 | | * @param errorCode Standard ICU error code. Its input value must |
98 | | * pass the U_SUCCESS() test, or else the function returns |
99 | | * immediately. Check for U_FAILURE() on output or use with |
100 | | * function chaining. (See User Guide for details.) |
101 | | * @return the requested Normalizer2, if successful |
102 | | * @stable ICU 49 |
103 | | */ |
104 | | static const Normalizer2 * |
105 | | getNFCInstance(UErrorCode &errorCode); |
106 | | |
107 | | /** |
108 | | * Returns a Normalizer2 instance for Unicode NFD normalization. |
109 | | * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode): NFD uses the "nfc" data in decompose mode. |
110 | | * Returns an unmodifiable singleton instance. Do not delete it. |
111 | | * @param errorCode Standard ICU error code. Its input value must |
112 | | * pass the U_SUCCESS() test, or else the function returns |
113 | | * immediately. Check for U_FAILURE() on output or use with |
114 | | * function chaining. (See User Guide for details.) |
115 | | * @return the requested Normalizer2, if successful |
116 | | * @stable ICU 49 |
117 | | */ |
118 | | static const Normalizer2 * |
119 | | getNFDInstance(UErrorCode &errorCode); |
120 | | |
121 | | /** |
122 | | * Returns a Normalizer2 instance for Unicode NFKC normalization. |
123 | | * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). |
124 | | * Returns an unmodifiable singleton instance. Do not delete it. |
125 | | * @param errorCode Standard ICU error code. Its input value must |
126 | | * pass the U_SUCCESS() test, or else the function returns |
127 | | * immediately. Check for U_FAILURE() on output or use with |
128 | | * function chaining. (See User Guide for details.) |
129 | | * @return the requested Normalizer2, if successful |
130 | | * @stable ICU 49 |
131 | | */ |
132 | | static const Normalizer2 * |
133 | | getNFKCInstance(UErrorCode &errorCode); |
134 | | |
135 | | /** |
136 | | * Returns a Normalizer2 instance for Unicode NFKD normalization. |
137 | | * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode): NFKD uses the "nfkc" data in decompose mode. |
138 | | * Returns an unmodifiable singleton instance. Do not delete it. |
139 | | * @param errorCode Standard ICU error code. Its input value must |
140 | | * pass the U_SUCCESS() test, or else the function returns |
141 | | * immediately. Check for U_FAILURE() on output or use with |
142 | | * function chaining. (See User Guide for details.) |
143 | | * @return the requested Normalizer2, if successful |
144 | | * @stable ICU 49 |
145 | | */ |
146 | | static const Normalizer2 * |
147 | | getNFKDInstance(UErrorCode &errorCode); |
148 | | |
149 | | /** |
150 | | * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. |
151 | | * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). |
152 | | * Returns an unmodifiable singleton instance. Do not delete it. |
153 | | * @param errorCode Standard ICU error code. Its input value must |
154 | | * pass the U_SUCCESS() test, or else the function returns |
155 | | * immediately. Check for U_FAILURE() on output or use with |
156 | | * function chaining. (See User Guide for details.) |
157 | | * @return the requested Normalizer2, if successful |
158 | | * @stable ICU 49 |
159 | | */ |
160 | | static const Normalizer2 * |
161 | | getNFKCCasefoldInstance(UErrorCode &errorCode); |
162 | | |
163 | | /** |
164 | | * Returns a Normalizer2 instance which uses the specified data file |
165 | | * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) |
166 | | * and which composes or decomposes text according to the specified mode. |
167 | | * Returns an unmodifiable singleton instance. Do not delete it. |
168 | | * |
169 | | * Use packageName=NULL for data files that are part of ICU's own data. |
170 | | * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. |
171 | | * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. |
172 | | * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. |
173 | | * |
174 | | * @param packageName NULL for ICU built-in data, otherwise application data package name |
175 | | * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file |
176 | | * @param mode normalization mode (compose or decompose etc.) |
177 | | * @param errorCode Standard ICU error code. Its input value must |
178 | | * pass the U_SUCCESS() test, or else the function returns |
179 | | * immediately. Check for U_FAILURE() on output or use with |
180 | | * function chaining. (See User Guide for details.) |
181 | | * @return the requested Normalizer2, if successful |
182 | | * @stable ICU 4.4 |
183 | | */ |
184 | | static const Normalizer2 * |
185 | | getInstance(const char *packageName, |
186 | | const char *name, |
187 | | UNormalization2Mode mode, |
188 | | UErrorCode &errorCode); |
189 | | |
190 | | /** |
191 | | * Returns the normalized form of the source string as a new string; convenience overload that delegates to normalize(src, dest, errorCode) with a local result string. |
192 | | * @param src source string |
193 | | * @param errorCode Standard ICU error code. Its input value must |
194 | | * pass the U_SUCCESS() test, or else the function returns |
195 | | * immediately. Check for U_FAILURE() on output or use with |
196 | | * function chaining. (See User Guide for details.) |
197 | | * @return normalized src, returned by value |
198 | | * @stable ICU 4.4 |
199 | | */ |
200 | | UnicodeString |
201 | 0 | normalize(const UnicodeString &src, UErrorCode &errorCode) const { |
202 | 0 | UnicodeString result; |
203 | 0 | normalize(src, result, errorCode); |
204 | 0 | return result; |
205 | 0 | } |
206 | | /** |
207 | | * Writes the normalized form of the source string to the destination string |
208 | | * (replacing its contents) and returns the destination string. |
209 | | * The source and destination strings must be different objects. |
210 | | * @param src source string |
211 | | * @param dest destination string; its contents are replaced with the normalized src |
212 | | * @param errorCode Standard ICU error code. Its input value must |
213 | | * pass the U_SUCCESS() test, or else the function returns |
214 | | * immediately. Check for U_FAILURE() on output or use with |
215 | | * function chaining. (See User Guide for details.) |
216 | | * @return dest |
217 | | * @stable ICU 4.4 |
218 | | */ |
219 | | virtual UnicodeString & |
220 | | normalize(const UnicodeString &src, |
221 | | UnicodeString &dest, |
222 | | UErrorCode &errorCode) const = 0; |
223 | | |
224 | | /** |
225 | | * Normalizes a UTF-8 string and optionally records how source substrings |
226 | | * relate to changed and unchanged result substrings. |
227 | | * |
228 | | * Implemented completely for all built-in modes except for FCD. |
229 | | * The base class implementation converts to & from UTF-16 and does not support edits. |
230 | | * |
231 | | * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. |
232 | | * @param src Source UTF-8 string. |
233 | | * @param sink A ByteSink to which the normalized UTF-8 result string is written. |
234 | | * sink.Flush() is called at the end. |
235 | | * @param edits Records edits for index mapping, working with styled text, |
236 | | * and getting only changes (if any). |
237 | | * The contents of the Edits object are undefined if any error occurs. |
238 | | * This function calls edits->reset() first unless |
239 | | * options includes U_EDITS_NO_RESET. edits can be nullptr. |
240 | | * @param errorCode Standard ICU error code. Its input value must |
241 | | * pass the U_SUCCESS() test, or else the function returns |
242 | | * immediately. Check for U_FAILURE() on output or use with |
243 | | * function chaining. (See User Guide for details.) |
244 | | * @stable ICU 60 |
245 | | */ |
246 | | virtual void |
247 | | normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
248 | | Edits *edits, UErrorCode &errorCode) const; |
249 | | |
250 | | /** |
251 | | * Appends the normalized form of the second string to the first string |
252 | | * (merging them at the boundary) and returns the first string. |
253 | | * The result is normalized if the first string was normalized. |
254 | | * The first and second strings must be different objects. |
255 | | * @param first string, should be normalized |
256 | | * @param second string, will be normalized |
257 | | * @param errorCode Standard ICU error code. Its input value must |
258 | | * pass the U_SUCCESS() test, or else the function returns |
259 | | * immediately. Check for U_FAILURE() on output or use with |
260 | | * function chaining. (See User Guide for details.) |
261 | | * @return first |
262 | | * @stable ICU 4.4 |
263 | | */ |
264 | | virtual UnicodeString & |
265 | | normalizeSecondAndAppend(UnicodeString &first, |
266 | | const UnicodeString &second, |
267 | | UErrorCode &errorCode) const = 0; |
268 | | /** |
269 | | * Appends the second string to the first string |
270 | | * (merging them at the boundary) and returns the first string. |
271 | | * The result is normalized if both the strings were normalized. |
272 | | * The first and second strings must be different objects. |
273 | | * @param first string, should be normalized |
274 | | * @param second string, should be normalized |
275 | | * @param errorCode Standard ICU error code. Its input value must |
276 | | * pass the U_SUCCESS() test, or else the function returns |
277 | | * immediately. Check for U_FAILURE() on output or use with |
278 | | * function chaining. (See User Guide for details.) |
279 | | * @return first |
280 | | * @stable ICU 4.4 |
281 | | */ |
282 | | virtual UnicodeString & |
283 | | append(UnicodeString &first, |
284 | | const UnicodeString &second, |
285 | | UErrorCode &errorCode) const = 0; |
286 | | |
287 | | /** |
288 | | * Gets the decomposition mapping of c. |
289 | | * Roughly equivalent to normalizing the string form of c |
290 | | * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster; unlike that approach, |
291 | | * this function returns false and does not write a string |
292 | | * if c does not have a decomposition mapping in this instance's data. |
293 | | * This function is independent of the mode of the Normalizer2. |
294 | | * @param c code point |
295 | | * @param decomposition String object which will be set to c's |
296 | | * decomposition mapping, if there is one. |
297 | | * @return true if c has a decomposition, otherwise false |
298 | | * @stable ICU 4.6 |
299 | | */ |
300 | | virtual UBool |
301 | | getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; |
302 | | |
303 | | /** |
304 | | * Gets the raw decomposition mapping of c. |
305 | | * |
306 | | * This is similar to the getDecomposition() method but returns the |
307 | | * raw decomposition mapping as specified in UnicodeData.txt or |
308 | | * (for custom data) in the mapping files processed by the gennorm2 tool. |
309 | | * By contrast, getDecomposition() returns the processed, |
310 | | * recursively-decomposed version of this mapping. |
311 | | * |
312 | | * When used on a standard NFKC Normalizer2 instance, |
313 | | * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. |
314 | | * |
315 | | * When used on a standard NFC Normalizer2 instance, |
316 | | * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); |
317 | | * in this case, the result contains either one or two code points (=1..4 char16_ts). |
318 | | * |
319 | | * This function is independent of the mode of the Normalizer2. |
320 | | * The default implementation returns false. |
321 | | * @param c code point |
322 | | * @param decomposition String object which will be set to c's |
323 | | * raw decomposition mapping, if there is one. |
324 | | * @return true if c has a decomposition, otherwise false |
325 | | * @stable ICU 49 |
326 | | */ |
327 | | virtual UBool |
328 | | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; |
329 | | |
330 | | /** |
331 | | * Performs pairwise composition of a & b and returns the composite if there is one. |
332 | | * |
333 | | * Returns a composite code point c only if c has a two-way mapping to a+b. |
334 | | * In standard Unicode normalization, this means that |
335 | | * c has a canonical decomposition to a+b |
336 | | * and c does not have the Full_Composition_Exclusion property. |
337 | | * |
338 | | * This function is independent of the mode of the Normalizer2. |
339 | | * The default implementation returns a negative value. |
340 | | * @param a A (normalization starter) code point. |
341 | | * @param b Another code point. |
342 | | * @return The non-negative composite code point if there is one; otherwise a negative value. |
343 | | * @stable ICU 49 |
344 | | */ |
345 | | virtual UChar32 |
346 | | composePair(UChar32 a, UChar32 b) const; |
347 | | |
348 | | /** |
349 | | * Gets the combining class of c. |
350 | | * The default implementation returns 0 |
351 | | * but all standard implementations return the Unicode Canonical_Combining_Class value. |
352 | | * @param c code point |
353 | | * @return c's combining class |
354 | | * @stable ICU 49 |
355 | | */ |
356 | | virtual uint8_t |
357 | | getCombiningClass(UChar32 c) const; |
358 | | |
359 | | /** |
360 | | * Tests if the string is normalized. |
361 | | * Internally, in cases where the quickCheck() method would return "maybe" |
362 | | * (which is only possible for the two COMPOSE modes) this method |
363 | | * resolves to "yes" or "no" to provide a definitive result, |
364 | | * at the cost of doing more work in those cases. |
365 | | * @param s input string |
366 | | * @param errorCode Standard ICU error code. Its input value must |
367 | | * pass the U_SUCCESS() test, or else the function returns |
368 | | * immediately. Check for U_FAILURE() on output or use with |
369 | | * function chaining. (See User Guide for details.) |
370 | | * @return true if s is normalized |
371 | | * @stable ICU 4.4 |
372 | | */ |
373 | | virtual UBool |
374 | | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
375 | | /** |
376 | | * Tests if the UTF-8 string is normalized. |
377 | | * Internally, in cases where the quickCheck() method would return "maybe" |
378 | | * (which is only possible for the two COMPOSE modes) this method |
379 | | * resolves to "yes" or "no" to provide a definitive result, |
380 | | * at the cost of doing more work in those cases. |
381 | | * |
382 | | * This works for all normalization modes. |
383 | | * It is optimized for UTF-8 for all built-in modes except for FCD. |
384 | | * The base class implementation converts to UTF-16 and calls isNormalized(). |
385 | | * |
386 | | * @param s UTF-8 input string |
387 | | * @param errorCode Standard ICU error code. Its input value must |
388 | | * pass the U_SUCCESS() test, or else the function returns |
389 | | * immediately. Check for U_FAILURE() on output or use with |
390 | | * function chaining. (See User Guide for details.) |
391 | | * @return true if s is normalized |
392 | | * @stable ICU 60 |
393 | | */ |
394 | | virtual UBool |
395 | | isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; |
396 | | |
397 | | |
398 | | /** |
399 | | * Tests if the string is normalized. |
400 | | * For the two COMPOSE modes, the result could be "maybe" in cases that |
401 | | * would take a little more work to resolve definitively. |
402 | | * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster |
403 | | * combination of quick check + normalization, to avoid |
404 | | * re-checking the "yes" prefix. |
405 | | * @param s input string |
406 | | * @param errorCode Standard ICU error code. Its input value must |
407 | | * pass the U_SUCCESS() test, or else the function returns |
408 | | * immediately. Check for U_FAILURE() on output or use with |
409 | | * function chaining. (See User Guide for details.) |
410 | | * @return UNormalizationCheckResult |
411 | | * @stable ICU 4.4 |
412 | | */ |
413 | | virtual UNormalizationCheckResult |
414 | | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
415 | | |
416 | | /** |
417 | | * Returns the end of the normalized substring of the input string. |
418 | | * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> |
419 | | * the substring <code>UnicodeString(s, 0, end)</code> |
420 | | * will pass the quick check with a "yes" result. |
421 | | * |
422 | | * The returned end index is usually one or more characters before the |
423 | | * "no" or "maybe" character: The end index is at a normalization boundary. |
424 | | * (See the class documentation for more about normalization boundaries.) |
425 | | * |
426 | | * When the goal is a normalized string and most input strings are expected |
427 | | * to be normalized already, then call this method, |
428 | | * and if it returns a prefix shorter than the input string, |
429 | | * copy that prefix and use normalizeSecondAndAppend() for the remainder. |
430 | | * @param s input string |
431 | | * @param errorCode Standard ICU error code. Its input value must |
432 | | * pass the U_SUCCESS() test, or else the function returns |
433 | | * immediately. Check for U_FAILURE() on output or use with |
434 | | * function chaining. (See User Guide for details.) |
435 | | * @return "yes" span end index |
436 | | * @stable ICU 4.4 |
437 | | */ |
438 | | virtual int32_t |
439 | | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; |
440 | | |
441 | | /** |
442 | | * Tests if the character always has a normalization boundary before it, |
443 | | * regardless of context. |
444 | | * If true, then the character does not normalization-interact with |
445 | | * preceding characters. |
446 | | * In other words, a string containing this character can be normalized |
447 | | * by processing portions before this character and starting from this |
448 | | * character independently. |
449 | | * This is used for iterative normalization. See the class documentation for details. |
450 | | * @param c character to test |
451 | | * @return true if c has a normalization boundary before it |
452 | | * @stable ICU 4.4 |
453 | | */ |
454 | | virtual UBool hasBoundaryBefore(UChar32 c) const = 0; |
455 | | |
456 | | /** |
457 | | * Tests if the character always has a normalization boundary after it, |
458 | | * regardless of context. |
459 | | * If true, then the character does not normalization-interact with |
460 | | * following characters. |
461 | | * In other words, a string containing this character can be normalized |
462 | | * by processing portions up to this character and after this |
463 | | * character independently. |
464 | | * This is used for iterative normalization. See the class documentation for details. |
465 | | * Note that this operation may be significantly slower than hasBoundaryBefore(). |
466 | | * @param c character to test |
467 | | * @return true if c has a normalization boundary after it |
468 | | * @stable ICU 4.4 |
469 | | */ |
470 | | virtual UBool hasBoundaryAfter(UChar32 c) const = 0; |
471 | | |
472 | | /** |
473 | | * Tests if the character is normalization-inert. |
474 | | * If true, then the character does not change, nor normalization-interact with |
475 | | * preceding or following characters. |
476 | | * In other words, a string containing this character can be normalized |
477 | | * by processing portions before this character and after this |
478 | | * character independently. |
479 | | * This is used for iterative normalization. See the class documentation for details. |
480 | | * Note that this operation may be significantly slower than hasBoundaryBefore(). |
481 | | * @param c character to test |
482 | | * @return true if c is normalization-inert |
483 | | * @stable ICU 4.4 |
484 | | */ |
485 | | virtual UBool isInert(UChar32 c) const = 0; |
486 | | }; |
487 | | |
488 | | /** |
489 | | * Normalization filtered by a UnicodeSet. |
490 | | * Normalizes portions of the text contained in the filter set and leaves |
491 | | * portions not contained in the filter set unchanged. |
492 | | * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). |
493 | | * Not-in-the-filter text is treated as "is normalized" and "quick check yes". |
494 | | * This class implements all of (and only) the Normalizer2 API. |
495 | | * An instance of this class is unmodifiable/immutable but is constructed and |
496 | | * must be destructed by the owner. |
497 | | * @stable ICU 4.4 |
498 | | */ |
499 | | class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { |
500 | | public: |
501 | | /** |
502 | | * Constructs a filtered normalizer wrapping any Normalizer2 instance |
503 | | * and a filter set. |
504 | | * Both are aliased and must not be modified or deleted while this object |
505 | | * is used. |
506 | | * The filter set should be frozen; otherwise the performance will suffer greatly. |
507 | | * @param n2 wrapped Normalizer2 instance |
508 | | * @param filterSet UnicodeSet which determines the characters to be normalized |
509 | | * @stable ICU 4.4 |
510 | | */ |
511 | | FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : |
512 | 0 | norm2(n2), set(filterSet) {} |
513 | | |
514 | | /** |
515 | | * Destructor. |
516 | | * @stable ICU 4.4 |
517 | | */ |
518 | | ~FilteredNormalizer2(); |
519 | | |
520 | | /** |
521 | | * Writes the normalized form of the source string to the destination string |
522 | | * (replacing its contents) and returns the destination string. |
523 | | * The source and destination strings must be different objects. |
524 | | * @param src source string |
525 | | * @param dest destination string; its contents is replaced with normalized src |
526 | | * @param errorCode Standard ICU error code. Its input value must |
527 | | * pass the U_SUCCESS() test, or else the function returns |
528 | | * immediately. Check for U_FAILURE() on output or use with |
529 | | * function chaining. (See User Guide for details.) |
530 | | * @return dest |
531 | | * @stable ICU 4.4 |
532 | | */ |
533 | | virtual UnicodeString & |
534 | | normalize(const UnicodeString &src, |
535 | | UnicodeString &dest, |
536 | | UErrorCode &errorCode) const U_OVERRIDE; |
537 | | |
538 | | /** |
539 | | * Normalizes a UTF-8 string and optionally records how source substrings |
540 | | * relate to changed and unchanged result substrings. |
541 | | * |
542 | | * Implemented completely for most built-in modes except for FCD. |
543 | | * The base class implementation converts to & from UTF-16 and does not support edits. |
544 | | * |
545 | | * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. |
546 | | * @param src Source UTF-8 string. |
547 | | * @param sink A ByteSink to which the normalized UTF-8 result string is written. |
548 | | * sink.Flush() is called at the end. |
549 | | * @param edits Records edits for index mapping, working with styled text, |
550 | | * and getting only changes (if any). |
551 | | * The Edits contents is undefined if any error occurs. |
552 | | * This function calls edits->reset() first unless |
553 | | * options includes U_EDITS_NO_RESET. edits can be nullptr. |
554 | | * @param errorCode Standard ICU error code. Its input value must |
555 | | * pass the U_SUCCESS() test, or else the function returns |
556 | | * immediately. Check for U_FAILURE() on output or use with |
557 | | * function chaining. (See User Guide for details.) |
558 | | * @stable ICU 60 |
559 | | */ |
560 | | virtual void |
561 | | normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, |
562 | | Edits *edits, UErrorCode &errorCode) const U_OVERRIDE; |
563 | | |
564 | | /** |
565 | | * Appends the normalized form of the second string to the first string |
566 | | * (merging them at the boundary) and returns the first string. |
567 | | * The result is normalized if the first string was normalized. |
568 | | * The first and second strings must be different objects. |
569 | | * @param first string, should be normalized |
570 | | * @param second string, will be normalized |
571 | | * @param errorCode Standard ICU error code. Its input value must |
572 | | * pass the U_SUCCESS() test, or else the function returns |
573 | | * immediately. Check for U_FAILURE() on output or use with |
574 | | * function chaining. (See User Guide for details.) |
575 | | * @return first |
576 | | * @stable ICU 4.4 |
577 | | */ |
578 | | virtual UnicodeString & |
579 | | normalizeSecondAndAppend(UnicodeString &first, |
580 | | const UnicodeString &second, |
581 | | UErrorCode &errorCode) const U_OVERRIDE; |
582 | | /** |
583 | | * Appends the second string to the first string |
584 | | * (merging them at the boundary) and returns the first string. |
585 | | * The result is normalized if both the strings were normalized. |
586 | | * The first and second strings must be different objects. |
587 | | * @param first string, should be normalized |
588 | | * @param second string, should be normalized |
589 | | * @param errorCode Standard ICU error code. Its input value must |
590 | | * pass the U_SUCCESS() test, or else the function returns |
591 | | * immediately. Check for U_FAILURE() on output or use with |
592 | | * function chaining. (See User Guide for details.) |
593 | | * @return first |
594 | | * @stable ICU 4.4 |
595 | | */ |
596 | | virtual UnicodeString & |
597 | | append(UnicodeString &first, |
598 | | const UnicodeString &second, |
599 | | UErrorCode &errorCode) const U_OVERRIDE; |
600 | | |
601 | | /** |
602 | | * Gets the decomposition mapping of c. |
603 | | * For details see the base class documentation. |
604 | | * |
605 | | * This function is independent of the mode of the Normalizer2. |
606 | | * @param c code point |
607 | | * @param decomposition String object which will be set to c's |
608 | | * decomposition mapping, if there is one. |
609 | | * @return true if c has a decomposition, otherwise false |
610 | | * @stable ICU 4.6 |
611 | | */ |
612 | | virtual UBool |
613 | | getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; |
614 | | |
615 | | /** |
616 | | * Gets the raw decomposition mapping of c. |
617 | | * For details see the base class documentation. |
618 | | * |
619 | | * This function is independent of the mode of the Normalizer2. |
620 | | * @param c code point |
621 | | * @param decomposition String object which will be set to c's |
622 | | * raw decomposition mapping, if there is one. |
623 | | * @return true if c has a decomposition, otherwise false |
624 | | * @stable ICU 49 |
625 | | */ |
626 | | virtual UBool |
627 | | getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; |
628 | | |
629 | | /** |
630 | | * Performs pairwise composition of a & b and returns the composite if there is one. |
631 | | * For details see the base class documentation. |
632 | | * |
633 | | * This function is independent of the mode of the Normalizer2. |
634 | | * @param a A (normalization starter) code point. |
635 | | * @param b Another code point. |
636 | | * @return The non-negative composite code point if there is one; otherwise a negative value. |
637 | | * @stable ICU 49 |
638 | | */ |
639 | | virtual UChar32 |
640 | | composePair(UChar32 a, UChar32 b) const U_OVERRIDE; |
641 | | |
642 | | /** |
643 | | * Gets the combining class of c. |
644 | | * The default implementation returns 0 |
645 | | * but all standard implementations return the Unicode Canonical_Combining_Class value. |
646 | | * @param c code point |
647 | | * @return c's combining class |
648 | | * @stable ICU 49 |
649 | | */ |
650 | | virtual uint8_t |
651 | | getCombiningClass(UChar32 c) const U_OVERRIDE; |
652 | | |
653 | | /** |
654 | | * Tests if the string is normalized and returns a definite true/false result. |
655 | | * For details see the Normalizer2 base class documentation. |
656 | | * @param s input string (passed by const reference, not modified) |
657 | | * @param errorCode Standard ICU error code. Its input value must |
658 | | * pass the U_SUCCESS() test, or else the function returns |
659 | | * immediately. Check for U_FAILURE() on output or use with |
660 | | * function chaining. (See User Guide for details.) |
661 | | * @return true if s is normalized |
662 | | * @stable ICU 4.4 |
663 | | */ |
664 | | virtual UBool |
665 | | isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; |
666 | | /** |
667 | | * Tests if the UTF-8 string is normalized. |
668 | | * Internally, in cases where the quickCheck() method would return "maybe" |
669 | | * (which is only possible for the two COMPOSE modes) this method |
670 | | * resolves to "yes" or "no" to provide a definitive result, |
671 | | * at the cost of doing more work in those cases. |
672 | | * |
673 | | * This works for all normalization modes. |
674 | | * It is optimized for UTF-8 for all built-in modes except for FCD. |
675 | | * The Normalizer2 base class implementation converts to UTF-16 and calls isNormalized(). |
676 | | * |
677 | | * @param s UTF-8 input string |
678 | | * @param errorCode Standard ICU error code. Its input value must |
679 | | * pass the U_SUCCESS() test, or else the function returns |
680 | | * immediately. Check for U_FAILURE() on output or use with |
681 | | * function chaining. (See User Guide for details.) |
682 | | * @return true if s is normalized |
683 | | * @stable ICU 60 |
684 | | */ |
685 | | virtual UBool |
686 | | isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE; |
687 | | /** |
688 | | * Quick-checks whether the string is normalized; the result may be "maybe" rather than a definite yes or no. |
689 | | * For details see the Normalizer2 base class documentation. |
690 | | * @param s input string |
691 | | * @param errorCode Standard ICU error code. Its input value must |
692 | | * pass the U_SUCCESS() test, or else the function returns |
693 | | * immediately. Check for U_FAILURE() on output or use with |
694 | | * function chaining. (See User Guide for details.) |
695 | | * @return the UNormalizationCheckResult for s |
696 | | * @stable ICU 4.4 |
697 | | */ |
698 | | virtual UNormalizationCheckResult |
699 | | quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; |
700 | | /** |
701 | | * Returns the end of the normalized substring of the input string. |
702 | | * For details see the Normalizer2 base class documentation. |
703 | | * @param s input string |
704 | | * @param errorCode Standard ICU error code. Its input value must |
705 | | * pass the U_SUCCESS() test, or else the function returns |
706 | | * immediately. Check for U_FAILURE() on output or use with |
707 | | * function chaining. (See User Guide for details.) |
708 | | * @return end index in s of the quick-check "yes" span |
709 | | * @stable ICU 4.4 |
710 | | */ |
711 | | virtual int32_t |
712 | | spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; |
713 | | |
714 | | /** |
715 | | * Tests if the code point always has a normalization boundary before it, |
716 | | * regardless of context. |
717 | | * For details see the Normalizer2 base class documentation. |
718 | | * @param c code point to test |
719 | | * @return true if c has a normalization boundary before it |
720 | | * @stable ICU 4.4 |
721 | | */ |
722 | | virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE; |
723 | | |
724 | | /** |
725 | | * Tests if the code point always has a normalization boundary after it, |
726 | | * regardless of context. |
727 | | * For details see the Normalizer2 base class documentation. |
728 | | * @param c code point to test |
729 | | * @return true if c has a normalization boundary after it |
730 | | * @stable ICU 4.4 |
731 | | */ |
732 | | virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE; |
733 | | |
734 | | /** |
735 | | * Tests if the code point is normalization-inert. |
736 | | * For details see the Normalizer2 base class documentation. |
737 | | * @param c code point to test |
738 | | * @return true if c is normalization-inert |
739 | | * @stable ICU 4.4 |
740 | | */ |
741 | | virtual UBool isInert(UChar32 c) const U_OVERRIDE; |
742 | | private: /* NOTE(review): private helpers and state; only their declarations are visible here, so their exact behavior must be confirmed in the implementation file. */ |
743 | | UnicodeString & |
744 | | normalize(const UnicodeString &src, |
745 | | UnicodeString &dest, |
746 | | USetSpanCondition spanCondition, |
747 | | UErrorCode &errorCode) const; |
748 | | |
749 | | void |
750 | | normalizeUTF8(uint32_t options, const char *src, int32_t length, |
751 | | ByteSink &sink, Edits *edits, |
752 | | USetSpanCondition spanCondition, |
753 | | UErrorCode &errorCode) const; |
754 | | |
755 | | UnicodeString & |
756 | | normalizeSecondAndAppend(UnicodeString &first, |
757 | | const UnicodeString &second, |
758 | | UBool doNormalize, |
759 | | UErrorCode &errorCode) const; |
760 | | /* Both members are references, not copies, so the referenced Normalizer2 and UnicodeSet must outlive this object. */ |
761 | | const Normalizer2 &norm2; |
762 | | const UnicodeSet &set; |
763 | | }; |
764 | | |
765 | | U_NAMESPACE_END |
766 | | |
767 | | #endif // !UCONFIG_NO_NORMALIZATION |
768 | | |
769 | | #endif /* U_SHOW_CPLUSPLUS_API */ |
770 | | |
771 | | #endif // __NORMALIZER2_H__ |