Coverage Report

Created: 2026-01-25 06:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/unicode/normalizer2.h
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2009-2013, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  normalizer2.h
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2009nov22
16
*   created by: Markus W. Scherer
17
*/
18
19
#ifndef __NORMALIZER2_H__
20
#define __NORMALIZER2_H__
21
22
/**
23
 * \file
24
 * \brief C++ API: New API for Unicode Normalization.
25
 */
26
27
#include "unicode/utypes.h"
28
29
#if U_SHOW_CPLUSPLUS_API
30
31
#if !UCONFIG_NO_NORMALIZATION
32
33
#include "unicode/stringpiece.h"
34
#include "unicode/uniset.h"
35
#include "unicode/unistr.h"
36
#include "unicode/unorm2.h"
37
38
U_NAMESPACE_BEGIN
39
40
class ByteSink;
41
42
/**
43
 * Unicode normalization functionality for standard Unicode normalization or
44
 * for using custom mapping tables.
45
 * All instances of this class are unmodifiable/immutable.
46
 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47
 * The Normalizer2 class is not intended for public subclassing.
48
 *
49
 * The primary functions are to produce a normalized string and to detect whether
50
 * a string is already normalized.
51
 * The most commonly used normalization forms are those defined in
52
 * https://www.unicode.org/reports/tr15/
53
 * However, this API supports additional normalization forms for specialized purposes.
54
 * For example, NFKC_Casefold is provided via getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode)
55
 * and can be used in implementations of UTS #46.
56
 *
57
 * Not only are the standard compose and decompose modes supplied,
58
 * but additional modes are provided as documented in the UNormalization2Mode enum.
59
 *
60
 * Some of the functions in this class identify normalization boundaries.
61
 * At a normalization boundary, the portions of the string
62
 * before it and starting from it do not interact and can be handled independently.
63
 *
64
 * The spanQuickCheckYes() stops at a normalization boundary.
65
 * When the goal is a normalized string, then the text before the boundary
66
 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67
 *
68
 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69
 * a character is guaranteed to be at a normalization boundary,
70
 * regardless of context.
71
 * This is used for moving from one normalization boundary to the next
72
 * or preceding boundary, and for performing iterative normalization.
73
 *
74
 * Iterative normalization is useful when only a small portion of a
75
 * longer string needs to be processed.
76
 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77
 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78
 * (to process only the substring for which sort key bytes are computed).
79
 *
80
 * The set of normalization boundaries returned by these functions may not be
81
 * complete: There may be more boundaries that could be returned.
82
 * Different functions may return different boundaries.
83
 * @stable ICU 4.4
84
 */
85
class U_COMMON_API Normalizer2 : public UObject {
86
public:
87
    /**
88
     * Destructor.
89
     * @stable ICU 4.4
90
     */
91
    ~Normalizer2();
92
93
    /**
94
     * Returns a Normalizer2 instance for Unicode NFC normalization.
95
     * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
96
     * Returns an unmodifiable singleton instance. Do not delete it.
97
     * @param errorCode Standard ICU error code. Its input value must
98
     *                  pass the U_SUCCESS() test, or else the function returns
99
     *                  immediately. Check for U_FAILURE() on output or use with
100
     *                  function chaining. (See User Guide for details.)
101
     * @return the requested Normalizer2, if successful
102
     * @stable ICU 49
103
     */
104
    static const Normalizer2 *
105
    getNFCInstance(UErrorCode &errorCode);
106
107
    /**
108
     * Returns a Normalizer2 instance for Unicode NFD normalization.
109
     * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
110
     * Returns an unmodifiable singleton instance. Do not delete it.
111
     * @param errorCode Standard ICU error code. Its input value must
112
     *                  pass the U_SUCCESS() test, or else the function returns
113
     *                  immediately. Check for U_FAILURE() on output or use with
114
     *                  function chaining. (See User Guide for details.)
115
     * @return the requested Normalizer2, if successful
116
     * @stable ICU 49
117
     */
118
    static const Normalizer2 *
119
    getNFDInstance(UErrorCode &errorCode);
120
121
    /**
122
     * Returns a Normalizer2 instance for Unicode NFKC normalization.
123
     * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
124
     * Returns an unmodifiable singleton instance. Do not delete it.
125
     * @param errorCode Standard ICU error code. Its input value must
126
     *                  pass the U_SUCCESS() test, or else the function returns
127
     *                  immediately. Check for U_FAILURE() on output or use with
128
     *                  function chaining. (See User Guide for details.)
129
     * @return the requested Normalizer2, if successful
130
     * @stable ICU 49
131
     */
132
    static const Normalizer2 *
133
    getNFKCInstance(UErrorCode &errorCode);
134
135
    /**
136
     * Returns a Normalizer2 instance for Unicode NFKD normalization.
137
     * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
138
     * Returns an unmodifiable singleton instance. Do not delete it.
139
     * @param errorCode Standard ICU error code. Its input value must
140
     *                  pass the U_SUCCESS() test, or else the function returns
141
     *                  immediately. Check for U_FAILURE() on output or use with
142
     *                  function chaining. (See User Guide for details.)
143
     * @return the requested Normalizer2, if successful
144
     * @stable ICU 49
145
     */
146
    static const Normalizer2 *
147
    getNFKDInstance(UErrorCode &errorCode);
148
149
    /**
150
     * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
151
     * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
152
     * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
153
     *
154
     * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
155
     * Returns an unmodifiable singleton instance. Do not delete it.
156
     * @param errorCode Standard ICU error code. Its input value must
157
     *                  pass the U_SUCCESS() test, or else the function returns
158
     *                  immediately. Check for U_FAILURE() on output or use with
159
     *                  function chaining. (See User Guide for details.)
160
     * @return the requested Normalizer2, if successful
161
     * @stable ICU 49
162
     */
163
    static const Normalizer2 *
164
    getNFKCCasefoldInstance(UErrorCode &errorCode);
165
166
    /**
167
     * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
168
     * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
169
     * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
170
     *
171
     * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
172
     * Returns an unmodifiable singleton instance. Do not delete it.
173
     * @param errorCode Standard ICU error code. Its input value must
174
     *                  pass the U_SUCCESS() test, or else the function returns
175
     *                  immediately. Check for U_FAILURE() on output or use with
176
     *                  function chaining. (See User Guide for details.)
177
     * @return the requested Normalizer2, if successful
178
     * @stable ICU 74
179
     */
180
    static const Normalizer2 *
181
    getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
182
183
    /**
184
     * Returns a Normalizer2 instance which uses the specified data file
185
     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
186
     * and which composes or decomposes text according to the specified mode.
187
     * Returns an unmodifiable singleton instance. Do not delete it.
188
     *
189
     * Use packageName=nullptr for data files that are part of ICU's own data.
190
     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
191
     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
192
     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
193
     *
194
     * @param packageName nullptr for ICU built-in data, otherwise application data package name
195
     * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
196
     * @param mode normalization mode (compose or decompose etc.)
197
     * @param errorCode Standard ICU error code. Its input value must
198
     *                  pass the U_SUCCESS() test, or else the function returns
199
     *                  immediately. Check for U_FAILURE() on output or use with
200
     *                  function chaining. (See User Guide for details.)
201
     * @return the requested Normalizer2, if successful
202
     * @stable ICU 4.4
203
     */
204
    static const Normalizer2 *
205
    getInstance(const char *packageName,
206
                const char *name,
207
                UNormalization2Mode mode,
208
                UErrorCode &errorCode);
209
210
    /**
211
     * Returns the normalized form of the source string.
212
     * @param src source string
213
     * @param errorCode Standard ICU error code. Its input value must
214
     *                  pass the U_SUCCESS() test, or else the function returns
215
     *                  immediately. Check for U_FAILURE() on output or use with
216
     *                  function chaining. (See User Guide for details.)
217
     * @return normalized src
218
     * @stable ICU 4.4
219
     */
220
    UnicodeString
221
5.82M
    normalize(const UnicodeString &src, UErrorCode &errorCode) const {
222
5.82M
        UnicodeString result;  // fresh local destination, returned by value below
223
5.82M
        normalize(src, result, errorCode);  // delegates to the pure-virtual (src, dest, errorCode) overload
224
5.82M
        return result;
225
5.82M
    }
226
    /**
227
     * Writes the normalized form of the source string to the destination string
228
     * (replacing its contents) and returns the destination string.
229
     * The source and destination strings must be different objects.
230
     * @param src source string
231
     * @param dest destination string; its contents are replaced with normalized src
232
     * @param errorCode Standard ICU error code. Its input value must
233
     *                  pass the U_SUCCESS() test, or else the function returns
234
     *                  immediately. Check for U_FAILURE() on output or use with
235
     *                  function chaining. (See User Guide for details.)
236
     * @return dest
237
     * @stable ICU 4.4
238
     */
239
    virtual UnicodeString &
240
    normalize(const UnicodeString &src,
241
              UnicodeString &dest,
242
              UErrorCode &errorCode) const = 0;
243
244
    /**
245
     * Normalizes a UTF-8 string and optionally records how source substrings
246
     * relate to changed and unchanged result substrings.
247
     *
248
     * Implemented completely for all built-in modes except for FCD.
249
     * The base class implementation converts to & from UTF-16 and does not support edits.
250
     *
251
     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
252
     * @param src       Source UTF-8 string.
253
     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
254
     *                  sink.Flush() is called at the end.
255
     * @param edits     Records edits for index mapping, working with styled text,
256
     *                  and getting only changes (if any).
257
     *                  The Edits contents are undefined if any error occurs.
258
     *                  This function calls edits->reset() first unless
259
     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
260
     * @param errorCode Standard ICU error code. Its input value must
261
     *                  pass the U_SUCCESS() test, or else the function returns
262
     *                  immediately. Check for U_FAILURE() on output or use with
263
     *                  function chaining. (See User Guide for details.)
264
     * @stable ICU 60
265
     */
266
    virtual void
267
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
268
                  Edits *edits, UErrorCode &errorCode) const;
269
270
    /**
271
     * Appends the normalized form of the second string to the first string
272
     * (merging them at the boundary) and returns the first string.
273
     * The result is normalized if the first string was normalized.
274
     * The first and second strings must be different objects.
275
     * @param first string, should be normalized
276
     * @param second string, will be normalized
277
     * @param errorCode Standard ICU error code. Its input value must
278
     *                  pass the U_SUCCESS() test, or else the function returns
279
     *                  immediately. Check for U_FAILURE() on output or use with
280
     *                  function chaining. (See User Guide for details.)
281
     * @return first
282
     * @stable ICU 4.4
283
     */
284
    virtual UnicodeString &
285
    normalizeSecondAndAppend(UnicodeString &first,
286
                             const UnicodeString &second,
287
                             UErrorCode &errorCode) const = 0;
288
    /**
289
     * Appends the second string to the first string
290
     * (merging them at the boundary) and returns the first string.
291
     * The result is normalized if both the strings were normalized.
292
     * The first and second strings must be different objects.
293
     * @param first string, should be normalized
294
     * @param second string, should be normalized
295
     * @param errorCode Standard ICU error code. Its input value must
296
     *                  pass the U_SUCCESS() test, or else the function returns
297
     *                  immediately. Check for U_FAILURE() on output or use with
298
     *                  function chaining. (See User Guide for details.)
299
     * @return first
300
     * @stable ICU 4.4
301
     */
302
    virtual UnicodeString &
303
    append(UnicodeString &first,
304
           const UnicodeString &second,
305
           UErrorCode &errorCode) const = 0;
306
307
    /**
308
     * Gets the decomposition mapping of c.
309
     * Roughly equivalent to normalizing the String form of c
310
     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
311
     * returns false and does not write a string
312
     * if c does not have a decomposition mapping in this instance's data.
313
     * This function is independent of the mode of the Normalizer2.
314
     * @param c code point
315
     * @param decomposition String object which will be set to c's
316
     *                      decomposition mapping, if there is one.
317
     * @return true if c has a decomposition, otherwise false
318
     * @stable ICU 4.6
319
     */
320
    virtual UBool
321
    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
322
323
    /**
324
     * Gets the raw decomposition mapping of c.
325
     *
326
     * This is similar to the getDecomposition() method but returns the
327
     * raw decomposition mapping as specified in UnicodeData.txt or
328
     * (for custom data) in the mapping files processed by the gennorm2 tool.
329
     * By contrast, getDecomposition() returns the processed,
330
     * recursively-decomposed version of this mapping.
331
     *
332
     * When used on a standard NFKC Normalizer2 instance,
333
     * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
334
     *
335
     * When used on a standard NFC Normalizer2 instance,
336
     * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
337
     * in this case, the result contains either one or two code points (=1..4 char16_ts).
338
     *
339
     * This function is independent of the mode of the Normalizer2.
340
     * The default implementation returns false.
341
     * @param c code point
342
     * @param decomposition String object which will be set to c's
343
     *                      raw decomposition mapping, if there is one.
344
     * @return true if c has a decomposition, otherwise false
345
     * @stable ICU 49
346
     */
347
    virtual UBool
348
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
349
350
    /**
351
     * Performs pairwise composition of a & b and returns the composite if there is one.
352
     *
353
     * Returns a composite code point c only if c has a two-way mapping to a+b.
354
     * In standard Unicode normalization, this means that
355
     * c has a canonical decomposition to a+b
356
     * and c does not have the Full_Composition_Exclusion property.
357
     *
358
     * This function is independent of the mode of the Normalizer2.
359
     * The default implementation returns a negative value.
360
     * @param a A (normalization starter) code point.
361
     * @param b Another code point.
362
     * @return The non-negative composite code point if there is one; otherwise a negative value.
363
     * @stable ICU 49
364
     */
365
    virtual UChar32
366
    composePair(UChar32 a, UChar32 b) const;
367
368
    /**
369
     * Gets the combining class of c.
370
     * The default implementation returns 0
371
     * but all standard implementations return the Unicode Canonical_Combining_Class value.
372
     * @param c code point
373
     * @return c's combining class
374
     * @stable ICU 49
375
     */
376
    virtual uint8_t
377
    getCombiningClass(UChar32 c) const;
378
379
    /**
380
     * Tests if the string is normalized.
381
     * Internally, in cases where the quickCheck() method would return "maybe"
382
     * (which is only possible for the two COMPOSE modes) this method
383
     * resolves to "yes" or "no" to provide a definitive result,
384
     * at the cost of doing more work in those cases.
385
     * @param s input string
386
     * @param errorCode Standard ICU error code. Its input value must
387
     *                  pass the U_SUCCESS() test, or else the function returns
388
     *                  immediately. Check for U_FAILURE() on output or use with
389
     *                  function chaining. (See User Guide for details.)
390
     * @return true if s is normalized
391
     * @stable ICU 4.4
392
     */
393
    virtual UBool
394
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
395
    /**
396
     * Tests if the UTF-8 string is normalized.
397
     * Internally, in cases where the quickCheck() method would return "maybe"
398
     * (which is only possible for the two COMPOSE modes) this method
399
     * resolves to "yes" or "no" to provide a definitive result,
400
     * at the cost of doing more work in those cases.
401
     *
402
     * This works for all normalization modes.
403
     * It is optimized for UTF-8 for all built-in modes except for FCD.
404
     * The base class implementation converts to UTF-16 and calls isNormalized().
405
     *
406
     * @param s UTF-8 input string
407
     * @param errorCode Standard ICU error code. Its input value must
408
     *                  pass the U_SUCCESS() test, or else the function returns
409
     *                  immediately. Check for U_FAILURE() on output or use with
410
     *                  function chaining. (See User Guide for details.)
411
     * @return true if s is normalized
412
     * @stable ICU 60
413
     */
414
    virtual UBool
415
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
416
417
418
    /**
419
     * Tests if the string is normalized.
420
     * For the two COMPOSE modes, the result could be "maybe" in cases that
421
     * would take a little more work to resolve definitively.
422
     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
423
     * combination of quick check + normalization, to avoid
424
     * re-checking the "yes" prefix.
425
     * @param s input string
426
     * @param errorCode Standard ICU error code. Its input value must
427
     *                  pass the U_SUCCESS() test, or else the function returns
428
     *                  immediately. Check for U_FAILURE() on output or use with
429
     *                  function chaining. (See User Guide for details.)
430
     * @return UNormalizationCheckResult
431
     * @stable ICU 4.4
432
     */
433
    virtual UNormalizationCheckResult
434
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
435
436
    /**
437
     * Returns the end of the normalized substring of the input string.
438
     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
439
     * the substring <code>UnicodeString(s, 0, end)</code>
440
     * will pass the quick check with a "yes" result.
441
     *
442
     * The returned end index is usually one or more characters before the
443
     * "no" or "maybe" character: The end index is at a normalization boundary.
444
     * (See the class documentation for more about normalization boundaries.)
445
     *
446
     * When the goal is a normalized string and most input strings are expected
447
     * to be normalized already, then call this method,
448
     * and if it returns a prefix shorter than the input string,
449
     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
450
     * @param s input string
451
     * @param errorCode Standard ICU error code. Its input value must
452
     *                  pass the U_SUCCESS() test, or else the function returns
453
     *                  immediately. Check for U_FAILURE() on output or use with
454
     *                  function chaining. (See User Guide for details.)
455
     * @return "yes" span end index
456
     * @stable ICU 4.4
457
     */
458
    virtual int32_t
459
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
460
461
    /**
462
     * Tests if the character always has a normalization boundary before it,
463
     * regardless of context.
464
     * If true, then the character does not normalization-interact with
465
     * preceding characters.
466
     * In other words, a string containing this character can be normalized
467
     * by processing portions before this character and starting from this
468
     * character independently.
469
     * This is used for iterative normalization. See the class documentation for details.
470
     * @param c character to test
471
     * @return true if c has a normalization boundary before it
472
     * @stable ICU 4.4
473
     */
474
    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
475
476
    /**
477
     * Tests if the character always has a normalization boundary after it,
478
     * regardless of context.
479
     * If true, then the character does not normalization-interact with
480
     * following characters.
481
     * In other words, a string containing this character can be normalized
482
     * by processing portions up to this character and after this
483
     * character independently.
484
     * This is used for iterative normalization. See the class documentation for details.
485
     * Note that this operation may be significantly slower than hasBoundaryBefore().
486
     * @param c character to test
487
     * @return true if c has a normalization boundary after it
488
     * @stable ICU 4.4
489
     */
490
    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
491
492
    /**
493
     * Tests if the character is normalization-inert.
494
     * If true, then the character does not change, nor normalization-interact with
495
     * preceding or following characters.
496
     * In other words, a string containing this character can be normalized
497
     * by processing portions before this character and after this
498
     * character independently.
499
     * This is used for iterative normalization. See the class documentation for details.
500
     * Note that this operation may be significantly slower than hasBoundaryBefore().
501
     * @param c character to test
502
     * @return true if c is normalization-inert
503
     * @stable ICU 4.4
504
     */
505
    virtual UBool isInert(UChar32 c) const = 0;
506
};
507
508
/**
509
 * Normalization filtered by a UnicodeSet.
510
 * Normalizes portions of the text contained in the filter set and leaves
511
 * portions not contained in the filter set unchanged.
512
 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
513
 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
514
 * This class implements all of (and only) the Normalizer2 API.
515
 * An instance of this class is unmodifiable/immutable, but it is constructed and
516
 * must be destroyed by its owner.
517
 * @stable ICU 4.4
518
 */
519
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
520
public:
521
    /**
522
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
523
     * and a filter set.
524
     * Both are aliased (held by reference, not copied) and must not be modified or
525
     * deleted while this object is used.
526
     * The filter set should be frozen; otherwise the performance will suffer greatly.
527
     * @param n2 wrapped Normalizer2 instance
528
     * @param filterSet UnicodeSet which determines the characters to be normalized
529
     * @stable ICU 4.4
530
     */
531
    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
532
0
            norm2(n2), set(filterSet) {}  // binds the two reference members; nothing is copied
533
534
    /**
535
     * Destructor.
536
     * @stable ICU 4.4
537
     */
538
    ~FilteredNormalizer2();
539
540
    /**
541
     * Writes the normalized form of the source string to the destination string
542
     * (replacing its contents) and returns the destination string.
543
     * The source and destination strings must be different objects.
544
     * @param src source string
545
     * @param dest destination string; its contents are replaced with the normalized src
546
     * @param errorCode Standard ICU error code. Its input value must
547
     *                  pass the U_SUCCESS() test, or else the function returns
548
     *                  immediately. Check for U_FAILURE() on output or use with
549
     *                  function chaining. (See User Guide for details.)
550
     * @return dest
551
     * @stable ICU 4.4
552
     */
553
    virtual UnicodeString &
554
    normalize(const UnicodeString &src,
555
              UnicodeString &dest,
556
              UErrorCode &errorCode) const override;
557
558
    /**
559
     * Normalizes a UTF-8 string and optionally records how source substrings
560
     * relate to changed and unchanged result substrings.
561
     *
562
     * Implemented completely for most built-in modes except for FCD.
563
     * The base class implementation converts to & from UTF-16 and does not support edits.
564
     *
565
     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
566
     * @param src       Source UTF-8 string.
567
     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
568
     *                  sink.Flush() is called at the end.
569
     * @param edits     Records edits for index mapping, working with styled text,
570
     *                  and getting only changes (if any).
571
     *                  The Edits contents are undefined if any error occurs.
572
     *                  This function calls edits->reset() first unless
573
     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
574
     * @param errorCode Standard ICU error code. Its input value must
575
     *                  pass the U_SUCCESS() test, or else the function returns
576
     *                  immediately. Check for U_FAILURE() on output or use with
577
     *                  function chaining. (See User Guide for details.)
578
     * @stable ICU 60
579
     */
580
    virtual void
581
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
582
                  Edits *edits, UErrorCode &errorCode) const override;
583
584
    /**
585
     * Appends the normalized form of the second string to the first string
586
     * (merging them at the boundary) and returns the first string.
587
     * The result is normalized if the first string was normalized.
588
     * The first and second strings must be different objects.
589
     * @param first string, should be normalized
590
     * @param second string, will be normalized
591
     * @param errorCode Standard ICU error code. Its input value must
592
     *                  pass the U_SUCCESS() test, or else the function returns
593
     *                  immediately. Check for U_FAILURE() on output or use with
594
     *                  function chaining. (See User Guide for details.)
595
     * @return first
596
     * @stable ICU 4.4
597
     */
598
    virtual UnicodeString &
599
    normalizeSecondAndAppend(UnicodeString &first,
600
                             const UnicodeString &second,
601
                             UErrorCode &errorCode) const override;
602
    /**
603
     * Appends the second string to the first string
604
     * (merging them at the boundary) and returns the first string.
605
     * The result is normalized if both strings were normalized.
606
     * The first and second strings must be different objects.
607
     * @param first string, should be normalized
608
     * @param second string, should be normalized
609
     * @param errorCode Standard ICU error code. Its input value must
610
     *                  pass the U_SUCCESS() test, or else the function returns
611
     *                  immediately. Check for U_FAILURE() on output or use with
612
     *                  function chaining. (See User Guide for details.)
613
     * @return first
614
     * @stable ICU 4.4
615
     */
616
    virtual UnicodeString &
617
    append(UnicodeString &first,
618
           const UnicodeString &second,
619
           UErrorCode &errorCode) const override;
620
621
    /**
622
     * Gets the decomposition mapping of c.
623
     * For details see the base class documentation.
624
     *
625
     * This function is independent of the mode of the Normalizer2.
626
     * @param c code point
627
     * @param decomposition String object which will be set to c's
628
     *                      decomposition mapping, if there is one.
629
     * @return true if c has a decomposition, otherwise false
630
     * @stable ICU 4.6
631
     */
632
    virtual UBool
633
    getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
634
635
    /**
636
     * Gets the raw decomposition mapping of c.
637
     * For details see the base class documentation.
638
     *
639
     * This function is independent of the mode of the Normalizer2.
640
     * @param c code point
641
     * @param decomposition String object which will be set to c's
642
     *                      raw decomposition mapping, if there is one.
643
     * @return true if c has a decomposition, otherwise false
644
     * @stable ICU 49
645
     */
646
    virtual UBool
647
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
648
649
    /**
650
     * Performs pairwise composition of a & b and returns the composite if there is one.
651
     * For details see the base class documentation.
652
     *
653
     * This function is independent of the mode of the Normalizer2.
654
     * @param a A (normalization starter) code point.
655
     * @param b Another code point.
656
     * @return The non-negative composite code point if there is one; otherwise a negative value.
657
     * @stable ICU 49
658
     */
659
    virtual UChar32
660
    composePair(UChar32 a, UChar32 b) const override;
661
662
    /**
663
     * Gets the combining class of c.
664
     * The default implementation returns 0
665
     * but all standard implementations return the Unicode Canonical_Combining_Class value.
666
     * @param c code point
667
     * @return c's combining class
668
     * @stable ICU 49
669
     */
670
    virtual uint8_t
671
    getCombiningClass(UChar32 c) const override;
672
673
    /**
674
     * Tests if the string is normalized.
675
     * For details see the Normalizer2 base class documentation.
676
     * @param s input string
677
     * @param errorCode Standard ICU error code. Its input value must
678
     *                  pass the U_SUCCESS() test, or else the function returns
679
     *                  immediately. Check for U_FAILURE() on output or use with
680
     *                  function chaining. (See User Guide for details.)
681
     * @return true if s is normalized
682
     * @stable ICU 4.4
683
     */
684
    virtual UBool
685
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
686
    /**
687
     * Tests if the UTF-8 string is normalized.
688
     * Internally, in cases where the quickCheck() method would return "maybe"
689
     * (which is only possible for the two COMPOSE modes) this method
690
     * resolves to "yes" or "no" to provide a definitive result,
691
     * at the cost of doing more work in those cases.
692
     *
693
     * This works for all normalization modes.
694
     * It is optimized for UTF-8 for all built-in modes except for FCD.
695
     * The base class implementation converts to UTF-16 and calls isNormalized().
696
     *
697
     * @param s UTF-8 input string
698
     * @param errorCode Standard ICU error code. Its input value must
699
     *                  pass the U_SUCCESS() test, or else the function returns
700
     *                  immediately. Check for U_FAILURE() on output or use with
701
     *                  function chaining. (See User Guide for details.)
702
     * @return true if s is normalized
703
     * @stable ICU 60
704
     */
705
    virtual UBool
706
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
707
    /**
708
     * Quick-checks whether the string is normalized (the result may be "maybe").
709
     * For details see the Normalizer2 base class documentation.
710
     * @param s input string
711
     * @param errorCode Standard ICU error code. Its input value must
712
     *                  pass the U_SUCCESS() test, or else the function returns
713
     *                  immediately. Check for U_FAILURE() on output or use with
714
     *                  function chaining. (See User Guide for details.)
715
     * @return the UNormalizationCheckResult for s
716
     * @stable ICU 4.4
717
     */
718
    virtual UNormalizationCheckResult
719
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
720
    /**
721
     * Returns the end of the normalized substring of the input string.
722
     * For details see the Normalizer2 base class documentation.
723
     * @param s input string
724
     * @param errorCode Standard ICU error code. Its input value must
725
     *                  pass the U_SUCCESS() test, or else the function returns
726
     *                  immediately. Check for U_FAILURE() on output or use with
727
     *                  function chaining. (See User Guide for details.)
728
     * @return "yes" span end index
729
     * @stable ICU 4.4
730
     */
731
    virtual int32_t
732
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
733
734
    /**
735
     * Tests if the character always has a normalization boundary before it,
736
     * regardless of context.
737
     * For details see the Normalizer2 base class documentation.
738
     * @param c character to test
739
     * @return true if c has a normalization boundary before it
740
     * @stable ICU 4.4
741
     */
742
    virtual UBool hasBoundaryBefore(UChar32 c) const override;
743
744
    /**
745
     * Tests if the character always has a normalization boundary after it,
746
     * regardless of context.
747
     * For details see the Normalizer2 base class documentation.
748
     * @param c character to test
749
     * @return true if c has a normalization boundary after it
750
     * @stable ICU 4.4
751
     */
752
    virtual UBool hasBoundaryAfter(UChar32 c) const override;
753
754
    /**
755
     * Tests if the character is normalization-inert.
756
     * For details see the Normalizer2 base class documentation.
757
     * @param c character to test
758
     * @return true if c is normalization-inert
759
     * @stable ICU 4.4
760
     */
761
    virtual UBool isInert(UChar32 c) const override;
762
private:  // non-virtual overloads with extra control parameters, followed by the aliased members
763
    UnicodeString &
764
    normalize(const UnicodeString &src,
765
              UnicodeString &dest,
766
              USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition the filter scan starts with -- confirm in the implementation
767
              UErrorCode &errorCode) const;
768
769
    void
770
    normalizeUTF8(uint32_t options, const char *src, int32_t length,
771
                  ByteSink &sink, Edits *edits,
772
                  USetSpanCondition spanCondition,  // NOTE(review): presumably same role as in the UTF-16 overload above -- confirm
773
                  UErrorCode &errorCode) const;
774
775
    UnicodeString &
776
    normalizeSecondAndAppend(UnicodeString &first,
777
                             const UnicodeString &second,
778
                             UBool doNormalize,  // NOTE(review): presumably distinguishes normalizeSecondAndAppend() from append() -- confirm
779
                             UErrorCode &errorCode) const;
780
781
    const Normalizer2 &norm2;  // wrapped normalizer; a reference bound to the constructor's n2 argument
782
    const UnicodeSet &set;  // filter set; a reference bound to the constructor's filterSet argument
783
};
784
785
U_NAMESPACE_END
786
787
#endif  // !UCONFIG_NO_NORMALIZATION
788
789
#endif /* U_SHOW_CPLUSPLUS_API */
790
791
#endif  // __NORMALIZER2_H__