Coverage Report

Created: 2023-02-22 06:51

/src/icu/source/common/unicode/normalizer2.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2009-2013, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  normalizer2.h
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2009nov22
16
*   created by: Markus W. Scherer
17
*/
18
19
#ifndef __NORMALIZER2_H__
20
#define __NORMALIZER2_H__
21
22
/**
23
 * \file
24
 * \brief C++ API: New API for Unicode Normalization.
25
 */
26
27
#include "unicode/utypes.h"
28
29
#if U_SHOW_CPLUSPLUS_API
30
31
#if !UCONFIG_NO_NORMALIZATION
32
33
#include "unicode/stringpiece.h"
34
#include "unicode/uniset.h"
35
#include "unicode/unistr.h"
36
#include "unicode/unorm2.h"
37
38
U_NAMESPACE_BEGIN
39
40
class ByteSink;
41
42
/**
43
 * Unicode normalization functionality for standard Unicode normalization or
44
 * for using custom mapping tables.
45
 * All instances of this class are unmodifiable/immutable.
46
 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
47
 * The Normalizer2 class is not intended for public subclassing.
48
 *
49
 * The primary functions are to produce a normalized string and to detect whether
50
 * a string is already normalized.
51
 * The most commonly used normalization forms are those defined in
52
 * http://www.unicode.org/unicode/reports/tr15/
53
 * However, this API supports additional normalization forms for specialized purposes.
54
 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
55
 * and can be used in implementations of UTS #46.
56
 *
57
 * Not only are the standard compose and decompose modes supplied,
58
 * but additional modes are provided as documented in the UNormalization2Mode enum.
59
 *
60
 * Some of the functions in this class identify normalization boundaries.
61
 * At a normalization boundary, the portions of the string
62
 * before it and starting from it do not interact and can be handled independently.
63
 *
64
 * The spanQuickCheckYes() stops at a normalization boundary.
65
 * When the goal is a normalized string, then the text before the boundary
66
 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67
 *
68
 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69
 * a character is guaranteed to be at a normalization boundary,
70
 * regardless of context.
71
 * This is used for moving from one normalization boundary to the next
72
 * or preceding boundary, and for performing iterative normalization.
73
 *
74
 * Iterative normalization is useful when only a small portion of a
75
 * longer string needs to be processed.
76
 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77
 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78
 * (to process only the substring for which sort key bytes are computed).
79
 *
80
 * The set of normalization boundaries returned by these functions may not be
81
 * complete: There may be more boundaries that could be returned.
82
 * Different functions may return different boundaries.
83
 * @stable ICU 4.4
84
 */
85
class U_COMMON_API Normalizer2 : public UObject {
86
public:
87
    /**
88
     * Destructor.
89
     * @stable ICU 4.4
90
     */
91
    ~Normalizer2();
92
93
    /**
94
     * Returns a Normalizer2 instance for Unicode NFC normalization.
95
     * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
96
     * Returns an unmodifiable singleton instance. Do not delete it.
97
     * @param errorCode Standard ICU error code. Its input value must
98
     *                  pass the U_SUCCESS() test, or else the function returns
99
     *                  immediately. Check for U_FAILURE() on output or use with
100
     *                  function chaining. (See User Guide for details.)
101
     * @return the requested Normalizer2, if successful
102
     * @stable ICU 49
103
     */
104
    static const Normalizer2 *
105
    getNFCInstance(UErrorCode &errorCode);
106
107
    /**
108
     * Returns a Normalizer2 instance for Unicode NFD normalization.
109
     * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
110
     * Returns an unmodifiable singleton instance. Do not delete it.
111
     * @param errorCode Standard ICU error code. Its input value must
112
     *                  pass the U_SUCCESS() test, or else the function returns
113
     *                  immediately. Check for U_FAILURE() on output or use with
114
     *                  function chaining. (See User Guide for details.)
115
     * @return the requested Normalizer2, if successful
116
     * @stable ICU 49
117
     */
118
    static const Normalizer2 *
119
    getNFDInstance(UErrorCode &errorCode);
120
121
    /**
122
     * Returns a Normalizer2 instance for Unicode NFKC normalization.
123
     * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
124
     * Returns an unmodifiable singleton instance. Do not delete it.
125
     * @param errorCode Standard ICU error code. Its input value must
126
     *                  pass the U_SUCCESS() test, or else the function returns
127
     *                  immediately. Check for U_FAILURE() on output or use with
128
     *                  function chaining. (See User Guide for details.)
129
     * @return the requested Normalizer2, if successful
130
     * @stable ICU 49
131
     */
132
    static const Normalizer2 *
133
    getNFKCInstance(UErrorCode &errorCode);
134
135
    /**
136
     * Returns a Normalizer2 instance for Unicode NFKD normalization.
137
     * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
138
     * Returns an unmodifiable singleton instance. Do not delete it.
139
     * @param errorCode Standard ICU error code. Its input value must
140
     *                  pass the U_SUCCESS() test, or else the function returns
141
     *                  immediately. Check for U_FAILURE() on output or use with
142
     *                  function chaining. (See User Guide for details.)
143
     * @return the requested Normalizer2, if successful
144
     * @stable ICU 49
145
     */
146
    static const Normalizer2 *
147
    getNFKDInstance(UErrorCode &errorCode);
148
149
    /**
150
     * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
151
     * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
152
     * Returns an unmodifiable singleton instance. Do not delete it.
153
     * @param errorCode Standard ICU error code. Its input value must
154
     *                  pass the U_SUCCESS() test, or else the function returns
155
     *                  immediately. Check for U_FAILURE() on output or use with
156
     *                  function chaining. (See User Guide for details.)
157
     * @return the requested Normalizer2, if successful
158
     * @stable ICU 49
159
     */
160
    static const Normalizer2 *
161
    getNFKCCasefoldInstance(UErrorCode &errorCode);
162
163
    /**
164
     * Returns a Normalizer2 instance which uses the specified data file
165
     * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
166
     * and which composes or decomposes text according to the specified mode.
167
     * Returns an unmodifiable singleton instance. Do not delete it.
168
     *
169
     * Use packageName=NULL for data files that are part of ICU's own data.
170
     * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
171
     * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
172
     * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
173
     *
174
     * @param packageName NULL for ICU built-in data, otherwise application data package name
175
     * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
176
     * @param mode normalization mode (compose or decompose etc.)
177
     * @param errorCode Standard ICU error code. Its input value must
178
     *                  pass the U_SUCCESS() test, or else the function returns
179
     *                  immediately. Check for U_FAILURE() on output or use with
180
     *                  function chaining. (See User Guide for details.)
181
     * @return the requested Normalizer2, if successful
182
     * @stable ICU 4.4
183
     */
184
    static const Normalizer2 *
185
    getInstance(const char *packageName,
186
                const char *name,
187
                UNormalization2Mode mode,
188
                UErrorCode &errorCode);
189
190
    /**
191
     * Returns the normalized form of the source string.
192
     * @param src source string
193
     * @param errorCode Standard ICU error code. Its input value must
194
     *                  pass the U_SUCCESS() test, or else the function returns
195
     *                  immediately. Check for U_FAILURE() on output or use with
196
     *                  function chaining. (See User Guide for details.)
197
     * @return normalized src
198
     * @stable ICU 4.4
199
     */
200
    UnicodeString
201
0
    normalize(const UnicodeString &src, UErrorCode &errorCode) const {  // convenience overload: returns the result as a new string
202
0
        UnicodeString result;  // fresh local destination, so it is a different object from src
203
0
        normalize(src, result, errorCode);  // delegates to the three-argument overload; errorCode is passed straight through
204
0
        return result;  // returned by value
205
0
    }
206
    /**
207
     * Writes the normalized form of the source string to the destination string
208
     * (replacing its contents) and returns the destination string.
209
     * The source and destination strings must be different objects.
210
     * @param src source string
211
     * @param dest destination string; its contents are replaced with normalized src
212
     * @param errorCode Standard ICU error code. Its input value must
213
     *                  pass the U_SUCCESS() test, or else the function returns
214
     *                  immediately. Check for U_FAILURE() on output or use with
215
     *                  function chaining. (See User Guide for details.)
216
     * @return dest
217
     * @stable ICU 4.4
218
     */
219
    virtual UnicodeString &
220
    normalize(const UnicodeString &src,
221
              UnicodeString &dest,
222
              UErrorCode &errorCode) const = 0;
223
224
    /**
225
     * Normalizes a UTF-8 string and optionally records how source substrings
226
     * relate to changed and unchanged result substrings.
227
     *
228
     * Implemented completely for all built-in modes except for FCD.
229
     * The base class implementation converts to & from UTF-16 and does not support edits.
230
     *
231
     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
232
     * @param src       Source UTF-8 string.
233
     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
234
     *                  sink.Flush() is called at the end.
235
     * @param edits     Records edits for index mapping, working with styled text,
236
     *                  and getting only changes (if any).
237
     *                  The contents of edits are undefined if any error occurs.
238
     *                  This function calls edits->reset() first unless
239
     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
240
     * @param errorCode Standard ICU error code. Its input value must
241
     *                  pass the U_SUCCESS() test, or else the function returns
242
     *                  immediately. Check for U_FAILURE() on output or use with
243
     *                  function chaining. (See User Guide for details.)
244
     * @stable ICU 60
245
     */
246
    virtual void
247
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248
                  Edits *edits, UErrorCode &errorCode) const;
249
250
    /**
251
     * Appends the normalized form of the second string to the first string
252
     * (merging them at the boundary) and returns the first string.
253
     * The result is normalized if the first string was normalized.
254
     * The first and second strings must be different objects.
255
     * @param first string, should be normalized
256
     * @param second string, will be normalized
257
     * @param errorCode Standard ICU error code. Its input value must
258
     *                  pass the U_SUCCESS() test, or else the function returns
259
     *                  immediately. Check for U_FAILURE() on output or use with
260
     *                  function chaining. (See User Guide for details.)
261
     * @return first
262
     * @stable ICU 4.4
263
     */
264
    virtual UnicodeString &
265
    normalizeSecondAndAppend(UnicodeString &first,
266
                             const UnicodeString &second,
267
                             UErrorCode &errorCode) const = 0;
268
    /**
269
     * Appends the second string to the first string
270
     * (merging them at the boundary) and returns the first string.
271
     * The result is normalized if both the strings were normalized.
272
     * The first and second strings must be different objects.
273
     * @param first string, should be normalized
274
     * @param second string, should be normalized
275
     * @param errorCode Standard ICU error code. Its input value must
276
     *                  pass the U_SUCCESS() test, or else the function returns
277
     *                  immediately. Check for U_FAILURE() on output or use with
278
     *                  function chaining. (See User Guide for details.)
279
     * @return first
280
     * @stable ICU 4.4
281
     */
282
    virtual UnicodeString &
283
    append(UnicodeString &first,
284
           const UnicodeString &second,
285
           UErrorCode &errorCode) const = 0;
286
287
    /**
288
     * Gets the decomposition mapping of c.
289
     * Roughly equivalent to normalizing the String form of c
290
     * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalization,
291
     * this function returns false and does not write a string
292
     * if c does not have a decomposition mapping in this instance's data.
293
     * This function is independent of the mode of the Normalizer2.
294
     * @param c code point
295
     * @param decomposition String object which will be set to c's
296
     *                      decomposition mapping, if there is one.
297
     * @return true if c has a decomposition, otherwise false
298
     * @stable ICU 4.6
299
     */
300
    virtual UBool
301
    getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302
303
    /**
304
     * Gets the raw decomposition mapping of c.
305
     *
306
     * This is similar to the getDecomposition() method but returns the
307
     * raw decomposition mapping as specified in UnicodeData.txt or
308
     * (for custom data) in the mapping files processed by the gennorm2 tool.
309
     * By contrast, getDecomposition() returns the processed,
310
     * recursively-decomposed version of this mapping.
311
     *
312
     * When used on a standard NFKC Normalizer2 instance,
313
     * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
314
     *
315
     * When used on a standard NFC Normalizer2 instance,
316
     * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
317
     * in this case, the result contains either one or two code points (=1..4 char16_ts).
318
     *
319
     * This function is independent of the mode of the Normalizer2.
320
     * The default implementation returns false.
321
     * @param c code point
322
     * @param decomposition String object which will be set to c's
323
     *                      raw decomposition mapping, if there is one.
324
     * @return true if c has a decomposition, otherwise false
325
     * @stable ICU 49
326
     */
327
    virtual UBool
328
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329
330
    /**
331
     * Performs pairwise composition of a & b and returns the composite if there is one.
332
     *
333
     * Returns a composite code point c only if c has a two-way mapping to a+b.
334
     * In standard Unicode normalization, this means that
335
     * c has a canonical decomposition to a+b
336
     * and c does not have the Full_Composition_Exclusion property.
337
     *
338
     * This function is independent of the mode of the Normalizer2.
339
     * The default implementation returns a negative value.
340
     * @param a A (normalization starter) code point.
341
     * @param b Another code point.
342
     * @return The non-negative composite code point if there is one; otherwise a negative value.
343
     * @stable ICU 49
344
     */
345
    virtual UChar32
346
    composePair(UChar32 a, UChar32 b) const;
347
348
    /**
349
     * Gets the combining class of c.
350
     * The default implementation returns 0
351
     * but all standard implementations return the Unicode Canonical_Combining_Class value.
352
     * @param c code point
353
     * @return c's combining class
354
     * @stable ICU 49
355
     */
356
    virtual uint8_t
357
    getCombiningClass(UChar32 c) const;
358
359
    /**
360
     * Tests if the string is normalized.
361
     * Internally, in cases where the quickCheck() method would return "maybe"
362
     * (which is only possible for the two COMPOSE modes) this method
363
     * resolves to "yes" or "no" to provide a definitive result,
364
     * at the cost of doing more work in those cases.
365
     * @param s input string
366
     * @param errorCode Standard ICU error code. Its input value must
367
     *                  pass the U_SUCCESS() test, or else the function returns
368
     *                  immediately. Check for U_FAILURE() on output or use with
369
     *                  function chaining. (See User Guide for details.)
370
     * @return true if s is normalized
371
     * @stable ICU 4.4
372
     */
373
    virtual UBool
374
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
375
    /**
376
     * Tests if the UTF-8 string is normalized.
377
     * Internally, in cases where the quickCheck() method would return "maybe"
378
     * (which is only possible for the two COMPOSE modes) this method
379
     * resolves to "yes" or "no" to provide a definitive result,
380
     * at the cost of doing more work in those cases.
381
     *
382
     * This works for all normalization modes.
383
     * It is optimized for UTF-8 for all built-in modes except for FCD.
384
     * The base class implementation converts to UTF-16 and calls isNormalized().
385
     *
386
     * @param s UTF-8 input string
387
     * @param errorCode Standard ICU error code. Its input value must
388
     *                  pass the U_SUCCESS() test, or else the function returns
389
     *                  immediately. Check for U_FAILURE() on output or use with
390
     *                  function chaining. (See User Guide for details.)
391
     * @return true if s is normalized
392
     * @stable ICU 60
393
     */
394
    virtual UBool
395
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
396
397
398
    /**
399
     * Tests if the string is normalized.
400
     * For the two COMPOSE modes, the result could be "maybe" in cases that
401
     * would take a little more work to resolve definitively.
402
     * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
403
     * combination of quick check + normalization, to avoid
404
     * re-checking the "yes" prefix.
405
     * @param s input string
406
     * @param errorCode Standard ICU error code. Its input value must
407
     *                  pass the U_SUCCESS() test, or else the function returns
408
     *                  immediately. Check for U_FAILURE() on output or use with
409
     *                  function chaining. (See User Guide for details.)
410
     * @return UNormalizationCheckResult
411
     * @stable ICU 4.4
412
     */
413
    virtual UNormalizationCheckResult
414
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
415
416
    /**
417
     * Returns the end of the normalized substring of the input string.
418
     * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
419
     * the substring <code>UnicodeString(s, 0, end)</code>
420
     * will pass the quick check with a "yes" result.
421
     *
422
     * The returned end index is usually one or more characters before the
423
     * "no" or "maybe" character: The end index is at a normalization boundary.
424
     * (See the class documentation for more about normalization boundaries.)
425
     *
426
     * When the goal is a normalized string and most input strings are expected
427
     * to be normalized already, then call this method,
428
     * and if it returns a prefix shorter than the input string,
429
     * copy that prefix and use normalizeSecondAndAppend() for the remainder.
430
     * @param s input string
431
     * @param errorCode Standard ICU error code. Its input value must
432
     *                  pass the U_SUCCESS() test, or else the function returns
433
     *                  immediately. Check for U_FAILURE() on output or use with
434
     *                  function chaining. (See User Guide for details.)
435
     * @return "yes" span end index
436
     * @stable ICU 4.4
437
     */
438
    virtual int32_t
439
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
440
441
    /**
442
     * Tests if the character always has a normalization boundary before it,
443
     * regardless of context.
444
     * If true, then the character does not normalization-interact with
445
     * preceding characters.
446
     * In other words, a string containing this character can be normalized
447
     * by processing portions before this character and starting from this
448
     * character independently.
449
     * This is used for iterative normalization. See the class documentation for details.
450
     * @param c character to test
451
     * @return true if c has a normalization boundary before it
452
     * @stable ICU 4.4
453
     */
454
    virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
455
456
    /**
457
     * Tests if the character always has a normalization boundary after it,
458
     * regardless of context.
459
     * If true, then the character does not normalization-interact with
460
     * following characters.
461
     * In other words, a string containing this character can be normalized
462
     * by processing portions up to this character and after this
463
     * character independently.
464
     * This is used for iterative normalization. See the class documentation for details.
465
     * Note that this operation may be significantly slower than hasBoundaryBefore().
466
     * @param c character to test
467
     * @return true if c has a normalization boundary after it
468
     * @stable ICU 4.4
469
     */
470
    virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
471
472
    /**
473
     * Tests if the character is normalization-inert.
474
     * If true, then the character does not change, nor normalization-interact with
475
     * preceding or following characters.
476
     * In other words, a string containing this character can be normalized
477
     * by processing portions before this character and after this
478
     * character independently.
479
     * This is used for iterative normalization. See the class documentation for details.
480
     * Note that this operation may be significantly slower than hasBoundaryBefore().
481
     * @param c character to test
482
     * @return true if c is normalization-inert
483
     * @stable ICU 4.4
484
     */
485
    virtual UBool isInert(UChar32 c) const = 0;
486
};
487
488
/**
489
 * Normalization filtered by a UnicodeSet.
490
 * Normalizes portions of the text contained in the filter set and leaves
491
 * portions not contained in the filter set unchanged.
492
 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
493
 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
494
 * This class implements all of (and only) the Normalizer2 API.
495
 * An instance of this class is unmodifiable/immutable but is constructed and
496
 * must be destructed by the owner.
497
 * @stable ICU 4.4
498
 */
499
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
500
public:
501
    /**
502
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
503
     * and a filter set.
504
     * Both are aliased and must not be modified or deleted while this object
505
     * is used.
506
     * The filter set should be frozen; otherwise the performance will suffer greatly.
507
     * @param n2 wrapped Normalizer2 instance
508
     * @param filterSet UnicodeSet which determines the characters to be normalized
509
     * @stable ICU 4.4
510
     */
511
    FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
512
0
            norm2(n2), set(filterSet) {}  // initializes members norm2 and set from the two arguments; the body is empty
513
514
    /**
515
     * Destructor.
516
     * @stable ICU 4.4
517
     */
518
    ~FilteredNormalizer2();
519
520
    /**
521
     * Writes the normalized form of the source string to the destination string
522
     * (replacing its contents) and returns the destination string.
523
     * The source and destination strings must be different objects.
524
     * @param src source string
525
     * @param dest destination string; its contents are replaced with normalized src
526
     * @param errorCode Standard ICU error code. Its input value must
527
     *                  pass the U_SUCCESS() test, or else the function returns
528
     *                  immediately. Check for U_FAILURE() on output or use with
529
     *                  function chaining. (See User Guide for details.)
530
     * @return dest
531
     * @stable ICU 4.4
532
     */
533
    virtual UnicodeString &
534
    normalize(const UnicodeString &src,
535
              UnicodeString &dest,
536
              UErrorCode &errorCode) const U_OVERRIDE;
537
538
    /**
539
     * Normalizes a UTF-8 string and optionally records how source substrings
540
     * relate to changed and unchanged result substrings.
541
     *
542
     * Implemented completely for most built-in modes except for FCD.
543
     * The base class implementation converts to & from UTF-16 and does not support edits.
544
     *
545
     * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
546
     * @param src       Source UTF-8 string.
547
     * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
548
     *                  sink.Flush() is called at the end.
549
     * @param edits     Records edits for index mapping, working with styled text,
550
     *                  and getting only changes (if any).
551
     *                  The contents of edits are undefined if any error occurs.
552
     *                  This function calls edits->reset() first unless
553
     *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
554
     * @param errorCode Standard ICU error code. Its input value must
555
     *                  pass the U_SUCCESS() test, or else the function returns
556
     *                  immediately. Check for U_FAILURE() on output or use with
557
     *                  function chaining. (See User Guide for details.)
558
     * @stable ICU 60
559
     */
560
    virtual void
561
    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
562
                  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
563
564
    /**
565
     * Appends the normalized form of the second string to the first string
566
     * (merging them at the boundary) and returns the first string.
567
     * The result is normalized if the first string was normalized.
568
     * The first and second strings must be different objects.
569
     * @param first string, should be normalized
570
     * @param second string, will be normalized
571
     * @param errorCode Standard ICU error code. Its input value must
572
     *                  pass the U_SUCCESS() test, or else the function returns
573
     *                  immediately. Check for U_FAILURE() on output or use with
574
     *                  function chaining. (See User Guide for details.)
575
     * @return first
576
     * @stable ICU 4.4
577
     */
578
    virtual UnicodeString &
579
    normalizeSecondAndAppend(UnicodeString &first,
580
                             const UnicodeString &second,
581
                             UErrorCode &errorCode) const U_OVERRIDE;
582
    /**
583
     * Appends the second string to the first string
584
     * (merging them at the boundary) and returns the first string.
585
     * The result is normalized if both the strings were normalized.
586
     * The first and second strings must be different objects.
587
     * @param first string, should be normalized
588
     * @param second string, should be normalized
589
     * @param errorCode Standard ICU error code. Its input value must
590
     *                  pass the U_SUCCESS() test, or else the function returns
591
     *                  immediately. Check for U_FAILURE() on output or use with
592
     *                  function chaining. (See User Guide for details.)
593
     * @return first
594
     * @stable ICU 4.4
595
     */
596
    virtual UnicodeString &
597
    append(UnicodeString &first,
598
           const UnicodeString &second,
599
           UErrorCode &errorCode) const U_OVERRIDE;
600
601
    /**
602
     * Gets the decomposition mapping of c.
603
     * For details see the base class documentation.
604
     *
605
     * This function is independent of the mode of the Normalizer2.
606
     * @param c code point
607
     * @param decomposition String object which will be set to c's
608
     *                      decomposition mapping, if there is one.
609
     * @return true if c has a decomposition, otherwise false
610
     * @stable ICU 4.6
611
     */
612
    virtual UBool
613
    getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
614
615
    /**
616
     * Gets the raw decomposition mapping of c.
617
     * For details see the base class documentation.
618
     *
619
     * This function is independent of the mode of the Normalizer2.
620
     * @param c code point
621
     * @param decomposition String object which will be set to c's
622
     *                      raw decomposition mapping, if there is one.
623
     * @return true if c has a decomposition, otherwise false
624
     * @stable ICU 49
625
     */
626
    virtual UBool
627
    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
628
629
    /**
630
     * Performs pairwise composition of a & b and returns the composite if there is one.
631
     * For details see the base class documentation.
632
     *
633
     * This function is independent of the mode of the Normalizer2.
634
     * @param a A (normalization starter) code point.
635
     * @param b Another code point.
636
     * @return The non-negative composite code point if there is one; otherwise a negative value.
637
     * @stable ICU 49
638
     */
639
    virtual UChar32
640
    composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
641
642
    /**
643
     * Gets the combining class of c.
644
     * The default implementation returns 0
645
     * but all standard implementations return the Unicode Canonical_Combining_Class value.
646
     * @param c code point whose combining class is requested
647
     * @return c's combining class, as a uint8_t value
648
     * @stable ICU 49
649
     */
650
    virtual uint8_t
651
    getCombiningClass(UChar32 c) const U_OVERRIDE;
652
653
    /**
654
     * Tests if the string is normalized.
655
     * For details see the Normalizer2 base class documentation; see isNormalizedUTF8() for UTF-8 input.
656
     * @param s input string (a UnicodeString)
657
     * @param errorCode Standard ICU error code. Its input value must
658
     *                  pass the U_SUCCESS() test, or else the function returns
659
     *                  immediately. Check for U_FAILURE() on output or use with
660
     *                  function chaining. (See User Guide for details.)
661
     * @return true if s is normalized
662
     * @stable ICU 4.4
663
     */
664
    virtual UBool
665
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
666
    /**
667
     * Tests if the UTF-8 string is normalized.
668
     * Internally, in cases where the quickCheck() method would return "maybe"
669
     * (which is only possible for the two COMPOSE modes) this method
670
     * resolves to "yes" or "no" to provide a definitive result,
671
     * at the cost of doing more work in those cases.
672
     *
673
     * This works for all normalization modes.
674
     * It is optimized for UTF-8 for all built-in modes except for FCD.
675
     * The base class implementation converts to UTF-16 and calls isNormalized().
676
     *
677
     * @param s UTF-8 input string, passed by value as a StringPiece
678
     * @param errorCode Standard ICU error code. Its input value must
679
     *                  pass the U_SUCCESS() test, or else the function returns
680
     *                  immediately. Check for U_FAILURE() on output or use with
681
     *                  function chaining. (See User Guide for details.)
682
     * @return true if s is normalized
683
     * @stable ICU 60
684
     */
685
    virtual UBool
686
    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
687
    /**
688
     * Quick-checks whether the string is normalized; unlike isNormalized(), the result may be "maybe".
689
     * For details see the Normalizer2 base class documentation.
690
     * @param s input string
691
     * @param errorCode Standard ICU error code. Its input value must
692
     *                  pass the U_SUCCESS() test, or else the function returns
693
     *                  immediately. Check for U_FAILURE() on output or use with
694
     *                  function chaining. (See User Guide for details.)
695
     * @return UNormalizationCheckResult for s (not a UBool; use isNormalized() for a definitive answer)
696
     * @stable ICU 4.4
697
     */
698
    virtual UNormalizationCheckResult
699
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
700
    /**
701
     * Returns the end of the normalized substring of the input string.
702
     * For details see the Normalizer2 base class documentation.
703
     * @param s input string
704
     * @param errorCode Standard ICU error code. Its input value must
705
     *                  pass the U_SUCCESS() test, or else the function returns
706
     *                  immediately. Check for U_FAILURE() on output or use with
707
     *                  function chaining. (See User Guide for details.)
708
     * @return "yes" span end index, as an int32_t (presumably an index into s; see the base class documentation)
709
     * @stable ICU 4.4
710
     */
711
    virtual int32_t
712
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
713
714
    /**
715
     * Tests if the character always has a normalization boundary before it,
716
     * regardless of context.
717
     * For details see the Normalizer2 base class documentation.
718
     * @param c character (code point) to test
719
     * @return true if c always has a normalization boundary before it, regardless of context
720
     * @stable ICU 4.4
721
     */
722
    virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
723
724
    /**
725
     * Tests if the character always has a normalization boundary after it,
726
     * regardless of context.
727
     * For details see the Normalizer2 base class documentation.
728
     * @param c character (code point) to test
729
     * @return true if c always has a normalization boundary after it, regardless of context
730
     * @stable ICU 4.4
731
     */
732
    virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
733
734
    /**
735
     * Tests if the character is normalization-inert.
736
     * For details see the Normalizer2 base class documentation.
737
     * @param c character (code point) to test
738
     * @return true if c is normalization-inert, otherwise false
739
     * @stable ICU 4.4
740
     */
741
    virtual UBool isInert(UChar32 c) const U_OVERRIDE;
742
private:
743
    // Private helper named normalize(), declared without virtual/U_OVERRIDE.
    // Takes a source string, a destination string, a USetSpanCondition and the
    // standard ICU error code, and returns a UnicodeString reference.
    // NOTE(review): spanCondition presumably tells the helper whether the first
    // span of src is expected to be inside or outside `set` -- confirm against
    // the implementation. The returned reference is presumably dest -- confirm.
    UnicodeString &
744
    normalize(const UnicodeString &src,
745
              UnicodeString &dest,
746
              USetSpanCondition spanCondition,
747
              UErrorCode &errorCode) const;
748
749
    // Private helper named normalizeUTF8(), declared without virtual/U_OVERRIDE.
    // The input is passed as a (const char *src, int32_t length) pair; output
    // goes through the ByteSink, with an Edits pointer passed alongside it.
    // NOTE(review): edits is presumably optional (nullable) and spanCondition
    // presumably describes the first span of src relative to `set` -- confirm
    // both against the implementation.
    void
750
    normalizeUTF8(uint32_t options, const char *src, int32_t length,
751
                  ByteSink &sink, Edits *edits,
752
                  USetSpanCondition spanCondition,
753
                  UErrorCode &errorCode) const;
754
755
    // Private helper named normalizeSecondAndAppend(), declared without
    // virtual/U_OVERRIDE, that takes an extra UBool doNormalize flag.
    // NOTE(review): doNormalize presumably selects between normalizing `second`
    // before appending it to `first` and appending it as already-normalized text;
    // the returned reference is presumably `first` -- confirm against the
    // implementation.
    UnicodeString &
756
    normalizeSecondAndAppend(UnicodeString &first,
757
                             const UnicodeString &second,
758
                             UBool doNormalize,
759
                             UErrorCode &errorCode) const;
760
761
    // Reference members: this object refers to a Normalizer2 and a UnicodeSet
    // rather than holding copies of them.
    // NOTE(review): the referenced objects presumably must outlive this
    // instance -- confirm against the constructor documentation.
    const Normalizer2 &norm2;
762
    const UnicodeSet &set;
763
};
764
765
U_NAMESPACE_END
766
767
#endif  // !UCONFIG_NO_NORMALIZATION
768
769
#endif /* U_SHOW_CPLUSPLUS_API */
770
771
#endif  // __NORMALIZER2_H__