Coverage Report

Created: 2025-07-11 06:23

/src/icu/source/common/normalizer2.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2009-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  normalizer2.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2009nov22
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
21
#if !UCONFIG_NO_NORMALIZATION
22
23
#include "unicode/normalizer2.h"
24
#include "unicode/unistr.h"
25
#include "unicode/unorm.h"
26
#include "cstring.h"
27
#include "mutex.h"
28
#include "norm2allmodes.h"
29
#include "normalizer2impl.h"
30
#include "uassert.h"
31
#include "ucln_cmn.h"
32
33
using icu::Normalizer2Impl;
34
35
// NFC/NFD data machine-generated by gennorm2 --csource
36
#define INCLUDED_FROM_NORMALIZER2_CPP
37
#include "norm2_nfc_data.h"
38
39
U_NAMESPACE_BEGIN
40
41
// Public API dispatch via Normalizer2 subclasses -------------------------- ***
42
43
0
// Destructor defined out of line; the body is intentionally empty.
Normalizer2::~Normalizer2() {
}
44
45
// Default implementation: ignores both arguments and always returns FALSE.
// NoopNormalizer2 (below) relies on this default instead of overriding it.
UBool
Normalizer2::getRawDecomposition(UChar32 /*c*/, UnicodeString & /*decomposition*/) const {
    return FALSE;
}
49
50
// Default implementation: ignores both code points and always returns
// U_SENTINEL.
UChar32
Normalizer2::composePair(UChar32 /*a*/, UChar32 /*b*/) const {
    return U_SENTINEL;
}
54
55
// Default implementation: ignores the code point and always returns 0.
uint8_t
Normalizer2::getCombiningClass(UChar32 /*c*/) const {
    return 0;
}
59
60
// Normalizer2 implementation for the old UNORM_NONE.
61
class NoopNormalizer2 : public Normalizer2 {
62
    virtual ~NoopNormalizer2();
63
64
    virtual UnicodeString &
65
    normalize(const UnicodeString &src,
66
              UnicodeString &dest,
67
0
              UErrorCode &errorCode) const {
68
0
        if(U_SUCCESS(errorCode)) {
69
0
            if(&dest!=&src) {
70
0
                dest=src;
71
0
            } else {
72
0
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
73
0
            }
74
0
        }
75
0
        return dest;
76
0
    }
77
    virtual UnicodeString &
78
    normalizeSecondAndAppend(UnicodeString &first,
79
                             const UnicodeString &second,
80
0
                             UErrorCode &errorCode) const {
81
0
        if(U_SUCCESS(errorCode)) {
82
0
            if(&first!=&second) {
83
0
                first.append(second);
84
0
            } else {
85
0
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
86
0
            }
87
0
        }
88
0
        return first;
89
0
    }
90
    virtual UnicodeString &
91
    append(UnicodeString &first,
92
           const UnicodeString &second,
93
0
           UErrorCode &errorCode) const {
94
0
        if(U_SUCCESS(errorCode)) {
95
0
            if(&first!=&second) {
96
0
                first.append(second);
97
0
            } else {
98
0
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
99
0
            }
100
0
        }
101
0
        return first;
102
0
    }
103
    virtual UBool
104
0
    getDecomposition(UChar32, UnicodeString &) const {
105
0
        return FALSE;
106
0
    }
107
    // No need to override the default getRawDecomposition().
108
    virtual UBool
109
0
    isNormalized(const UnicodeString &, UErrorCode &) const {
110
0
        return TRUE;
111
0
    }
112
    virtual UNormalizationCheckResult
113
0
    quickCheck(const UnicodeString &, UErrorCode &) const {
114
0
        return UNORM_YES;
115
0
    }
116
    virtual int32_t
117
0
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
118
0
        return s.length();
119
0
    }
120
0
    virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
121
0
    virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
122
0
    virtual UBool isInert(UChar32) const { return TRUE; }
123
};
124
125
NoopNormalizer2::~NoopNormalizer2() {}
126
127
Normalizer2WithImpl::~Normalizer2WithImpl() {}
128
129
DecomposeNormalizer2::~DecomposeNormalizer2() {}
130
131
ComposeNormalizer2::~ComposeNormalizer2() {}
132
133
FCDNormalizer2::~FCDNormalizer2() {}
134
135
// instance cache ---------------------------------------------------------- ***
136
137
0
// Deletes the Normalizer2Impl held in the impl member.
Norm2AllModes::~Norm2AllModes() {
    delete impl;
}
140
141
// Wraps impl in a newly allocated Norm2AllModes.
//
// If errorCode already indicates a failure, or if the allocation fails
// (errorCode is then set to U_MEMORY_ALLOCATION_ERROR), impl is deleted
// and NULL is returned. Otherwise the new object is returned; its
// destructor deletes impl.
Norm2AllModes *
Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
    Norm2AllModes *result=NULL;
    if(U_SUCCESS(errorCode)) {
        result=new Norm2AllModes(impl);
        if(result==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
    if(result==NULL) {
        // Both failure paths end here: impl was not handed to a wrapper.
        delete impl;
    }
    return result;
}
155
156
// Builds a Norm2AllModes from the compiled-in norm2_nfc_data_* tables.
//
// Returns NULL if errorCode already indicates a failure, or if the
// Normalizer2Impl cannot be allocated (U_MEMORY_ALLOCATION_ERROR is set).
// Otherwise returns whatever createInstance() returns for the new impl.
Norm2AllModes *
Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    Normalizer2Impl *nfcImpl=new Normalizer2Impl;
    if(nfcImpl!=NULL) {
        nfcImpl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
                      norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
        // createInstance() deletes nfcImpl itself when it fails.
        return createInstance(nfcImpl, errorCode);
    }
    errorCode=U_MEMORY_ALLOCATION_ERROR;
    return NULL;
}
170
171
U_CDECL_BEGIN
172
static UBool U_CALLCONV uprv_normalizer2_cleanup();
173
U_CDECL_END
174
175
static Norm2AllModes *nfcSingleton;
176
static Normalizer2   *noopSingleton;
177
178
static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
179
static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
180
181
// UInitOnce singleton initialization functions
182
0
// Init function passed to umtx_initOnce() by Norm2AllModes::getNFCInstance().
// Stores the result of createNFCInstance() in nfcSingleton, then registers
// the cleanup callback; the registration is unconditional, so it also
// happens when creation failed.
static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
    nfcSingleton = Norm2AllModes::createNFCInstance(errorCode);
    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2,
                                uprv_normalizer2_cleanup);
}
186
187
0
// Init function passed to umtx_initOnce() by
// Normalizer2Factory::getNoopInstance().
// Does nothing if errorCode already indicates a failure. Otherwise
// allocates the NoopNormalizer2 singleton; on success the cleanup callback
// is registered, on allocation failure U_MEMORY_ALLOCATION_ERROR is set.
static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        noopSingleton=new NoopNormalizer2;
        if(noopSingleton!=NULL) {
            ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2,
                                        uprv_normalizer2_cleanup);
        } else {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
198
199
U_CDECL_BEGIN
200
201
0
// Cleanup callback registered under UCLN_COMMON_NORMALIZER2.
// Deletes both singletons, clears their pointers, and resets both
// UInitOnce guards. Always returns TRUE.
static UBool U_CALLCONV uprv_normalizer2_cleanup() {
    // Release the singleton objects first ...
    delete nfcSingleton;
    nfcSingleton=NULL;
    delete noopSingleton;
    noopSingleton=NULL;

    // ... then reset the guards.
    nfcInitOnce.reset();
    noopInitOnce.reset();
    return TRUE;
}
210
211
U_CDECL_END
212
213
// Returns the NFC Norm2AllModes singleton, creating it on first use via
// umtx_initOnce(). Returns NULL without attempting initialization if
// errorCode already indicates a failure on entry.
const Norm2AllModes *
Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
        return nfcSingleton;
    }
    return NULL;
}
219
220
// Returns the comp member of the NFC Norm2AllModes singleton,
// or NULL if that singleton is not available.
const Normalizer2 *
Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
    if(nfcModes==NULL) {
        return NULL;
    }
    return &nfcModes->comp;
}
225
226
// Returns the decomp member of the NFC Norm2AllModes singleton
// (NFD is served from the same singleton as NFC),
// or NULL if that singleton is not available.
const Normalizer2 *
Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
    if(nfcModes==NULL) {
        return NULL;
    }
    return &nfcModes->decomp;
}
231
232
0
// Returns the fcd member of the NFC Norm2AllModes singleton,
// or NULL if that singleton is not available.
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
    if(nfcModes==NULL) {
        return NULL;
    }
    return &nfcModes->fcd;
}
236
237
0
// Returns the fcc member of the NFC Norm2AllModes singleton,
// or NULL if that singleton is not available.
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
    if(nfcModes==NULL) {
        return NULL;
    }
    return &nfcModes->fcc;
}
241
242
0
// Returns the pass-through (no-op) normalizer singleton, creating it on
// first use via umtx_initOnce(). Returns NULL without attempting
// initialization if errorCode already indicates a failure on entry.
const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) {
        umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
        return noopSingleton;
    }
    return NULL;
}
247
248
// Returns the impl pointer of the NFC Norm2AllModes singleton,
// or NULL if that singleton is not available.
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    const Norm2AllModes *nfcModes=Norm2AllModes::getNFCInstance(errorCode);
    if(nfcModes==NULL) {
        return NULL;
    }
    return nfcModes->impl;
}
253
254
// Returns the address of the impl member of norm2, treating norm2 as a
// Normalizer2WithImpl. The cast is unchecked: the caller must pass a
// pointer that really refers to a Normalizer2WithImpl.
const Normalizer2Impl *
Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
    const Normalizer2WithImpl *withImpl=(const Normalizer2WithImpl *)norm2;
    return &withImpl->impl;
}
258
259
U_NAMESPACE_END
260
261
// C API ------------------------------------------------------------------- ***
262
263
U_NAMESPACE_USE
264
265
// C API: forwards to Normalizer2::getNFCInstance() and returns the result
// cast to const UNormalizer2 *. pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfc=Normalizer2::getNFCInstance(*pErrorCode);
    return (const UNormalizer2 *)nfc;
}
269
270
// C API: forwards to Normalizer2::getNFDInstance() and returns the result
// cast to const UNormalizer2 *. pErrorCode is dereferenced unconditionally.
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode *pErrorCode) {
    const Normalizer2 *nfd=Normalizer2::getNFDInstance(*pErrorCode);
    return (const UNormalizer2 *)nfd;
}
274
275
// C API: deletes the Normalizer2 object behind norm2.
// Deleting a NULL pointer is a no-op.
U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 *norm2) {
    Normalizer2 *n2=(Normalizer2 *)norm2;
    delete n2;
}
279
280
// C API: normalizes src[0..length) into dest[0..capacity).
//
// length may be -1 for a NUL-terminated src. Returns 0 if *pErrorCode
// already indicates a failure, or if the arguments are invalid
// (U_ILLEGAL_ARGUMENT_ERROR is then set). Otherwise returns the result of
// extracting the normalized string into dest.
U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
                 const UChar *src, int32_t length,
                 UChar *dest, int32_t capacity,
                 UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A NULL buffer must have length/capacity 0; otherwise length>=-1 and
    // capacity>=0. Source and destination must not be the same buffer.
    const UBool badSrc= src==NULL ? length!=0 : length<-1;
    const UBool badDest= dest==NULL ? capacity!=0 : capacity<0;
    const UBool sameBuffer= src!=NULL && src==dest;
    if(badSrc || badDest || sameBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(dest, 0, capacity);
    if(length==0) {
        // Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
        return destString.extract(dest, capacity, *pErrorCode);
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
    if(n2wi==NULL) {
        // Generic path through the public UnicodeString API.
        UnicodeString srcString(length<0, src, length);
        n2->normalize(srcString, destString, *pErrorCode);
    } else {
        // Avoid duplicate argument checking and support NUL-terminated src.
        ReorderingBuffer buffer(n2wi->impl, destString);
        if(buffer.init(length, *pErrorCode)) {
            const UChar *limit= length>=0 ? src+length : NULL;
            n2wi->normalize(src, limit, buffer, *pErrorCode);
        }
    }  // buffer goes out of scope here, before destString is extracted.
    return destString.extract(dest, capacity, *pErrorCode);
}
313
314
// Shared implementation of unorm2_normalizeSecondAndAppend() (doNormalize=TRUE)
// and unorm2_append() (doNormalize=FALSE).
//
// Appends second[0..secondLength) to first[0..firstLength), where first has
// room for firstCapacity UChars. Either length may be -1 for a
// NUL-terminated string. Returns 0 if *pErrorCode already indicates a
// failure or the arguments are invalid (U_ILLEGAL_ARGUMENT_ERROR is set);
// otherwise returns the result of extracting the combined string into first.
static int32_t
normalizeSecondAndAppend(const UNormalizer2 *norm2,
                         UChar *first, int32_t firstLength, int32_t firstCapacity,
                         const UChar *second, int32_t secondLength,
                         UBool doNormalize,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A NULL buffer must come with zero length (and zero capacity);
    // otherwise lengths must be >=-1 and the capacity >=0.
    // The two strings must not be the same buffer.
    const UBool badSecond= second==NULL ? secondLength!=0 : secondLength<-1;
    const UBool badFirst= first==NULL ?
        (firstCapacity!=0 || firstLength!=0) :
        (firstCapacity<0 || firstLength<-1);
    const UBool sameBuffer= first!=NULL && first==second;
    if(badSecond || badFirst || sameBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString firstString(first, firstLength, firstCapacity);
    firstLength=firstString.length();  // In case it was -1.
    // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
    if(secondLength!=0) {
        const Normalizer2 *n2=(const Normalizer2 *)norm2;
        const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
        if(n2wi==NULL) {
            // Generic path through the public UnicodeString API.
            UnicodeString secondString(secondLength<0, second, secondLength);
            if(doNormalize) {
                n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
            } else {
                n2->append(firstString, secondString, *pErrorCode);
            }
        } else {
            // Avoid duplicate argument checking and support NUL-terminated src.
            UnicodeString safeMiddle;
            {
                ReorderingBuffer buffer(n2wi->impl, firstString);
                if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
                    const UChar *secondLimit= secondLength>=0 ? second+secondLength : NULL;
                    n2wi->normalizeAndAppend(second, secondLimit,
                                             doNormalize, safeMiddle, buffer, *pErrorCode);
                }
            }  // The ReorderingBuffer destructor finalizes firstString.
            const UBool mustRestore=
                U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity;
            // Restore the modified suffix of the first string.
            // This does not restore first[] array contents between firstLength and firstCapacity.
            // (That might be uninitialized memory, as far as we know.)
            if(mustRestore && first!=NULL) {  /* don't dereference NULL */
                safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
                if(firstLength<firstCapacity) {
                    first[firstLength]=0;  // NUL-terminate in case it was originally.
                }
            }
        }
    }
    return firstString.extract(first, firstCapacity, *pErrorCode);
}
369
370
// C API: forwards to the static normalizeSecondAndAppend() helper with
// doNormalize=TRUE.
U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
                                UChar *first, int32_t firstLength, int32_t firstCapacity,
                                const UChar *second, int32_t secondLength,
                                UErrorCode *pErrorCode) {
    const UBool doNormalize=TRUE;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
380
381
// C API: forwards to the static normalizeSecondAndAppend() helper with
// doNormalize=FALSE.
U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
              UChar *first, int32_t firstLength, int32_t firstCapacity,
              const UChar *second, int32_t secondLength,
              UErrorCode *pErrorCode) {
    const UBool doNormalize=FALSE;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
391
392
// C API: writes the decomposition of c, as reported by
// Normalizer2::getDecomposition(), into decomposition[0..capacity).
//
// Returns 0 if *pErrorCode already indicates a failure or the buffer
// arguments are invalid (U_ILLEGAL_ARGUMENT_ERROR is set), -1 if
// getDecomposition() returns FALSE, and otherwise the result of extracting
// the string into the buffer.
U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 *norm2,
                        UChar32 c, UChar *decomposition, int32_t capacity,
                        UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A NULL buffer requires capacity 0; otherwise capacity must be >=0.
    const UBool badBuffer= decomposition==NULL ? capacity!=0 : capacity<0;
    if(badBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *n2=reinterpret_cast<const Normalizer2 *>(norm2);
    if(!n2->getDecomposition(c, destString)) {
        return -1;
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
410
411
// C API: writes the raw decomposition of c, as reported by
// Normalizer2::getRawDecomposition(), into decomposition[0..capacity).
//
// Returns 0 if *pErrorCode already indicates a failure or the buffer
// arguments are invalid (U_ILLEGAL_ARGUMENT_ERROR is set), -1 if
// getRawDecomposition() returns FALSE, and otherwise the result of
// extracting the string into the buffer.
U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 *norm2,
                           UChar32 c, UChar *decomposition, int32_t capacity,
                           UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A NULL buffer requires capacity 0; otherwise capacity must be >=0.
    const UBool badBuffer= decomposition==NULL ? capacity!=0 : capacity<0;
    if(badBuffer) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *n2=reinterpret_cast<const Normalizer2 *>(norm2);
    if(!n2->getRawDecomposition(c, destString)) {
        return -1;
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
429
430
// C API: forwards to Normalizer2::composePair() on the object behind norm2.
U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    const Normalizer2 *n2=reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->composePair(a, b);
}
434
435
// C API: forwards to Normalizer2::getCombiningClass() on the object behind
// norm2.
U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=reinterpret_cast<const Normalizer2 *>(norm2);
    return n2->getCombiningClass(c);
}
439
440
// C API: forwards s[0..length) to Normalizer2::isNormalized().
//
// length may be -1 for a NUL-terminated string. Returns 0 if *pErrorCode
// already indicates a failure, or if s is NULL with a non-zero length or
// length<-1 (U_ILLEGAL_ARGUMENT_ERROR is then set).
U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
                    const UChar *s, int32_t length,
                    UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(length<-1 || (s==NULL && length!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    UnicodeString sString(length<0, s, length);
    return n2->isNormalized(sString, *pErrorCode);
}
454
455
// C API: forwards s[0..length) to Normalizer2::quickCheck().
//
// length may be -1 for a NUL-terminated string. Returns UNORM_NO if
// *pErrorCode already indicates a failure, or if s is NULL with a non-zero
// length or length<-1 (U_ILLEGAL_ARGUMENT_ERROR is then set).
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
                  const UChar *s, int32_t length,
                  UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return UNORM_NO;
    }
    if(length<-1 || (s==NULL && length!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_NO;
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    UnicodeString sString(length<0, s, length);
    return n2->quickCheck(sString, *pErrorCode);
}
469
470
// C API: forwards s[0..length) to Normalizer2::spanQuickCheckYes().
//
// length may be -1 for a NUL-terminated string. Returns 0 if *pErrorCode
// already indicates a failure, or if s is NULL with a non-zero length or
// length<-1 (U_ILLEGAL_ARGUMENT_ERROR is then set).
U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
                         const UChar *s, int32_t length,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(length<-1 || (s==NULL && length!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    UnicodeString sString(length<0, s, length);
    return n2->spanQuickCheckYes(sString, *pErrorCode);
}
484
485
// C API: forwards to Normalizer2::hasBoundaryBefore() on the object behind
// norm2.
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->hasBoundaryBefore(c);
}
489
490
// C API: forwards to Normalizer2::hasBoundaryAfter() on the object behind
// norm2.
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->hasBoundaryAfter(c);
}
494
495
// C API: forwards to Normalizer2::isInert() on the object behind norm2.
U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->isInert(c);
}
499
500
// Some properties APIs ---------------------------------------------------- ***
501
502
// Returns the combining class of c as reported by the NFD normalizer
// instance, or 0 if that instance could not be obtained.
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    return nfd->getCombiningClass(c);
}
512
513
// Returns Normalizer2Impl::getFCD16(c) from the NFC implementation,
// or 0 if that implementation could not be obtained.
U_CFUNC uint16_t
unorm_getFCD16(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    return nfcImpl->getFCD16(c);
}
523
524
#endif  // !UCONFIG_NO_NORMALIZATION