Coverage Report

Created: 2025-07-11 06:23

/src/icu/source/common/normalizer2impl.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2009-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  normalizer2impl.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2009nov22
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
21
#if !UCONFIG_NO_NORMALIZATION
22
23
#include "unicode/normalizer2.h"
24
#include "unicode/udata.h"
25
#include "unicode/ustring.h"
26
#include "unicode/utf16.h"
27
#include "cmemory.h"
28
#include "mutex.h"
29
#include "normalizer2impl.h"
30
#include "putilimp.h"
31
#include "uassert.h"
32
#include "uset_imp.h"
33
#include "utrie2.h"
34
#include "uvector.h"
35
36
U_NAMESPACE_BEGIN
37
38
// ReorderingBuffer -------------------------------------------------------- ***
39
40
6.52M
UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
41
6.52M
    int32_t length=str.length();
42
6.52M
    start=str.getBuffer(destCapacity);
43
6.52M
    if(start==NULL) {
44
        // getBuffer() already did str.setToBogus()
45
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
46
0
        return FALSE;
47
0
    }
48
6.52M
    limit=start+length;
49
6.52M
    remainingCapacity=str.getCapacity()-length;
50
6.52M
    reorderStart=start;
51
6.52M
    if(start==limit) {
52
5.97M
        lastCC=0;
53
5.97M
    } else {
54
559k
        setIterator();
55
559k
        lastCC=previousCC();
56
        // Set reorderStart after the last code point with cc<=1 if there is one.
57
559k
        if(lastCC>1) {
58
0
            while(previousCC()>1) {}
59
0
        }
60
559k
        reorderStart=codePointLimit;
61
559k
    }
62
6.52M
    return TRUE;
63
6.52M
}
64
65
4.81k
UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
66
4.81k
    int32_t length=(int32_t)(limit-start);
67
4.81k
    return
68
4.81k
        length==(int32_t)(otherLimit-otherStart) &&
69
4.81k
        0==u_memcmp(start, otherStart, length);
70
4.81k
}
71
72
15.6k
UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
73
15.6k
    if(remainingCapacity<2 && !resize(2, errorCode)) {
74
0
        return FALSE;
75
0
    }
76
15.6k
    if(lastCC<=cc || cc==0) {
77
14.6k
        limit[0]=U16_LEAD(c);
78
14.6k
        limit[1]=U16_TRAIL(c);
79
14.6k
        limit+=2;
80
14.6k
        lastCC=cc;
81
14.6k
        if(cc<=1) {
82
5.15k
            reorderStart=limit;
83
5.15k
        }
84
14.6k
    } else {
85
937
        insert(c, cc);
86
937
    }
87
15.6k
    remainingCapacity-=2;
88
15.6k
    return TRUE;
89
15.6k
}
90
91
UBool ReorderingBuffer::append(const UChar *s, int32_t length,
92
                               uint8_t leadCC, uint8_t trailCC,
93
1.83M
                               UErrorCode &errorCode) {
94
1.83M
    if(length==0) {
95
380k
        return TRUE;
96
380k
    }
97
1.45M
    if(remainingCapacity<length && !resize(length, errorCode)) {
98
0
        return FALSE;
99
0
    }
100
1.45M
    remainingCapacity-=length;
101
1.45M
    if(lastCC<=leadCC || leadCC==0) {
102
1.44M
        if(trailCC<=1) {
103
670k
            reorderStart=limit+length;
104
774k
        } else if(leadCC<=1) {
105
772k
            reorderStart=limit+1;  // Ok if not a code point boundary.
106
772k
        }
107
1.44M
        const UChar *sLimit=s+length;
108
3.34M
        do { *limit++=*s++; } while(s!=sLimit);
109
1.44M
        lastCC=trailCC;
110
1.44M
    } else {
111
8.53k
        int32_t i=0;
112
8.53k
        UChar32 c;
113
8.53k
        U16_NEXT(s, i, length, c);
114
8.53k
        insert(c, leadCC);  // insert first code point
115
16.6k
        while(i<length) {
116
8.10k
            U16_NEXT(s, i, length, c);
117
8.10k
            if(i<length) {
118
                // s must be in NFD, otherwise we need to use getCC().
119
0
                leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
120
8.10k
            } else {
121
8.10k
                leadCC=trailCC;
122
8.10k
            }
123
8.10k
            append(c, leadCC, errorCode);
124
8.10k
        }
125
8.53k
    }
126
1.45M
    return TRUE;
127
1.45M
}
128
129
0
UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
130
0
    int32_t cpLength=U16_LENGTH(c);
131
0
    if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
132
0
        return FALSE;
133
0
    }
134
0
    remainingCapacity-=cpLength;
135
0
    if(cpLength==1) {
136
0
        *limit++=(UChar)c;
137
0
    } else {
138
0
        limit[0]=U16_LEAD(c);
139
0
        limit[1]=U16_TRAIL(c);
140
0
        limit+=2;
141
0
    }
142
0
    lastCC=0;
143
0
    reorderStart=limit;
144
0
    return TRUE;
145
0
}
146
147
3.56M
UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
148
3.56M
    if(s==sLimit) {
149
0
        return TRUE;
150
0
    }
151
3.56M
    int32_t length=(int32_t)(sLimit-s);
152
3.56M
    if(remainingCapacity<length && !resize(length, errorCode)) {
153
0
        return FALSE;
154
0
    }
155
3.56M
    u_memcpy(limit, s, length);
156
3.56M
    limit+=length;
157
3.56M
    remainingCapacity-=length;
158
3.56M
    lastCC=0;
159
3.56M
    reorderStart=limit;
160
3.56M
    return TRUE;
161
3.56M
}
162
163
3.70k
void ReorderingBuffer::remove() {
164
3.70k
    reorderStart=limit=start;
165
3.70k
    remainingCapacity=str.getCapacity();
166
3.70k
    lastCC=0;
167
3.70k
}
168
169
1.34M
void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
170
1.34M
    if(suffixLength<(limit-start)) {
171
83.8k
        limit-=suffixLength;
172
83.8k
        remainingCapacity+=suffixLength;
173
1.26M
    } else {
174
1.26M
        limit=start;
175
1.26M
        remainingCapacity=str.getCapacity();
176
1.26M
    }
177
1.34M
    lastCC=0;
178
1.34M
    reorderStart=limit;
179
1.34M
}
180
181
3.15k
UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
182
3.15k
    int32_t reorderStartIndex=(int32_t)(reorderStart-start);
183
3.15k
    int32_t length=(int32_t)(limit-start);
184
3.15k
    str.releaseBuffer(length);
185
3.15k
    int32_t newCapacity=length+appendLength;
186
3.15k
    int32_t doubleCapacity=2*str.getCapacity();
187
3.15k
    if(newCapacity<doubleCapacity) {
188
3.15k
        newCapacity=doubleCapacity;
189
3.15k
    }
190
3.15k
    if(newCapacity<256) {
191
881
        newCapacity=256;
192
881
    }
193
3.15k
    start=str.getBuffer(newCapacity);
194
3.15k
    if(start==NULL) {
195
        // getBuffer() already did str.setToBogus()
196
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
197
0
        return FALSE;
198
0
    }
199
3.15k
    reorderStart=start+reorderStartIndex;
200
3.15k
    limit=start+length;
201
3.15k
    remainingCapacity=str.getCapacity()-length;
202
3.15k
    return TRUE;
203
3.15k
}
204
205
35.5k
void ReorderingBuffer::skipPrevious() {
206
35.5k
    codePointLimit=codePointStart;
207
35.5k
    UChar c=*--codePointStart;
208
35.5k
    if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
209
2.03k
        --codePointStart;
210
2.03k
    }
211
35.5k
}
212
213
612k
uint8_t ReorderingBuffer::previousCC() {
214
612k
    codePointLimit=codePointStart;
215
612k
    if(reorderStart>=codePointStart) {
216
17.4k
        return 0;
217
17.4k
    }
218
594k
    UChar32 c=*--codePointStart;
219
594k
    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
220
559k
        return 0;
221
559k
    }
222
223
34.7k
    UChar c2;
224
34.7k
    if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
225
2.97k
        --codePointStart;
226
2.97k
        c=U16_GET_SUPPLEMENTARY(c2, c);
227
2.97k
    }
228
34.7k
    return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
229
594k
}
230
231
// Inserts c somewhere before the last character.
232
// Requires 0<cc<lastCC which implies reorderStart<limit.
233
35.5k
void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
234
52.2k
    for(setIterator(), skipPrevious(); previousCC()>cc;) {}
235
    // insert c at codePointLimit, after the character with prevCC<=cc
236
35.5k
    UChar *q=limit;
237
35.5k
    UChar *r=limit+=U16_LENGTH(c);
238
56.3k
    do {
239
56.3k
        *--r=*--q;
240
56.3k
    } while(codePointLimit!=q);
241
35.5k
    writeCodePoint(q, c);
242
35.5k
    if(cc<=1) {
243
1.53k
        reorderStart=r;
244
1.53k
    }
245
35.5k
}
246
247
// Normalizer2Impl --------------------------------------------------------- ***
248
249
struct CanonIterData : public UMemory {
250
    CanonIterData(UErrorCode &errorCode);
251
    ~CanonIterData();
252
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
253
    UTrie2 *trie;
254
    UVector canonStartSets;  // contains UnicodeSet *
255
};
256
257
0
Normalizer2Impl::~Normalizer2Impl() {
258
0
    delete fCanonIterData;
259
0
}
260
261
void
262
Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
263
2
                      const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
264
2
    minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
265
2
    minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
266
267
2
    minYesNo=inIndexes[IX_MIN_YES_NO];
268
2
    minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
269
2
    minNoNo=inIndexes[IX_MIN_NO_NO];
270
2
    limitNoNo=inIndexes[IX_LIMIT_NO_NO];
271
2
    minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
272
273
2
    normTrie=inTrie;
274
275
2
    maybeYesCompositions=inExtraData;
276
2
    extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
277
278
2
    smallFCD=inSmallFCD;
279
280
    // Build tccc180[].
281
    // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
282
2
    uint8_t bits=0;
283
26
    for(UChar c=0; c<0x180; bits>>=1) {
284
24
        if((c&0xff)==0) {
285
4
            bits=smallFCD[c>>8];  // one byte per 0x100 code points
286
4
        }
287
24
        if(bits&1) {
288
396
            for(int i=0; i<0x20; ++i, ++c) {
289
384
                tccc180[c]=(uint8_t)getFCD16FromNormData(c);
290
384
            }
291
12
        } else {
292
12
            uprv_memset(tccc180+c, 0, 0x20);
293
12
            c+=0x20;
294
12
        }
295
24
    }
296
2
}
297
298
0
uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
299
0
    UChar32 c;
300
0
    if(cpStart==(cpLimit-1)) {
301
0
        c=*cpStart;
302
0
    } else {
303
0
        c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
304
0
    }
305
0
    uint16_t prevNorm16=getNorm16(c);
306
0
    if(prevNorm16<=minYesNo) {
307
0
        return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
308
0
    } else {
309
0
        return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
310
0
    }
311
0
}
312
313
namespace {
314
315
class LcccContext {
316
public:
317
0
    LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
318
319
0
    void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
320
0
        if(impl.isAlgorithmicNoNo(norm16)) {
321
            // Range of code points with same-norm16-value algorithmic decompositions.
322
            // They might have different non-zero FCD16 values.
323
0
            do {
324
0
                uint16_t fcd16=impl.getFCD16(start);
325
0
                if(fcd16>0xff) { set.add(start); }
326
0
            } while(++start<=end);
327
0
        } else {
328
0
            uint16_t fcd16=impl.getFCD16(start);
329
0
            if(fcd16>0xff) { set.add(start, end); }
330
0
        }
331
0
    }
332
333
private:
334
    const Normalizer2Impl &impl;
335
    UnicodeSet &set;
336
};
337
338
struct PropertyStartsContext {
339
    PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
340
0
            : impl(ni), sa(adder) {}
341
342
    const Normalizer2Impl &impl;
343
    const USetAdder *sa;
344
};
345
346
}  // namespace
347
348
U_CDECL_BEGIN
349
350
static UBool U_CALLCONV
351
0
enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
352
0
    ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
353
0
    return TRUE;
354
0
}
355
356
static UBool U_CALLCONV
357
0
enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
358
    /* add the start code point to the USet */
359
0
    const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
360
0
    const USetAdder *sa=ctx->sa;
361
0
    sa->add(sa->set, start);
362
0
    if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
363
        // Range of code points with same-norm16-value algorithmic decompositions.
364
        // They might have different non-zero FCD16 values.
365
0
        uint16_t prevFCD16=ctx->impl.getFCD16(start);
366
0
        while(++start<=end) {
367
0
            uint16_t fcd16=ctx->impl.getFCD16(start);
368
0
            if(fcd16!=prevFCD16) {
369
0
                sa->add(sa->set, start);
370
0
                prevFCD16=fcd16;
371
0
            }
372
0
        }
373
0
    }
374
0
    return TRUE;
375
0
}
376
377
static UBool U_CALLCONV
378
0
enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
379
    /* add the start code point to the USet */
380
0
    const USetAdder *sa=(const USetAdder *)context;
381
0
    sa->add(sa->set, start);
382
0
    return TRUE;
383
0
}
384
385
static uint32_t U_CALLCONV
386
0
segmentStarterMapper(const void * /*context*/, uint32_t value) {
387
0
    return value&CANON_NOT_SEGMENT_STARTER;
388
0
}
389
390
U_CDECL_END
391
392
void
393
0
Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
394
    /* add the start code point of each same-value range of each trie */
395
0
    LcccContext context(*this, set);
396
0
    utrie2_enum(normTrie, NULL, enumLcccRange, &context);
397
0
}
398
399
void
400
0
Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
401
    /* add the start code point of each same-value range of each trie */
402
0
    PropertyStartsContext context(*this, sa);
403
0
    utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
404
405
    /* add Hangul LV syllables and LV+1 because of skippables */
406
0
    for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
407
0
        sa->add(sa->set, c);
408
0
        sa->add(sa->set, c+1);
409
0
    }
410
0
    sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
411
0
}
412
413
void
414
0
Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
415
    /* add the start code point of each same-value range of the canonical iterator data trie */
416
0
    if(ensureCanonIterData(errorCode)) {
417
        // currently only used for the SEGMENT_STARTER property
418
0
        utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
419
0
    }
420
0
}
421
422
const UChar *
423
Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
424
                                                UChar32 minNeedDataCP,
425
                                                ReorderingBuffer *buffer,
426
0
                                                UErrorCode &errorCode) const {
427
    // Make some effort to support NUL-terminated strings reasonably.
428
    // Take the part of the fast quick check loop that does not look up
429
    // data and check the first part of the string.
430
    // After this prefix, determine the string length to simplify the rest
431
    // of the code.
432
0
    const UChar *prevSrc=src;
433
0
    UChar c;
434
0
    while((c=*src++)<minNeedDataCP && c!=0) {}
435
    // Back out the last character for full processing.
436
    // Copy this prefix.
437
0
    if(--src!=prevSrc) {
438
0
        if(buffer!=NULL) {
439
0
            buffer->appendZeroCC(prevSrc, src, errorCode);
440
0
        }
441
0
    }
442
0
    return src;
443
0
}
444
445
UnicodeString &
446
Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
447
0
                           UErrorCode &errorCode) const {
448
0
    if(U_FAILURE(errorCode)) {
449
0
        dest.setToBogus();
450
0
        return dest;
451
0
    }
452
0
    const UChar *sArray=src.getBuffer();
453
0
    if(&dest==&src || sArray==NULL) {
454
0
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
455
0
        dest.setToBogus();
456
0
        return dest;
457
0
    }
458
0
    decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
459
0
    return dest;
460
0
}
461
462
void
463
Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
464
                           UnicodeString &dest,
465
                           int32_t destLengthEstimate,
466
0
                           UErrorCode &errorCode) const {
467
0
    if(destLengthEstimate<0 && limit!=NULL) {
468
0
        destLengthEstimate=(int32_t)(limit-src);
469
0
    }
470
0
    dest.remove();
471
0
    ReorderingBuffer buffer(*this, dest);
472
0
    if(buffer.init(destLengthEstimate, errorCode)) {
473
0
        decompose(src, limit, &buffer, errorCode);
474
0
    }
475
0
}
476
477
// Dual functionality:
478
// buffer!=NULL: normalize
479
// buffer==NULL: isNormalized/spanQuickCheckYes
480
const UChar *
481
Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
482
                           ReorderingBuffer *buffer,
483
0
                           UErrorCode &errorCode) const {
484
0
    UChar32 minNoCP=minDecompNoCP;
485
0
    if(limit==NULL) {
486
0
        src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
487
0
        if(U_FAILURE(errorCode)) {
488
0
            return src;
489
0
        }
490
0
        limit=u_strchr(src, 0);
491
0
    }
492
493
0
    const UChar *prevSrc;
494
0
    UChar32 c=0;
495
0
    uint16_t norm16=0;
496
497
    // only for quick check
498
0
    const UChar *prevBoundary=src;
499
0
    uint8_t prevCC=0;
500
501
0
    for(;;) {
502
        // count code units below the minimum or with irrelevant data for the quick check
503
0
        for(prevSrc=src; src!=limit;) {
504
0
            if( (c=*src)<minNoCP ||
505
0
                isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
506
0
            ) {
507
0
                ++src;
508
0
            } else if(!U16_IS_SURROGATE(c)) {
509
0
                break;
510
0
            } else {
511
0
                UChar c2;
512
0
                if(U16_IS_SURROGATE_LEAD(c)) {
513
0
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
514
0
                        c=U16_GET_SUPPLEMENTARY(c, c2);
515
0
                    }
516
0
                } else /* trail surrogate */ {
517
0
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
518
0
                        --src;
519
0
                        c=U16_GET_SUPPLEMENTARY(c2, c);
520
0
                    }
521
0
                }
522
0
                if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
523
0
                    src+=U16_LENGTH(c);
524
0
                } else {
525
0
                    break;
526
0
                }
527
0
            }
528
0
        }
529
        // copy these code units all at once
530
0
        if(src!=prevSrc) {
531
0
            if(buffer!=NULL) {
532
0
                if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
533
0
                    break;
534
0
                }
535
0
            } else {
536
0
                prevCC=0;
537
0
                prevBoundary=src;
538
0
            }
539
0
        }
540
0
        if(src==limit) {
541
0
            break;
542
0
        }
543
544
        // Check one above-minimum, relevant code point.
545
0
        src+=U16_LENGTH(c);
546
0
        if(buffer!=NULL) {
547
0
            if(!decompose(c, norm16, *buffer, errorCode)) {
548
0
                break;
549
0
            }
550
0
        } else {
551
0
            if(isDecompYes(norm16)) {
552
0
                uint8_t cc=getCCFromYesOrMaybe(norm16);
553
0
                if(prevCC<=cc || cc==0) {
554
0
                    prevCC=cc;
555
0
                    if(cc<=1) {
556
0
                        prevBoundary=src;
557
0
                    }
558
0
                    continue;
559
0
                }
560
0
            }
561
0
            return prevBoundary;  // "no" or cc out of order
562
0
        }
563
0
    }
564
0
    return src;
565
0
}
566
567
// Decompose a short piece of text which is likely to contain characters that
568
// fail the quick check loop and/or where the quick check loop's overhead
569
// is unlikely to be amortized.
570
// Called by the compose() and makeFCD() implementations.
571
UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
572
                                      ReorderingBuffer &buffer,
573
3.69M
                                      UErrorCode &errorCode) const {
574
7.87M
    while(src<limit) {
575
4.17M
        UChar32 c;
576
4.17M
        uint16_t norm16;
577
4.17M
        UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
578
4.17M
        if(!decompose(c, norm16, buffer, errorCode)) {
579
0
            return FALSE;
580
0
        }
581
4.17M
    }
582
3.69M
    return TRUE;
583
3.69M
}
584
585
UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
586
                                 ReorderingBuffer &buffer,
587
4.17M
                                 UErrorCode &errorCode) const {
588
    // Only loops for 1:1 algorithmic mappings.
589
6.39M
    for(;;) {
590
        // get the decomposition and the lead and trail cc's
591
6.39M
        if(isDecompYes(norm16)) {
592
            // c does not decompose
593
2.33M
            return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
594
4.05M
        } else if(isHangul(norm16)) {
595
            // Hangul syllable: decompose algorithmically
596
463
            UChar jamos[3];
597
463
            return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
598
4.05M
        } else if(isDecompNoAlgorithmic(norm16)) {
599
2.22M
            c=mapAlgorithmic(c, norm16);
600
2.22M
            norm16=getNorm16(c);
601
2.22M
        } else {
602
            // c decomposes, get everything from the variable-length extra data
603
1.83M
            const uint16_t *mapping=getMapping(norm16);
604
1.83M
            uint16_t firstUnit=*mapping;
605
1.83M
            int32_t length=firstUnit&MAPPING_LENGTH_MASK;
606
1.83M
            uint8_t leadCC, trailCC;
607
1.83M
            trailCC=(uint8_t)(firstUnit>>8);
608
1.83M
            if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
609
10.4k
                leadCC=(uint8_t)(*(mapping-1)>>8);
610
1.82M
            } else {
611
1.82M
                leadCC=0;
612
1.82M
            }
613
1.83M
            return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
614
1.83M
        }
615
6.39M
    }
616
4.17M
}
617
618
const UChar *
619
0
Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
620
0
    const UChar *decomp=NULL;
621
0
    uint16_t norm16;
622
0
    for(;;) {
623
0
        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
624
            // c does not decompose
625
0
            return decomp;
626
0
        } else if(isHangul(norm16)) {
627
            // Hangul syllable: decompose algorithmically
628
0
            length=Hangul::decompose(c, buffer);
629
0
            return buffer;
630
0
        } else if(isDecompNoAlgorithmic(norm16)) {
631
0
            c=mapAlgorithmic(c, norm16);
632
0
            decomp=buffer;
633
0
            length=0;
634
0
            U16_APPEND_UNSAFE(buffer, length, c);
635
0
        } else {
636
            // c decomposes, get everything from the variable-length extra data
637
0
            const uint16_t *mapping=getMapping(norm16);
638
0
            length=*mapping&MAPPING_LENGTH_MASK;
639
0
            return (const UChar *)mapping+1;
640
0
        }
641
0
    }
642
0
}
643
644
// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
645
// so that a raw mapping fits that consists of one unit ("rm0")
646
// plus all but the first two code units of the normal mapping.
647
// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
648
const UChar *
649
0
Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
650
    // We do not loop in this method because an algorithmic mapping itself
651
    // becomes a final result rather than having to be decomposed recursively.
652
0
    uint16_t norm16;
653
0
    if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
654
        // c does not decompose
655
0
        return NULL;
656
0
    } else if(isHangul(norm16)) {
657
        // Hangul syllable: decompose algorithmically
658
0
        Hangul::getRawDecomposition(c, buffer);
659
0
        length=2;
660
0
        return buffer;
661
0
    } else if(isDecompNoAlgorithmic(norm16)) {
662
0
        c=mapAlgorithmic(c, norm16);
663
0
        length=0;
664
0
        U16_APPEND_UNSAFE(buffer, length, c);
665
0
        return buffer;
666
0
    } else {
667
        // c decomposes, get everything from the variable-length extra data
668
0
        const uint16_t *mapping=getMapping(norm16);
669
0
        uint16_t firstUnit=*mapping;
670
0
        int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
671
0
        if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
672
            // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
673
            // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
674
0
            const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
675
0
            uint16_t rm0=*rawMapping;
676
0
            if(rm0<=MAPPING_LENGTH_MASK) {
677
0
                length=rm0;
678
0
                return (const UChar *)rawMapping-rm0;
679
0
            } else {
680
                // Copy the normal mapping and replace its first two code units with rm0.
681
0
                buffer[0]=(UChar)rm0;
682
0
                u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
683
0
                length=mLength-1;
684
0
                return buffer;
685
0
            }
686
0
        } else {
687
0
            length=mLength;
688
0
            return (const UChar *)mapping+1;
689
0
        }
690
0
    }
691
0
}
692
693
void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
694
                                         UBool doDecompose,
695
                                         UnicodeString &safeMiddle,
696
                                         ReorderingBuffer &buffer,
697
0
                                         UErrorCode &errorCode) const {
698
0
    buffer.copyReorderableSuffixTo(safeMiddle);
699
0
    if(doDecompose) {
700
0
        decompose(src, limit, &buffer, errorCode);
701
0
        return;
702
0
    }
703
    // Just merge the strings at the boundary.
704
0
    ForwardUTrie2StringIterator iter(normTrie, src, limit);
705
0
    uint8_t firstCC, prevCC, cc;
706
0
    firstCC=prevCC=cc=getCC(iter.next16());
707
0
    while(cc!=0) {
708
0
        prevCC=cc;
709
0
        cc=getCC(iter.next16());
710
0
    };
711
0
    if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
712
0
        limit=u_strchr(iter.codePointStart, 0);
713
0
    }
714
715
0
    if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
716
0
        buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
717
0
    }
718
0
}
719
720
// Note: hasDecompBoundary() could be implemented as aliases to
721
// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
722
// at the cost of building the FCD trie for a decomposition normalizer.
723
0
UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
724
0
    for(;;) {
725
0
        if(c<minDecompNoCP) {
726
0
            return TRUE;
727
0
        }
728
0
        uint16_t norm16=getNorm16(c);
729
0
        if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
730
0
            return TRUE;
731
0
        } else if(norm16>MIN_NORMAL_MAYBE_YES) {
732
0
            return FALSE;  // ccc!=0
733
0
        } else if(isDecompNoAlgorithmic(norm16)) {
734
0
            c=mapAlgorithmic(c, norm16);
735
0
        } else {
736
            // c decomposes, get everything from the variable-length extra data
737
0
            const uint16_t *mapping=getMapping(norm16);
738
0
            uint16_t firstUnit=*mapping;
739
0
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
740
0
                return FALSE;
741
0
            }
742
0
            if(!before) {
743
                // decomp after-boundary: same as hasFCDBoundaryAfter(),
744
                // fcd16<=1 || trailCC==0
745
0
                if(firstUnit>0x1ff) {
746
0
                    return FALSE;  // trailCC>1
747
0
                }
748
0
                if(firstUnit<=0xff) {
749
0
                    return TRUE;  // trailCC==0
750
0
                }
751
                // if(trailCC==1) test leadCC==0, same as checking for before-boundary
752
0
            }
753
            // TRUE if leadCC==0 (hasFCDBoundaryBefore())
754
0
            return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
755
0
        }
756
0
    }
757
0
}
758
759
/*
760
 * Finds the recomposition result for
761
 * a forward-combining "lead" character,
762
 * specified with a pointer to its compositions list,
763
 * and a backward-combining "trail" character.
764
 *
765
 * If the lead and trail characters combine, then this function returns
766
 * the following "compositeAndFwd" value:
767
 * Bits 21..1  composite character
768
 * Bit      0  set if the composite is a forward-combining starter
769
 * otherwise it returns -1.
770
 *
771
 * The compositions list has (trail, compositeAndFwd) pair entries,
772
 * encoded as either pairs or triples of 16-bit units.
773
 * The last entry has the high bit of its first unit set.
774
 *
775
 * The list is sorted by ascending trail characters (there are no duplicates).
776
 * A linear search is used.
777
 *
778
 * See normalizer2impl.h for a more detailed description
779
 * of the compositions list format.
780
 */
781
778k
int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
782
778k
    uint16_t key1, firstUnit;
783
778k
    if(trail<COMP_1_TRAIL_LIMIT) {
784
        // trail character is 0..33FF
785
        // result entry may have 2 or 3 units
786
775k
        key1=(uint16_t)(trail<<1);
787
4.61M
        while(key1>(firstUnit=*list)) {
788
3.83M
            list+=2+(firstUnit&COMP_1_TRIPLE);
789
3.83M
        }
790
775k
        if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
791
770k
            if(firstUnit&COMP_1_TRIPLE) {
792
0
                return ((int32_t)list[1]<<16)|list[2];
793
770k
            } else {
794
770k
                return list[1];
795
770k
            }
796
770k
        }
797
775k
    } else {
798
        // trail character is 3400..10FFFF
799
        // result entry has 3 units
800
2.29k
        key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
801
2.29k
                        (((trail>>COMP_1_TRAIL_SHIFT))&
802
2.29k
                          ~COMP_1_TRIPLE));
803
2.29k
        uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
804
2.29k
        uint16_t secondUnit;
805
7.05k
        for(;;) {
806
7.05k
            if(key1>(firstUnit=*list)) {
807
3.09k
                list+=2+(firstUnit&COMP_1_TRIPLE);
808
3.96k
            } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
809
3.68k
                if(key2>(secondUnit=list[1])) {
810
2.02k
                    if(firstUnit&COMP_1_LAST_TUPLE) {
811
355
                        break;
812
1.67k
                    } else {
813
1.67k
                        list+=3;
814
1.67k
                    }
815
2.02k
                } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
816
1.21k
                    return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
817
1.21k
                } else {
818
446
                    break;
819
446
                }
820
3.68k
            } else {
821
273
                break;
822
273
            }
823
7.05k
        }
824
2.29k
    }
825
6.25k
    return -1;
826
778k
}
827
828
/**
829
  * @param list some character's compositions list
830
  * @param set recursively receives the composites from these compositions
831
  */
832
0
void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
833
0
    uint16_t firstUnit;
834
0
    int32_t compositeAndFwd;
835
0
    do {
836
0
        firstUnit=*list;
837
0
        if((firstUnit&COMP_1_TRIPLE)==0) {
838
0
            compositeAndFwd=list[1];
839
0
            list+=2;
840
0
        } else {
841
0
            compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
842
0
            list+=3;
843
0
        }
844
0
        UChar32 composite=compositeAndFwd>>1;
845
0
        if((compositeAndFwd&1)!=0) {
846
0
            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
847
0
        }
848
0
        set.add(composite);
849
0
    } while((firstUnit&COMP_1_LAST_TUPLE)==0);
850
0
}
851
852
/*
853
 * Recomposes the buffer text starting at recomposeStartIndex
854
 * (which is in NFD - decomposed and canonically ordered),
855
 * and truncates the buffer contents.
856
 *
857
 * Note that recomposition never lengthens the text:
858
 * Any character consists of either one or two code units;
859
 * a composition may contain at most one more code unit than the original starter,
860
 * while the combining mark that is removed has at least one code unit.
861
 */
862
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
863
3.69M
                                UBool onlyContiguous) const {
864
3.69M
    UChar *p=buffer.getStart()+recomposeStartIndex;
865
3.69M
    UChar *limit=buffer.getLimit();
866
3.69M
    if(p==limit) {
867
362k
        return;
868
362k
    }
869
870
3.33M
    UChar *starter, *pRemove, *q, *r;
871
3.33M
    const uint16_t *compositionsList;
872
3.33M
    UChar32 c, compositeAndFwd;
873
3.33M
    uint16_t norm16;
874
3.33M
    uint8_t cc, prevCC;
875
3.33M
    UBool starterIsSupplementary;
876
877
    // Some of the following variables are not used until we have a forward-combining starter
878
    // and are only initialized now to avoid compiler warnings.
879
3.33M
    compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
880
3.33M
    starter=NULL;
881
3.33M
    starterIsSupplementary=FALSE;
882
3.33M
    prevCC=0;
883
884
5.68M
    for(;;) {
885
5.68M
        UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
886
5.68M
        cc=getCCFromYesOrMaybe(norm16);
887
5.68M
        if( // this character combines backward and
888
5.68M
            isMaybe(norm16) &&
889
            // we have seen a starter that combines forward and
890
5.68M
            compositionsList!=NULL &&
891
            // the backward-combining character is not blocked
892
5.68M
            (prevCC<cc || prevCC==0)
893
5.68M
        ) {
894
782k
            if(isJamoVT(norm16)) {
895
                // c is a Jamo V/T, see if we can compose it with the previous character.
896
4.09k
                if(c<Hangul::JAMO_T_BASE) {
897
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
898
3.88k
                    UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
899
3.88k
                    if(prev<Hangul::JAMO_L_COUNT) {
900
3.47k
                        pRemove=p-1;
901
3.47k
                        UChar syllable=(UChar)
902
3.47k
                            (Hangul::HANGUL_BASE+
903
3.47k
                             (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
904
3.47k
                             Hangul::JAMO_T_COUNT);
905
3.47k
                        UChar t;
906
3.47k
                        if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
907
821
                            ++p;
908
821
                            syllable+=t;  // The next character was a Jamo T.
909
821
                        }
910
3.47k
                        *starter=syllable;
911
                        // remove the Jamo V/T
912
3.47k
                        q=pRemove;
913
3.47k
                        r=p;
914
5.72k
                        while(r<limit) {
915
2.25k
                            *q++=*r++;
916
2.25k
                        }
917
3.47k
                        limit=q;
918
3.47k
                        p=pRemove;
919
3.47k
                    }
920
3.88k
                }
921
                /*
922
                 * No "else" for Jamo T:
923
                 * Since the input is in NFD, there are no Hangul LV syllables that
924
                 * a Jamo T could combine with.
925
                 * All Jamo Ts are combined above when handling Jamo Vs.
926
                 */
927
4.09k
                if(p==limit) {
928
2.85k
                    break;
929
2.85k
                }
930
1.23k
                compositionsList=NULL;
931
1.23k
                continue;
932
778k
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
933
                // The starter and the combining mark (c) do combine.
934
771k
                UChar32 composite=compositeAndFwd>>1;
935
936
                // Replace the starter with the composite, remove the combining mark.
937
771k
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
938
771k
                if(starterIsSupplementary) {
939
1.21k
                    if(U_IS_SUPPLEMENTARY(composite)) {
940
                        // both are supplementary
941
1.21k
                        starter[0]=U16_LEAD(composite);
942
1.21k
                        starter[1]=U16_TRAIL(composite);
943
1.21k
                    } else {
944
0
                        *starter=(UChar)composite;
945
                        // The composite is shorter than the starter,
946
                        // move the intermediate characters forward one.
947
0
                        starterIsSupplementary=FALSE;
948
0
                        q=starter+1;
949
0
                        r=q+1;
950
0
                        while(r<pRemove) {
951
0
                            *q++=*r++;
952
0
                        }
953
0
                        --pRemove;
954
0
                    }
955
770k
                } else if(U_IS_SUPPLEMENTARY(composite)) {
956
                    // The composite is longer than the starter,
957
                    // move the intermediate characters back one.
958
0
                    starterIsSupplementary=TRUE;
959
0
                    ++starter;  // temporarily increment for the loop boundary
960
0
                    q=pRemove;
961
0
                    r=++pRemove;
962
0
                    while(starter<q) {
963
0
                        *--r=*--q;
964
0
                    }
965
0
                    *starter=U16_TRAIL(composite);
966
0
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
967
770k
                } else {
968
                    // both are on the BMP
969
770k
                    *starter=(UChar)composite;
970
770k
                }
971
972
                /* remove the combining mark by moving the following text over it */
973
771k
                if(pRemove<p) {
974
771k
                    q=pRemove;
975
771k
                    r=p;
976
789k
                    while(r<limit) {
977
17.6k
                        *q++=*r++;
978
17.6k
                    }
979
771k
                    limit=q;
980
771k
                    p=pRemove;
981
771k
                }
982
                // Keep prevCC because we removed the combining mark.
983
984
771k
                if(p==limit) {
985
765k
                    break;
986
765k
                }
987
                // Is the composite a starter that combines forward?
988
6.41k
                if(compositeAndFwd&1) {
989
4.45k
                    compositionsList=
990
4.45k
                        getCompositionsListForComposite(getNorm16(composite));
991
4.45k
                } else {
992
1.96k
                    compositionsList=NULL;
993
1.96k
                }
994
995
                // We combined; continue with looking for compositions.
996
6.41k
                continue;
997
771k
            }
998
782k
        }
999
1000
        // no combination this time
1001
4.91M
        prevCC=cc;
1002
4.91M
        if(p==limit) {
1003
2.56M
            break;
1004
2.56M
        }
1005
1006
        // If c did not combine, then check if it is a starter.
1007
2.34M
        if(cc==0) {
1008
            // Found a new starter.
1009
2.29M
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
1010
                // It may combine with something, prepare for it.
1011
1.06M
                if(U_IS_BMP(c)) {
1012
1.06M
                    starterIsSupplementary=FALSE;
1013
1.06M
                    starter=p-1;
1014
1.06M
                } else {
1015
2.08k
                    starterIsSupplementary=TRUE;
1016
2.08k
                    starter=p-2;
1017
2.08k
                }
1018
1.06M
            }
1019
2.29M
        } else if(onlyContiguous) {
1020
            // FCC: no discontiguous compositions; any intervening character blocks.
1021
0
            compositionsList=NULL;
1022
0
        }
1023
2.34M
    }
1024
3.33M
    buffer.setReorderingLimit(limit);
1025
3.33M
}
1026
1027
UChar32
1028
0
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
1029
0
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
1030
0
    const uint16_t *list;
1031
0
    if(isInert(norm16)) {
1032
0
        return U_SENTINEL;
1033
0
    } else if(norm16<minYesNoMappingsOnly) {
1034
0
        if(isJamoL(norm16)) {
1035
0
            b-=Hangul::JAMO_V_BASE;
1036
0
            if(0<=b && b<Hangul::JAMO_V_COUNT) {
1037
0
                return
1038
0
                    (Hangul::HANGUL_BASE+
1039
0
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
1040
0
                     Hangul::JAMO_T_COUNT);
1041
0
            } else {
1042
0
                return U_SENTINEL;
1043
0
            }
1044
0
        } else if(isHangul(norm16)) {
1045
0
            b-=Hangul::JAMO_T_BASE;
1046
0
            if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
1047
0
                return a+b;
1048
0
            } else {
1049
0
                return U_SENTINEL;
1050
0
            }
1051
0
        } else {
1052
            // 'a' has a compositions list in extraData
1053
0
            list=extraData+norm16;
1054
0
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
1055
0
                list+=  // mapping pointer
1056
0
                    1+  // +1 to skip the first unit with the mapping lenth
1057
0
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
1058
0
            }
1059
0
        }
1060
0
    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1061
0
        return U_SENTINEL;
1062
0
    } else {
1063
0
        list=maybeYesCompositions+norm16-minMaybeYes;
1064
0
    }
1065
0
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
1066
0
        return U_SENTINEL;
1067
0
    }
1068
0
#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1069
0
    return combine(list, b)>>1;
1070
#else
1071
    int32_t compositeAndFwd=combine(list, b);
1072
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1073
#endif
1074
0
}
1075
1076
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1077
// doCompose: normalize
1078
// !doCompose: isNormalized (buffer must be empty and initialized)
1079
UBool
1080
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
1081
                         UBool onlyContiguous,
1082
                         UBool doCompose,
1083
                         ReorderingBuffer &buffer,
1084
7.07M
                         UErrorCode &errorCode) const {
1085
    /*
1086
     * prevBoundary points to the last character before the current one
1087
     * that has a composition boundary before it with ccc==0 and quick check "yes".
1088
     * Keeping track of prevBoundary saves us looking for a composition boundary
1089
     * when we find a "no" or "maybe".
1090
     *
1091
     * When we back out from prevSrc back to prevBoundary,
1092
     * then we also remove those same characters (which had been simply copied
1093
     * or canonically-order-inserted) from the ReorderingBuffer.
1094
     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1095
     * must correspond 1:1 to destination units at the end of the destination buffer.
1096
     */
1097
7.07M
    const UChar *prevBoundary=src;
1098
7.07M
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
1099
7.07M
    if(limit==NULL) {
1100
0
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
1101
0
                                           doCompose ? &buffer : NULL,
1102
0
                                           errorCode);
1103
0
        if(U_FAILURE(errorCode)) {
1104
0
            return FALSE;
1105
0
        }
1106
0
        if(prevBoundary<src) {
1107
            // Set prevBoundary to the last character in the prefix.
1108
0
            prevBoundary=src-1;
1109
0
        }
1110
0
        limit=u_strchr(src, 0);
1111
0
    }
1112
1113
7.07M
    const UChar *prevSrc;
1114
7.07M
    UChar32 c=0;
1115
7.07M
    uint16_t norm16=0;
1116
1117
    // only for isNormalized
1118
7.07M
    uint8_t prevCC=0;
1119
1120
11.2M
    for(;;) {
1121
        // count code units below the minimum or with irrelevant data for the quick check
1122
15.2M
        for(prevSrc=src; src!=limit;) {
1123
8.16M
            if( (c=*src)<minNoMaybeCP ||
1124
8.16M
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1125
8.16M
            ) {
1126
4.02M
                ++src;
1127
4.14M
            } else if(!U16_IS_SURROGATE(c)) {
1128
4.12M
                break;
1129
4.12M
            } else {
1130
19.5k
                UChar c2;
1131
19.5k
                if(U16_IS_SURROGATE_LEAD(c)) {
1132
19.5k
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1133
19.5k
                        c=U16_GET_SUPPLEMENTARY(c, c2);
1134
19.5k
                    }
1135
19.5k
                } else /* trail surrogate */ {
1136
0
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1137
0
                        --src;
1138
0
                        c=U16_GET_SUPPLEMENTARY(c2, c);
1139
0
                    }
1140
0
                }
1141
19.5k
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1142
5.91k
                    src+=U16_LENGTH(c);
1143
13.6k
                } else {
1144
13.6k
                    break;
1145
13.6k
                }
1146
19.5k
            }
1147
8.16M
        }
1148
        // copy these code units all at once
1149
11.2M
        if(src!=prevSrc) {
1150
3.57M
            if(doCompose) {
1151
3.56M
                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
1152
0
                    break;
1153
0
                }
1154
3.56M
            } else {
1155
6.56k
                prevCC=0;
1156
6.56k
            }
1157
3.57M
            if(src==limit) {
1158
2.97M
                break;
1159
2.97M
            }
1160
            // Set prevBoundary to the last character in the quick check loop.
1161
596k
            prevBoundary=src-1;
1162
596k
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1163
596k
                U16_IS_LEAD(*(prevBoundary-1))
1164
596k
            ) {
1165
2.46k
                --prevBoundary;
1166
2.46k
            }
1167
            // The start of the current character (c).
1168
596k
            prevSrc=src;
1169
7.63M
        } else if(src==limit) {
1170
4.09M
            break;
1171
4.09M
        }
1172
1173
4.13M
        src+=U16_LENGTH(c);
1174
        /*
1175
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1176
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1177
         * or has ccc!=0.
1178
         * Check for Jamo V/T, then for regular characters.
1179
         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1180
         */
1181
4.13M
        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1182
4.47k
            UChar prev=*(prevSrc-1);
1183
4.47k
            UBool needToDecompose=FALSE;
1184
4.47k
            if(c<Hangul::JAMO_T_BASE) {
1185
                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1186
2.27k
                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
1187
2.27k
                if(prev<Hangul::JAMO_L_COUNT) {
1188
2.02k
                    if(!doCompose) {
1189
216
                        return FALSE;
1190
216
                    }
1191
1.80k
                    UChar syllable=(UChar)
1192
1.80k
                        (Hangul::HANGUL_BASE+
1193
1.80k
                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
1194
1.80k
                         Hangul::JAMO_T_COUNT);
1195
1.80k
                    UChar t;
1196
1.80k
                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
1197
196
                        ++src;
1198
196
                        syllable+=t;  // The next character was a Jamo T.
1199
196
                        prevBoundary=src;
1200
196
                        buffer.setLastChar(syllable);
1201
196
                        continue;
1202
196
                    }
1203
                    // If we see L+V+x where x!=T then we drop to the slow path,
1204
                    // decompose and recompose.
1205
                    // This is to deal with NFKC finding normal L and V but a
1206
                    // compatibility variant of a T. We need to either fully compose that
1207
                    // combination here (which would complicate the code and may not work
1208
                    // with strange custom data) or use the slow path -- or else our replacing
1209
                    // two input characters (L+V) with one output character (LV syllable)
1210
                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
1211
                    // length as what we appended to the buffer since prevBoundary.
1212
1.61k
                    needToDecompose=TRUE;
1213
1.61k
                }
1214
2.27k
            } else if(Hangul::isHangulWithoutJamoT(prev)) {
1215
                // c is a Jamo Trailing consonant,
1216
                // compose with previous Hangul LV that does not contain a Jamo T.
1217
481
                if(!doCompose) {
1218
287
                    return FALSE;
1219
287
                }
1220
194
                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
1221
194
                prevBoundary=src;
1222
194
                continue;
1223
481
            }
1224
3.58k
            if(!needToDecompose) {
1225
                // The Jamo V/T did not compose into a Hangul syllable.
1226
1.96k
                if(doCompose) {
1227
270
                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
1228
0
                        break;
1229
0
                    }
1230
1.69k
                } else {
1231
1.69k
                    prevCC=0;
1232
1.69k
                }
1233
1.96k
                continue;
1234
1.96k
            }
1235
3.58k
        }
1236
        /*
1237
         * Source buffer pointers:
1238
         *
1239
         *  all done      quick check   current char  not yet
1240
         *                "yes" but     (c)           processed
1241
         *                may combine
1242
         *                forward
1243
         * [-------------[-------------[-------------[-------------[
1244
         * |             |             |             |             |
1245
         * orig. src     prevBoundary  prevSrc       src           limit
1246
         *
1247
         *
1248
         * Destination buffer pointers inside the ReorderingBuffer:
1249
         *
1250
         *  all done      might take    not filled yet
1251
         *                characters for
1252
         *                reordering
1253
         * [-------------[-------------[-------------[
1254
         * |             |             |             |
1255
         * start         reorderStart  limit         |
1256
         *                             +remainingCap.+
1257
         */
1258
4.13M
        if(norm16>=MIN_YES_YES_WITH_CC) {
1259
433k
            uint8_t cc=(uint8_t)norm16;  // cc!=0
1260
433k
            if( onlyContiguous &&  // FCC
1261
433k
                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1262
433k
                prevBoundary<prevSrc &&
1263
                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1264
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1265
                // passed the quick check "yes && ccc==0" test.
1266
                // Check whether the last character was a "yesYes" or a "yesNo".
1267
                // If a "yesNo", then we get its trailing ccc from its
1268
                // mapping and check for canonical order.
1269
                // All other cases are ok.
1270
433k
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1271
433k
            ) {
1272
                // Fails FCD test, need to decompose and contiguously recompose.
1273
0
                if(!doCompose) {
1274
0
                    return FALSE;
1275
0
                }
1276
433k
            } else if(doCompose) {
1277
430k
                if(!buffer.append(c, cc, errorCode)) {
1278
0
                    break;
1279
0
                }
1280
430k
                continue;
1281
430k
            } else if(prevCC<=cc) {
1282
2.26k
                prevCC=cc;
1283
2.26k
                continue;
1284
2.26k
            } else {
1285
335
                return FALSE;
1286
335
            }
1287
3.69M
        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1288
695
            return FALSE;
1289
695
        }
1290
1291
        /*
1292
         * Find appropriate boundaries around this character,
1293
         * decompose the source text from between the boundaries,
1294
         * and recompose it.
1295
         *
1296
         * We may need to remove the last few characters from the ReorderingBuffer
1297
         * to account for source text that was copied or appended
1298
         * but needs to take part in the recomposition.
1299
         */
1300
1301
        /*
1302
         * Find the last composition boundary in [prevBoundary..src[.
1303
         * It is either the decomposition of the current character (at prevSrc),
1304
         * or prevBoundary.
1305
         */
1306
3.69M
        if(hasCompBoundaryBefore(c, norm16)) {
1307
2.89M
            prevBoundary=prevSrc;
1308
2.89M
        } else if(doCompose) {
1309
798k
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
1310
798k
        }
1311
1312
        // Find the next composition boundary in [src..limit[ -
1313
        // modifies src to point to the next starter.
1314
3.69M
        src=(UChar *)findNextCompBoundary(src, limit);
1315
1316
        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1317
3.69M
        int32_t recomposeStartIndex=buffer.length();
1318
3.69M
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
1319
0
            break;
1320
0
        }
1321
3.69M
        recompose(buffer, recomposeStartIndex, onlyContiguous);
1322
3.69M
        if(!doCompose) {
1323
4.81k
            if(!buffer.equals(prevBoundary, src)) {
1324
1.11k
                return FALSE;
1325
1.11k
            }
1326
3.70k
            buffer.remove();
1327
3.70k
            prevCC=0;
1328
3.70k
        }
1329
1330
        // Move to the next starter. We never need to look back before this point again.
1331
3.69M
        prevBoundary=src;
1332
3.69M
    }
1333
7.07M
    return TRUE;
1334
7.07M
}
1335
1336
// Very similar to compose(): Make the same changes in both places if relevant.
1337
// pQCResult==NULL: spanQuickCheckYes
1338
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1339
const UChar *
1340
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
1341
                                   UBool onlyContiguous,
1342
0
                                   UNormalizationCheckResult *pQCResult) const {
1343
    /*
1344
     * prevBoundary points to the last character before the current one
1345
     * that has a composition boundary before it with ccc==0 and quick check "yes".
1346
     */
1347
0
    const UChar *prevBoundary=src;
1348
0
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
1349
0
    if(limit==NULL) {
1350
0
        UErrorCode errorCode=U_ZERO_ERROR;
1351
0
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
1352
0
        if(prevBoundary<src) {
1353
            // Set prevBoundary to the last character in the prefix.
1354
0
            prevBoundary=src-1;
1355
0
        }
1356
0
        limit=u_strchr(src, 0);
1357
0
    }
1358
1359
0
    const UChar *prevSrc;
1360
0
    UChar32 c=0;
1361
0
    uint16_t norm16=0;
1362
0
    uint8_t prevCC=0;
1363
1364
0
    for(;;) {
1365
        // count code units below the minimum or with irrelevant data for the quick check
1366
0
        for(prevSrc=src;;) {
1367
0
            if(src==limit) {
1368
0
                return src;
1369
0
            }
1370
0
            if( (c=*src)<minNoMaybeCP ||
1371
0
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
1372
0
            ) {
1373
0
                ++src;
1374
0
            } else if(!U16_IS_SURROGATE(c)) {
1375
0
                break;
1376
0
            } else {
1377
0
                UChar c2;
1378
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1379
0
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1380
0
                        c=U16_GET_SUPPLEMENTARY(c, c2);
1381
0
                    }
1382
0
                } else /* trail surrogate */ {
1383
0
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1384
0
                        --src;
1385
0
                        c=U16_GET_SUPPLEMENTARY(c2, c);
1386
0
                    }
1387
0
                }
1388
0
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1389
0
                    src+=U16_LENGTH(c);
1390
0
                } else {
1391
0
                    break;
1392
0
                }
1393
0
            }
1394
0
        }
1395
0
        if(src!=prevSrc) {
1396
            // Set prevBoundary to the last character in the quick check loop.
1397
0
            prevBoundary=src-1;
1398
0
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
1399
0
                U16_IS_LEAD(*(prevBoundary-1))
1400
0
            ) {
1401
0
                --prevBoundary;
1402
0
            }
1403
0
            prevCC=0;
1404
            // The start of the current character (c).
1405
0
            prevSrc=src;
1406
0
        }
1407
1408
0
        src+=U16_LENGTH(c);
1409
        /*
1410
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1411
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1412
         * or has ccc!=0.
1413
         */
1414
0
        if(isMaybeOrNonZeroCC(norm16)) {
1415
0
            uint8_t cc=getCCFromYesOrMaybe(norm16);
1416
0
            if( onlyContiguous &&  // FCC
1417
0
                cc!=0 &&
1418
0
                prevCC==0 &&
1419
0
                prevBoundary<prevSrc &&
1420
                // prevCC==0 && prevBoundary<prevSrc tell us that
1421
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1422
                // passed the quick check "yes && ccc==0" test.
1423
                // Check whether the last character was a "yesYes" or a "yesNo".
1424
                // If a "yesNo", then we get its trailing ccc from its
1425
                // mapping and check for canonical order.
1426
                // All other cases are ok.
1427
0
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
1428
0
            ) {
1429
                // Fails FCD test.
1430
0
            } else if(prevCC<=cc || cc==0) {
1431
0
                prevCC=cc;
1432
0
                if(norm16<MIN_YES_YES_WITH_CC) {
1433
0
                    if(pQCResult!=NULL) {
1434
0
                        *pQCResult=UNORM_MAYBE;
1435
0
                    } else {
1436
0
                        return prevBoundary;
1437
0
                    }
1438
0
                }
1439
0
                continue;
1440
0
            }
1441
0
        }
1442
0
        if(pQCResult!=NULL) {
1443
0
            *pQCResult=UNORM_NO;
1444
0
        }
1445
0
        return prevBoundary;
1446
0
    }
1447
0
}
1448
1449
void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1450
                                       UBool doCompose,
1451
                                       UBool onlyContiguous,
1452
                                       UnicodeString &safeMiddle,
1453
                                       ReorderingBuffer &buffer,
1454
559k
                                       UErrorCode &errorCode) const {
1455
559k
    if(!buffer.isEmpty()) {
1456
559k
        const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1457
559k
        if(src!=firstStarterInSrc) {
1458
546k
            const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1459
546k
                                                                    buffer.getLimit());
1460
546k
            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1461
546k
            UnicodeString middle(lastStarterInDest, destSuffixLength);
1462
546k
            buffer.removeSuffix(destSuffixLength);
1463
546k
            safeMiddle=middle;
1464
546k
            middle.append(src, (int32_t)(firstStarterInSrc-src));
1465
546k
            const UChar *middleStart=middle.getBuffer();
1466
546k
            compose(middleStart, middleStart+middle.length(), onlyContiguous,
1467
546k
                    TRUE, buffer, errorCode);
1468
546k
            if(U_FAILURE(errorCode)) {
1469
0
                return;
1470
0
            }
1471
546k
            src=firstStarterInSrc;
1472
546k
        }
1473
559k
    }
1474
559k
    if(doCompose) {
1475
559k
        compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1476
559k
    } else {
1477
0
        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1478
0
            limit=u_strchr(src, 0);
1479
0
        }
1480
0
        buffer.appendZeroCC(src, limit, errorCode);
1481
0
    }
1482
559k
}
1483
1484
/**
1485
 * Does c have a composition boundary before it?
1486
 * True if its decomposition begins with a character that has
1487
 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1488
 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1489
 * (isCompYesAndZeroCC()) so we need not decompose.
1490
 */
1491
9.12M
UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1492
11.3M
    for(;;) {
1493
11.3M
        if(isCompYesAndZeroCC(norm16)) {
1494
6.92M
            return TRUE;
1495
6.92M
        } else if(isMaybeOrNonZeroCC(norm16)) {
1496
1.02M
            return FALSE;
1497
3.40M
        } else if(isDecompNoAlgorithmic(norm16)) {
1498
2.22M
            c=mapAlgorithmic(c, norm16);
1499
2.22M
            norm16=getNorm16(c);
1500
2.22M
        } else {
1501
            // c decomposes, get everything from the variable-length extra data
1502
1.18M
            const uint16_t *mapping=getMapping(norm16);
1503
1.18M
            uint16_t firstUnit=*mapping;
1504
1.18M
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1505
393k
                return FALSE;
1506
393k
            }
1507
787k
            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
1508
11.0k
                return FALSE;  // non-zero leadCC
1509
11.0k
            }
1510
776k
            int32_t i=1;  // skip over the firstUnit
1511
776k
            UChar32 c;
1512
776k
            U16_NEXT_UNSAFE(mapping, i, c);
1513
776k
            return isCompYesAndZeroCC(getNorm16(c));
1514
787k
        }
1515
11.3M
    }
1516
9.12M
}
1517
1518
0
UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1519
0
    for(;;) {
1520
0
        uint16_t norm16=getNorm16(c);
1521
0
        if(isInert(norm16)) {
1522
0
            return TRUE;
1523
0
        } else if(norm16<=minYesNo) {
1524
            // Hangul: norm16==minYesNo
1525
            // Hangul LVT has a boundary after it.
1526
            // Hangul LV and non-inert yesYes characters combine forward.
1527
0
            return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1528
0
        } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1529
0
            return FALSE;
1530
0
        } else if(isDecompNoAlgorithmic(norm16)) {
1531
0
            c=mapAlgorithmic(c, norm16);
1532
0
        } else {
1533
            // c decomposes, get everything from the variable-length extra data.
1534
            // If testInert, then c must be a yesNo character which has lccc=0,
1535
            // otherwise it could be a noNo.
1536
0
            const uint16_t *mapping=getMapping(norm16);
1537
0
            uint16_t firstUnit=*mapping;
1538
            // TRUE if
1539
            //   not MAPPING_NO_COMP_BOUNDARY_AFTER
1540
            //     (which is set if
1541
            //       c is not deleted, and
1542
            //       it and its decomposition do not combine forward, and it has a starter)
1543
            //   and if FCC then trailCC<=1
1544
0
            return
1545
0
                (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1546
0
                (!onlyContiguous || firstUnit<=0x1ff);
1547
0
        }
1548
0
    }
1549
0
}
1550
1551
546k
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1552
546k
    BackwardUTrie2StringIterator iter(normTrie, start, p);
1553
546k
    uint16_t norm16;
1554
546k
    do {
1555
546k
        norm16=iter.previous16();
1556
546k
    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1557
    // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1558
    // but that's probably not worth the extra cost.
1559
546k
    return iter.codePointStart;
1560
546k
}
1561
1562
4.25M
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1563
4.25M
    ForwardUTrie2StringIterator iter(normTrie, p, limit);
1564
4.25M
    uint16_t norm16;
1565
4.88M
    do {
1566
4.88M
        norm16=iter.next16();
1567
4.88M
    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1568
4.25M
    return iter.codePointStart;
1569
4.25M
}
1570
1571
// Note: normalizer2impl.cpp r30982 (2011-nov-27)
1572
// still had getFCDTrie() which built and cached an FCD trie.
1573
// That provided faster access to FCD data than getFCD16FromNormData()
1574
// but required synchronization and consumed some 10kB of heap memory
1575
// in any process that uses FCD (e.g., via collation).
1576
// tccc180[] and smallFCD[] are intended to help with any loss of performance,
1577
// at least for Latin & CJK.
1578
1579
// Gets the FCD value from the regular normalization data.
1580
384
uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
1581
    // Only loops for 1:1 algorithmic mappings.
1582
502
    for(;;) {
1583
502
        uint16_t norm16=getNorm16(c);
1584
502
        if(norm16<=minYesNo) {
1585
            // no decomposition or Hangul syllable, all zeros
1586
72
            return 0;
1587
430
        } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1588
            // combining mark
1589
0
            norm16&=0xff;
1590
0
            return norm16|(norm16<<8);
1591
430
        } else if(norm16>=minMaybeYes) {
1592
0
            return 0;
1593
430
        } else if(isDecompNoAlgorithmic(norm16)) {
1594
118
            c=mapAlgorithmic(c, norm16);
1595
312
        } else {
1596
            // c decomposes, get everything from the variable-length extra data
1597
312
            const uint16_t *mapping=getMapping(norm16);
1598
312
            uint16_t firstUnit=*mapping;
1599
312
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1600
                // A character that is deleted (maps to an empty string) must
1601
                // get the worst-case lccc and tccc values because arbitrary
1602
                // characters on both sides will become adjacent.
1603
2
                return 0x1ff;
1604
310
            } else {
1605
310
                norm16=firstUnit>>8;  // tccc
1606
310
                if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
1607
0
                    norm16|=*(mapping-1)&0xff00;  // lccc
1608
0
                }
1609
310
                return norm16;
1610
310
            }
1611
312
        }
1612
502
    }
1613
384
}
1614
1615
// Dual functionality:
1616
// buffer!=NULL: normalize
1617
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1618
const UChar *
1619
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
1620
                         ReorderingBuffer *buffer,
1621
0
                         UErrorCode &errorCode) const {
1622
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1623
    // Similar to the prevBoundary in the compose() implementation.
1624
0
    const UChar *prevBoundary=src;
1625
0
    int32_t prevFCD16=0;
1626
0
    if(limit==NULL) {
1627
0
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
1628
0
        if(U_FAILURE(errorCode)) {
1629
0
            return src;
1630
0
        }
1631
0
        if(prevBoundary<src) {
1632
0
            prevBoundary=src;
1633
            // We know that the previous character's lccc==0.
1634
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1635
0
            prevFCD16=getFCD16(*(src-1));
1636
0
            if(prevFCD16>1) {
1637
0
                --prevBoundary;
1638
0
            }
1639
0
        }
1640
0
        limit=u_strchr(src, 0);
1641
0
    }
1642
1643
    // Note: In this function we use buffer->appendZeroCC() because we track
1644
    // the lead and trail combining classes here, rather than leaving it to
1645
    // the ReorderingBuffer.
1646
    // The exception is the call to decomposeShort() which uses the buffer
1647
    // in the normal way.
1648
1649
0
    const UChar *prevSrc;
1650
0
    UChar32 c=0;
1651
0
    uint16_t fcd16=0;
1652
1653
0
    for(;;) {
1654
        // count code units with lccc==0
1655
0
        for(prevSrc=src; src!=limit;) {
1656
0
            if((c=*src)<MIN_CCC_LCCC_CP) {
1657
0
                prevFCD16=~c;
1658
0
                ++src;
1659
0
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1660
0
                prevFCD16=0;
1661
0
                ++src;
1662
0
            } else {
1663
0
                if(U16_IS_SURROGATE(c)) {
1664
0
                    UChar c2;
1665
0
                    if(U16_IS_SURROGATE_LEAD(c)) {
1666
0
                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
1667
0
                            c=U16_GET_SUPPLEMENTARY(c, c2);
1668
0
                        }
1669
0
                    } else /* trail surrogate */ {
1670
0
                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
1671
0
                            --src;
1672
0
                            c=U16_GET_SUPPLEMENTARY(c2, c);
1673
0
                        }
1674
0
                    }
1675
0
                }
1676
0
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
1677
0
                    prevFCD16=fcd16;
1678
0
                    src+=U16_LENGTH(c);
1679
0
                } else {
1680
0
                    break;
1681
0
                }
1682
0
            }
1683
0
        }
1684
        // copy these code units all at once
1685
0
        if(src!=prevSrc) {
1686
0
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
1687
0
                break;
1688
0
            }
1689
0
            if(src==limit) {
1690
0
                break;
1691
0
            }
1692
0
            prevBoundary=src;
1693
            // We know that the previous character's lccc==0.
1694
0
            if(prevFCD16<0) {
1695
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1696
0
                UChar32 prev=~prevFCD16;
1697
0
                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1698
0
                if(prevFCD16>1) {
1699
0
                    --prevBoundary;
1700
0
                }
1701
0
            } else {
1702
0
                const UChar *p=src-1;
1703
0
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
1704
0
                    --p;
1705
                    // Need to fetch the previous character's FCD value because
1706
                    // prevFCD16 was just for the trail surrogate code point.
1707
0
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
1708
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1709
0
                }
1710
0
                if(prevFCD16>1) {
1711
0
                    prevBoundary=p;
1712
0
                }
1713
0
            }
1714
            // The start of the current character (c).
1715
0
            prevSrc=src;
1716
0
        } else if(src==limit) {
1717
0
            break;
1718
0
        }
1719
1720
0
        src+=U16_LENGTH(c);
1721
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1722
        // Check for proper order, and decompose locally if necessary.
1723
0
        if((prevFCD16&0xff)<=(fcd16>>8)) {
1724
            // proper order: prev tccc <= current lccc
1725
0
            if((fcd16&0xff)<=1) {
1726
0
                prevBoundary=src;
1727
0
            }
1728
0
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
1729
0
                break;
1730
0
            }
1731
0
            prevFCD16=fcd16;
1732
0
            continue;
1733
0
        } else if(buffer==NULL) {
1734
0
            return prevBoundary;  // quick check "no"
1735
0
        } else {
1736
            /*
1737
             * Back out the part of the source that we copied or appended
1738
             * already but is now going to be decomposed.
1739
             * prevSrc is set to after what was copied/appended.
1740
             */
1741
0
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
1742
            /*
1743
             * Find the part of the source that needs to be decomposed,
1744
             * up to the next safe boundary.
1745
             */
1746
0
            src=findNextFCDBoundary(src, limit);
1747
            /*
1748
             * The source text does not fulfill the conditions for FCD.
1749
             * Decompose and reorder a limited piece of the text.
1750
             */
1751
0
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
1752
0
                break;
1753
0
            }
1754
0
            prevBoundary=src;
1755
0
            prevFCD16=0;
1756
0
        }
1757
0
    }
1758
0
    return src;
1759
0
}
1760
1761
void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1762
                                       UBool doMakeFCD,
1763
                                       UnicodeString &safeMiddle,
1764
                                       ReorderingBuffer &buffer,
1765
0
                                       UErrorCode &errorCode) const {
1766
0
    if(!buffer.isEmpty()) {
1767
0
        const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1768
0
        if(src!=firstBoundaryInSrc) {
1769
0
            const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1770
0
                                                                    buffer.getLimit());
1771
0
            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1772
0
            UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1773
0
            buffer.removeSuffix(destSuffixLength);
1774
0
            safeMiddle=middle;
1775
0
            middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1776
0
            const UChar *middleStart=middle.getBuffer();
1777
0
            makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1778
0
            if(U_FAILURE(errorCode)) {
1779
0
                return;
1780
0
            }
1781
0
            src=firstBoundaryInSrc;
1782
0
        }
1783
0
    }
1784
0
    if(doMakeFCD) {
1785
0
        makeFCD(src, limit, &buffer, errorCode);
1786
0
    } else {
1787
0
        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
1788
0
            limit=u_strchr(src, 0);
1789
0
        }
1790
0
        buffer.appendZeroCC(src, limit, errorCode);
1791
0
    }
1792
0
}
1793
1794
0
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
1795
0
    while(start<p && previousFCD16(start, p)>0xff) {}
1796
0
    return p;
1797
0
}
1798
1799
0
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
1800
0
    while(p<limit) {
1801
0
        const UChar *codePointStart=p;
1802
0
        if(nextFCD16(p, limit)<=0xff) {
1803
0
            return codePointStart;
1804
0
        }
1805
0
    }
1806
0
    return p;
1807
0
}
1808
1809
// CanonicalIterator data -------------------------------------------------- ***
1810
1811
CanonIterData::CanonIterData(UErrorCode &errorCode) :
1812
0
        trie(utrie2_open(0, 0, &errorCode)),
1813
0
        canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
1814
1815
0
CanonIterData::~CanonIterData() {
1816
0
    utrie2_close(trie);
1817
0
}
1818
1819
0
void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1820
0
    uint32_t canonValue=utrie2_get32(trie, decompLead);
1821
0
    if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1822
        // origin is the first character whose decomposition starts with
1823
        // the character for which we are setting the value.
1824
0
        utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1825
0
    } else {
1826
        // origin is not the first character, or it is U+0000.
1827
0
        UnicodeSet *set;
1828
0
        if((canonValue&CANON_HAS_SET)==0) {
1829
0
            set=new UnicodeSet;
1830
0
            if(set==NULL) {
1831
0
                errorCode=U_MEMORY_ALLOCATION_ERROR;
1832
0
                return;
1833
0
            }
1834
0
            UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1835
0
            canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1836
0
            utrie2_set32(trie, decompLead, canonValue, &errorCode);
1837
0
            canonStartSets.addElement(set, errorCode);
1838
0
            if(firstOrigin!=0) {
1839
0
                set->add(firstOrigin);
1840
0
            }
1841
0
        } else {
1842
0
            set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1843
0
        }
1844
0
        set->add(origin);
1845
0
    }
1846
0
}
1847
1848
U_CDECL_BEGIN
1849
1850
// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1851
//     context: the Normalizer2Impl
1852
static UBool U_CALLCONV
1853
0
enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1854
0
    UErrorCode errorCode = U_ZERO_ERROR;
1855
0
    if (value != 0) {
1856
0
        Normalizer2Impl *impl = (Normalizer2Impl *)context;
1857
0
        impl->makeCanonIterDataFromNorm16(
1858
0
            start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
1859
0
    }
1860
0
    return U_SUCCESS(errorCode);
1861
0
}
1862
1863
1864
1865
// UInitOnce instantiation function for CanonIterData
1866
1867
static void U_CALLCONV 
1868
0
initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
1869
0
    U_ASSERT(impl->fCanonIterData == NULL);
1870
0
    impl->fCanonIterData = new CanonIterData(errorCode);
1871
0
    if (impl->fCanonIterData == NULL) {
1872
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
1873
0
    }
1874
0
    if (U_SUCCESS(errorCode)) {
1875
0
        utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
1876
0
        utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1877
0
    }
1878
0
    if (U_FAILURE(errorCode)) {
1879
0
        delete impl->fCanonIterData;
1880
0
        impl->fCanonIterData = NULL;
1881
0
    }
1882
0
}
1883
1884
U_CDECL_END
1885
1886
void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
1887
                                                  CanonIterData &newData,
1888
0
                                                  UErrorCode &errorCode) const {
1889
0
    if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
1890
        // Inert, or 2-way mapping (including Hangul syllable).
1891
        // We do not write a canonStartSet for any yesNo character.
1892
        // Composites from 2-way mappings are added at runtime from the
1893
        // starter's compositions list, and the other characters in
1894
        // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1895
        // "maybe" characters.
1896
0
        return;
1897
0
    }
1898
0
    for(UChar32 c=start; c<=end; ++c) {
1899
0
        uint32_t oldValue=utrie2_get32(newData.trie, c);
1900
0
        uint32_t newValue=oldValue;
1901
0
        if(norm16>=minMaybeYes) {
1902
            // not a segment starter if it occurs in a decomposition or has cc!=0
1903
0
            newValue|=CANON_NOT_SEGMENT_STARTER;
1904
0
            if(norm16<MIN_NORMAL_MAYBE_YES) {
1905
0
                newValue|=CANON_HAS_COMPOSITIONS;
1906
0
            }
1907
0
        } else if(norm16<minYesNo) {
1908
0
            newValue|=CANON_HAS_COMPOSITIONS;
1909
0
        } else {
1910
            // c has a one-way decomposition
1911
0
            UChar32 c2=c;
1912
0
            uint16_t norm16_2=norm16;
1913
0
            while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
1914
0
                c2=mapAlgorithmic(c2, norm16_2);
1915
0
                norm16_2=getNorm16(c2);
1916
0
            }
1917
0
            if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
1918
                // c decomposes, get everything from the variable-length extra data
1919
0
                const uint16_t *mapping=getMapping(norm16_2);
1920
0
                uint16_t firstUnit=*mapping;
1921
0
                int32_t length=firstUnit&MAPPING_LENGTH_MASK;
1922
0
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1923
0
                    if(c==c2 && (*(mapping-1)&0xff)!=0) {
1924
0
                        newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
1925
0
                    }
1926
0
                }
1927
                // Skip empty mappings (no characters in the decomposition).
1928
0
                if(length!=0) {
1929
0
                    ++mapping;  // skip over the firstUnit
1930
                    // add c to first code point's start set
1931
0
                    int32_t i=0;
1932
0
                    U16_NEXT_UNSAFE(mapping, i, c2);
1933
0
                    newData.addToStartSet(c, c2, errorCode);
1934
                    // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1935
                    // one-way mapping. A 2-way mapping is possible here after
1936
                    // intermediate algorithmic mapping.
1937
0
                    if(norm16_2>=minNoNo) {
1938
0
                        while(i<length) {
1939
0
                            U16_NEXT_UNSAFE(mapping, i, c2);
1940
0
                            uint32_t c2Value=utrie2_get32(newData.trie, c2);
1941
0
                            if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
1942
0
                                utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
1943
0
                                             &errorCode);
1944
0
                            }
1945
0
                        }
1946
0
                    }
1947
0
                }
1948
0
            } else {
1949
                // c decomposed to c2 algorithmically; c has cc==0
1950
0
                newData.addToStartSet(c, c2, errorCode);
1951
0
            }
1952
0
        }
1953
0
        if(newValue!=oldValue) {
1954
0
            utrie2_set32(newData.trie, c, newValue, &errorCode);
1955
0
        }
1956
0
    }
1957
0
}
1958
1959
0
UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1960
    // Logically const: Synchronized instantiation.
1961
0
    Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1962
0
    umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
1963
0
    return U_SUCCESS(errorCode);
1964
0
}
1965
1966
0
int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1967
0
    return (int32_t)utrie2_get32(fCanonIterData->trie, c);
1968
0
}
1969
1970
0
const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1971
0
    return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
1972
0
}
1973
1974
0
UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1975
0
    return getCanonValue(c)>=0;
1976
0
}
1977
1978
0
UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1979
0
    int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1980
0
    if(canonValue==0) {
1981
0
        return FALSE;
1982
0
    }
1983
0
    set.clear();
1984
0
    int32_t value=canonValue&CANON_VALUE_MASK;
1985
0
    if((canonValue&CANON_HAS_SET)!=0) {
1986
0
        set.addAll(getCanonStartSet(value));
1987
0
    } else if(value!=0) {
1988
0
        set.add(value);
1989
0
    }
1990
0
    if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1991
0
        uint16_t norm16=getNorm16(c);
1992
0
        if(norm16==JAMO_L) {
1993
0
            UChar32 syllable=
1994
0
                (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1995
0
            set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1996
0
        } else {
1997
0
            addComposites(getCompositionsList(norm16), set);
1998
0
        }
1999
0
    }
2000
0
    return TRUE;
2001
0
}
2002
2003
U_NAMESPACE_END
2004
2005
// Normalizer2 data swapping ----------------------------------------------- ***
2006
2007
U_NAMESPACE_USE
2008
2009
U_CAPI int32_t U_EXPORT2
2010
unorm2_swap(const UDataSwapper *ds,
2011
            const void *inData, int32_t length, void *outData,
2012
0
            UErrorCode *pErrorCode) {
2013
0
    const UDataInfo *pInfo;
2014
0
    int32_t headerSize;
2015
2016
0
    const uint8_t *inBytes;
2017
0
    uint8_t *outBytes;
2018
2019
0
    const int32_t *inIndexes;
2020
0
    int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
2021
2022
0
    int32_t i, offset, nextOffset, size;
2023
2024
    /* udata_swapDataHeader checks the arguments */
2025
0
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
2026
0
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2027
0
        return 0;
2028
0
    }
2029
2030
    /* check data format and format version */
2031
0
    pInfo=(const UDataInfo *)((const char *)inData+4);
2032
0
    if(!(
2033
0
        pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
2034
0
        pInfo->dataFormat[1]==0x72 &&
2035
0
        pInfo->dataFormat[2]==0x6d &&
2036
0
        pInfo->dataFormat[3]==0x32 &&
2037
0
        (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
2038
0
    )) {
2039
0
        udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2040
0
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
2041
0
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
2042
0
                         pInfo->formatVersion[0]);
2043
0
        *pErrorCode=U_UNSUPPORTED_ERROR;
2044
0
        return 0;
2045
0
    }
2046
2047
0
    inBytes=(const uint8_t *)inData+headerSize;
2048
0
    outBytes=(uint8_t *)outData+headerSize;
2049
2050
0
    inIndexes=(const int32_t *)inBytes;
2051
2052
0
    if(length>=0) {
2053
0
        length-=headerSize;
2054
0
        if(length<(int32_t)sizeof(indexes)) {
2055
0
            udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2056
0
                             length);
2057
0
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2058
0
            return 0;
2059
0
        }
2060
0
    }
2061
2062
    /* read the first few indexes */
2063
0
    for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2064
0
        indexes[i]=udata_readInt32(ds, inIndexes[i]);
2065
0
    }
2066
2067
    /* get the total length of the data */
2068
0
    size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2069
2070
0
    if(length>=0) {
2071
0
        if(length<size) {
2072
0
            udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2073
0
                             length);
2074
0
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2075
0
            return 0;
2076
0
        }
2077
2078
        /* copy the data for inaccessible bytes */
2079
0
        if(inBytes!=outBytes) {
2080
0
            uprv_memcpy(outBytes, inBytes, size);
2081
0
        }
2082
2083
0
        offset=0;
2084
2085
        /* swap the int32_t indexes[] */
2086
0
        nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2087
0
        ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2088
0
        offset=nextOffset;
2089
2090
        /* swap the UTrie2 */
2091
0
        nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2092
0
        utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2093
0
        offset=nextOffset;
2094
2095
        /* swap the uint16_t extraData[] */
2096
0
        nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
2097
0
        ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2098
0
        offset=nextOffset;
2099
2100
        /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2101
0
        nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2102
0
        offset=nextOffset;
2103
2104
0
        U_ASSERT(offset==size);
2105
0
    }
2106
2107
0
    return headerSize+size;
2108
0
}
2109
2110
#endif  // !UCONFIG_NO_NORMALIZATION