Coverage Report

Created: 2023-03-04 07:00

/src/icu/icu4c/source/i18n/collationdatabuilder.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationdatabuilder.cpp
9
*
10
* (replaced the former ucol_elm.cpp)
11
*
12
* created on: 2012apr01
13
* created by: Markus W. Scherer
14
*/
15
16
#include "unicode/utypes.h"
17
18
#if !UCONFIG_NO_COLLATION
19
20
#include "unicode/localpointer.h"
21
#include "unicode/uchar.h"
22
#include "unicode/ucharstrie.h"
23
#include "unicode/ucharstriebuilder.h"
24
#include "unicode/uniset.h"
25
#include "unicode/unistr.h"
26
#include "unicode/usetiter.h"
27
#include "unicode/utf16.h"
28
#include "cmemory.h"
29
#include "collation.h"
30
#include "collationdata.h"
31
#include "collationdatabuilder.h"
32
#include "collationfastlatinbuilder.h"
33
#include "collationiterator.h"
34
#include "normalizer2impl.h"
35
#include "utrie2.h"
36
#include "uvectr32.h"
37
#include "uvectr64.h"
38
#include "uvector.h"
39
40
U_NAMESPACE_BEGIN
41
42
572
CollationDataBuilder::CEModifier::~CEModifier() {}
43
44
/**
45
 * Build-time context and CE32 for a code point.
46
 * If a code point has contextual mappings, then the default (no-context) mapping
47
 * and all conditional mappings are stored in a singly-linked list
48
 * of ConditionalCE32, sorted by context strings.
49
 *
50
 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
51
 * Context strings must be unique and in ascending order.
52
 */
53
struct ConditionalCE32 : public UMemory {
54
    ConditionalCE32()
55
            : context(),
56
              ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
57
1.54k
              next(-1) {}
58
    ConditionalCE32(const UnicodeString &ct, uint32_t ce)
59
            : context(ct),
60
              ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
61
271k
              next(-1) {}
62
63
0
    inline UBool hasContext() const { return context.length() > 1; }
64
19.3M
    inline int32_t prefixLength() const { return context.charAt(0); }
65
66
    /**
67
     * "\0" for the first entry for any code point, with its default CE32.
68
     *
69
     * Otherwise one unit with the length of the prefix string,
70
     * then the prefix string, then the contraction suffix.
71
     */
72
    UnicodeString context;
73
    /**
74
     * CE32 for the code point and its context.
75
     * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
76
     */
77
    uint32_t ce32;
78
    /**
79
     * Default CE32 for all contexts with this same prefix.
80
     * Initially NO_CE32. Set only while building runtime data structures,
81
     * and only on one of the nodes of a sub-list with the same prefix.
82
     */
83
    uint32_t defaultCE32;
84
    /**
85
     * CE32 for the built contexts.
86
     * When fetching CEs from the builder, the contexts are built into their runtime form
87
     * so that the normal collation implementation can process them.
88
     * The result is cached in the list head. It is reset when the contexts are modified.
89
     * All of these builtCE32 are invalidated by clearContexts(),
90
     * via incrementing the contextsEra.
91
     */
92
    uint32_t builtCE32;
93
    /**
94
     * The "era" of building intermediate contexts when the above builtCE32 was set.
95
     * When the array of cached, temporary contexts overflows, then clearContexts()
96
     * removes them all and invalidates the builtCE32 that used to point to built tries.
97
     */
98
    int32_t era = -1;
99
    /**
100
     * Index of the next ConditionalCE32.
101
     * Negative for the end of the list.
102
     */
103
    int32_t next;
104
    // Note: We could create a separate class for all of the contextual mappings for
105
    // a code point, with the builtCE32, the era, and a list of the actual mappings.
106
    // The class that represents one mapping would then not need to
107
    // store those fields in each element.
108
};
109
110
U_CDECL_BEGIN
111
112
void U_CALLCONV
113
271k
uprv_deleteConditionalCE32(void *obj) {
114
271k
    delete static_cast<ConditionalCE32 *>(obj);
115
271k
}
116
117
U_CDECL_END
118
119
/**
120
 * Build-time collation element and character iterator.
121
 * Uses the runtime CollationIterator for fetching CEs for a string
122
 * but reads from the builder's unfinished data structures.
123
 * In particular, this class reads from the unfinished trie
124
 * and has to avoid CollationIterator::nextCE() and redirect other
125
 * calls to data->getCE32() and data->getCE32FromSupplementary().
126
 *
127
 * We do this so that we need not implement the collation algorithm
128
 * again for the builder and make it behave exactly like the runtime code.
129
 * That would be more difficult to test and maintain than this indirection.
130
 *
131
 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
132
 * so the data accesses from those code paths need not be modified.
133
 *
134
 * This class iterates directly over whole code points
135
 * so that the CollationIterator does not need the finished trie
136
 * for handling the LEAD_SURROGATE_TAG.
137
 */
138
class DataBuilderCollationIterator : public CollationIterator {
139
public:
140
    DataBuilderCollationIterator(CollationDataBuilder &b);
141
142
    virtual ~DataBuilderCollationIterator();
143
144
    int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);
145
146
    virtual void resetToOffset(int32_t newOffset) override;
147
    virtual int32_t getOffset() const override;
148
149
    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
150
    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
151
152
protected:
153
    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
154
    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
155
156
    virtual uint32_t getDataCE32(UChar32 c) const override;
157
    virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) override;
158
159
    CollationDataBuilder &builder;
160
    CollationData builderData;
161
    uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
162
    const UnicodeString *s;
163
    int32_t pos;
164
};
165
166
DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b)
167
        : CollationIterator(&builderData, /*numeric=*/ false),
168
          builder(b), builderData(b.nfcImpl),
169
1.22k
          s(nullptr), pos(0) {
170
1.22k
    builderData.base = builder.base;
171
    // Set all of the jamoCE32s[] to indirection CE32s.
172
83.3k
    for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
173
82.1k
        UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j);
174
82.1k
        jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) |
175
82.1k
                CollationDataBuilder::IS_BUILDER_JAMO_CE32;
176
82.1k
    }
177
1.22k
    builderData.jamoCE32s = jamoCE32s;
178
1.22k
}
179
180
1.22k
DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
181
182
int32_t
183
DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start,
184
5.51M
                                       int64_t ces[], int32_t cesLength) {
185
    // Set the pointers each time, in case they changed due to reallocation.
186
5.51M
    builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer());
187
5.51M
    builderData.ces = builder.ce64s.getBuffer();
188
5.51M
    builderData.contexts = builder.contexts.getBuffer();
189
    // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
190
5.51M
    reset();
191
5.51M
    s = &str;
192
5.51M
    pos = start;
193
5.51M
    UErrorCode errorCode = U_ZERO_ERROR;
194
98.4M
    while(U_SUCCESS(errorCode) && pos < s->length()) {
195
        // No need to keep all CEs in the iterator buffer.
196
92.9M
        clearCEs();
197
92.9M
        UChar32 c = s->char32At(pos);
198
92.9M
        pos += U16_LENGTH(c);
199
92.9M
        uint32_t ce32 = utrie2_get32(builder.trie, c);
200
92.9M
        const CollationData *d;
201
92.9M
        if(ce32 == Collation::FALLBACK_CE32) {
202
81.6M
            d = builder.base;
203
81.6M
            ce32 = builder.base->getCE32(c);
204
81.6M
        } else {
205
11.3M
            d = &builderData;
206
11.3M
        }
207
92.9M
        appendCEsFromCE32(d, c, ce32, /*forward=*/ true, errorCode);
208
92.9M
        U_ASSERT(U_SUCCESS(errorCode));
209
202M
        for(int32_t i = 0; i < getCEsLength(); ++i) {
210
109M
            int64_t ce = getCE(i);
211
109M
            if(ce != 0) {
212
79.9M
                if(cesLength < Collation::MAX_EXPANSION_LENGTH) {
213
13.0M
                    ces[cesLength] = ce;
214
13.0M
                }
215
79.9M
                ++cesLength;
216
79.9M
            }
217
109M
        }
218
92.9M
    }
219
5.51M
    return cesLength;
220
5.51M
}
221
222
void
223
0
DataBuilderCollationIterator::resetToOffset(int32_t newOffset) {
224
0
    reset();
225
0
    pos = newOffset;
226
0
}
227
228
int32_t
229
0
DataBuilderCollationIterator::getOffset() const {
230
0
    return pos;
231
0
}
232
233
UChar32
234
83.2M
DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
235
83.2M
    if(pos == s->length()) {
236
245k
        return U_SENTINEL;
237
245k
    }
238
83.0M
    UChar32 c = s->char32At(pos);
239
83.0M
    pos += U16_LENGTH(c);
240
83.0M
    return c;
241
83.2M
}
242
243
UChar32
244
13.4M
DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
245
13.4M
    if(pos == 0) {
246
15.3k
        return U_SENTINEL;
247
15.3k
    }
248
13.4M
    UChar32 c = s->char32At(pos - 1);
249
13.4M
    pos -= U16_LENGTH(c);
250
13.4M
    return c;
251
13.4M
}
252
253
void
254
6.35M
DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
255
6.35M
    pos = s->moveIndex32(pos, num);
256
6.35M
}
257
258
void
259
9.95M
DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
260
9.95M
    pos = s->moveIndex32(pos, -num);
261
9.95M
}
262
263
uint32_t
264
116k
DataBuilderCollationIterator::getDataCE32(UChar32 c) const {
265
116k
    return utrie2_get32(builder.trie, c);
266
116k
}
267
268
uint32_t
269
15.2M
DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) {
270
15.2M
    if (U_FAILURE(errorCode)) { return 0; }
271
15.2M
    U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG));
272
15.2M
    if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) {
273
10.3M
        UChar32 jamo = Collation::indexFromCE32(ce32);
274
10.3M
        return utrie2_get32(builder.trie, jamo);
275
10.3M
    } else {
276
4.85M
        ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32);
277
4.85M
        if (cond == nullptr) {
278
0
            errorCode = U_INTERNAL_PROGRAM_ERROR;
279
            // TODO: ICU-21531 figure out why this happens.
280
0
            return 0;
281
0
        }
282
4.85M
        if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) {
283
            // Build the context-sensitive mappings into their runtime form and cache the result.
284
188k
            cond->builtCE32 = builder.buildContext(cond, errorCode);
285
188k
            if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
286
570
                errorCode = U_ZERO_ERROR;
287
570
                builder.clearContexts();
288
570
                cond->builtCE32 = builder.buildContext(cond, errorCode);
289
570
            }
290
188k
            cond->era = builder.contextsEra;
291
188k
            builderData.contexts = builder.contexts.getBuffer();
292
188k
        }
293
4.85M
        return cond->builtCE32;
294
4.85M
    }
295
15.2M
}
296
297
// ------------------------------------------------------------------------- ***
298
299
CollationDataBuilder::CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
300
        : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
301
          base(nullptr), baseSettings(nullptr),
302
          trie(nullptr),
303
          ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
304
          modified(false),
305
          icu4xMode(icu4xMode),
306
          fastLatinEnabled(false), fastLatinBuilder(nullptr),
307
2.43k
          collIter(nullptr) {
308
    // Reserve the first CE32 for U+0000.
309
2.43k
    if (!icu4xMode) {
310
2.43k
        ce32s.addElement(0, errorCode);
311
2.43k
    }
312
2.43k
    conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
313
2.43k
}
314
315
2.43k
CollationDataBuilder::~CollationDataBuilder() {
316
2.43k
    utrie2_close(trie);
317
2.43k
    delete fastLatinBuilder;
318
2.43k
    delete collIter;
319
2.43k
}
320
321
void
322
2.43k
CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) {
323
2.43k
    if(U_FAILURE(errorCode)) { return; }
324
2.43k
    if(trie != nullptr) {
325
0
        errorCode = U_INVALID_STATE_ERROR;
326
0
        return;
327
0
    }
328
2.43k
    if(b == nullptr) {
329
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
330
0
        return;
331
0
    }
332
2.43k
    base = b;
333
334
    // For a tailoring, the default is to fall back to the base.
335
    // For ICU4X, use the same value for fallback as for the default
336
    // to avoid having to have different blocks for the two.
337
2.43k
    trie = utrie2_open(Collation::FALLBACK_CE32, icu4xMode ? Collation::FALLBACK_CE32 : Collation::FFFD_CE32, &errorCode);
338
339
2.43k
    if (!icu4xMode) {
340
        // Set the Latin-1 letters block so that it is allocated first in the data array,
341
        // to try to improve locality of reference when sorting Latin-1 text.
342
        // Do not use utrie2_setRange32() since that will not actually allocate blocks
343
        // that are filled with the default value.
344
        // ASCII (0..7F) is already preallocated anyway.
345
158k
        for(UChar32 c = 0xc0; c <= 0xff; ++c) {
346
156k
            utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
347
156k
        }
348
349
        // Hangul syllables are not tailorable (except via tailoring Jamos).
350
        // Always set the Hangul tag to help performance.
351
        // Do this here, rather than in buildMappings(),
352
        // so that we see the HANGUL_TAG in various assertions.
353
2.43k
        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
354
2.43k
        utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
355
356
        // Copy the set contents but don't copy/clone the set as a whole because
357
        // that would copy the isFrozen state too.
358
2.43k
        unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
359
2.43k
    }
360
361
2.43k
    if(U_FAILURE(errorCode)) { return; }
362
2.43k
}
363
364
UBool
365
CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
366
                                           uint32_t primary, int32_t step,
367
0
                                           UErrorCode &errorCode) {
368
0
    if(U_FAILURE(errorCode)) { return false; }
369
0
    U_ASSERT(start <= end);
370
    // TODO: Do we need to check what values are currently set for start..end?
371
    // An offset range is worth it only if we can achieve an overlap between
372
    // adjacent UTrie2 blocks of 32 code points each.
373
    // An offset CE is also a little more expensive to look up and compute
374
    // than a simple CE.
375
    // If the range spans at least three UTrie2 block boundaries (> 64 code points),
376
    // then we take it.
377
    // If the range spans one or two block boundaries and there are
378
    // at least 4 code points on either side, then we take it.
379
    // (We could additionally require a minimum range length of, say, 16.)
380
0
    int32_t blockDelta = (end >> 5) - (start >> 5);
381
0
    if(2 <= step && step <= 0x7f &&
382
0
            (blockDelta >= 3 ||
383
0
            (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) {
384
0
        int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step;
385
0
        if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
386
0
        int32_t index = addCE(dataCE, errorCode);
387
0
        if(U_FAILURE(errorCode)) { return 0; }
388
0
        if(index > Collation::MAX_INDEX) {
389
0
            errorCode = U_BUFFER_OVERFLOW_ERROR;
390
0
            return 0;
391
0
        }
392
0
        uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
393
0
        utrie2_setRange32(trie, start, end, offsetCE32, true, &errorCode);
394
0
        modified = true;
395
0
        return true;
396
0
    } else {
397
0
        return false;
398
0
    }
399
0
}
400
401
uint32_t
402
CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
403
                                                   uint32_t primary, int32_t step,
404
0
                                                   UErrorCode &errorCode) {
405
0
    if(U_FAILURE(errorCode)) { return 0; }
406
0
    UBool isCompressible = isCompressiblePrimary(primary);
407
0
    if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) {
408
0
        return Collation::incThreeBytePrimaryByOffset(primary, isCompressible,
409
0
                                                      (end - start + 1) * step);
410
0
    } else {
411
        // Short range: Set individual CE32s.
412
0
        for(;;) {
413
0
            utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode);
414
0
            ++start;
415
0
            primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step);
416
0
            if(start > end) { return primary; }
417
0
        }
418
0
        modified = true;
419
0
    }
420
0
}
421
422
uint32_t
423
214k
CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const {
424
214k
    int32_t i = Collation::indexFromCE32(ce32);
425
214k
    int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i);
426
214k
    uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
427
214k
    return Collation::makeLongPrimaryCE32(p);
428
214k
}
429
430
UBool
431
0
CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const {
432
0
    return base->isCompressibleLeadByte(b);
433
0
}
434
435
UBool
436
0
CollationDataBuilder::isAssigned(UChar32 c) const {
437
0
    return Collation::isAssignedCE32(utrie2_get32(trie, c));
438
0
}
439
440
uint32_t
441
0
CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const {
442
0
    uint32_t ce32 = utrie2_get32(trie, c);
443
0
    if(Collation::isLongPrimaryCE32(ce32)) {
444
0
        return Collation::primaryFromLongPrimaryCE32(ce32);
445
0
    } else {
446
0
        return 0;
447
0
    }
448
0
}
449
450
int64_t
451
0
CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
452
0
    if(U_FAILURE(errorCode)) { return 0; }
453
    // Keep parallel with CollationData::getSingleCE().
454
0
    UBool fromBase = false;
455
0
    uint32_t ce32 = utrie2_get32(trie, c);
456
0
    if(ce32 == Collation::FALLBACK_CE32) {
457
0
        fromBase = true;
458
0
        ce32 = base->getCE32(c);
459
0
    }
460
0
    while(Collation::isSpecialCE32(ce32)) {
461
0
        switch(Collation::tagFromCE32(ce32)) {
462
0
        case Collation::LATIN_EXPANSION_TAG:
463
0
        case Collation::BUILDER_DATA_TAG:
464
0
        case Collation::PREFIX_TAG:
465
0
        case Collation::CONTRACTION_TAG:
466
0
        case Collation::HANGUL_TAG:
467
0
        case Collation::LEAD_SURROGATE_TAG:
468
0
            errorCode = U_UNSUPPORTED_ERROR;
469
0
            return 0;
470
0
        case Collation::FALLBACK_TAG:
471
0
        case Collation::RESERVED_TAG_3:
472
0
            errorCode = U_INTERNAL_PROGRAM_ERROR;
473
0
            return 0;
474
0
        case Collation::LONG_PRIMARY_TAG:
475
0
            return Collation::ceFromLongPrimaryCE32(ce32);
476
0
        case Collation::LONG_SECONDARY_TAG:
477
0
            return Collation::ceFromLongSecondaryCE32(ce32);
478
0
        case Collation::EXPANSION32_TAG:
479
0
            if(Collation::lengthFromCE32(ce32) == 1) {
480
0
                int32_t i = Collation::indexFromCE32(ce32);
481
0
                ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
482
0
                break;
483
0
            } else {
484
0
                errorCode = U_UNSUPPORTED_ERROR;
485
0
                return 0;
486
0
            }
487
0
        case Collation::EXPANSION_TAG: {
488
0
            if(Collation::lengthFromCE32(ce32) == 1) {
489
0
                int32_t i = Collation::indexFromCE32(ce32);
490
0
                return fromBase ? base->ces[i] : ce64s.elementAti(i);
491
0
            } else {
492
0
                errorCode = U_UNSUPPORTED_ERROR;
493
0
                return 0;
494
0
            }
495
0
        }
496
0
        case Collation::DIGIT_TAG:
497
            // Fetch the non-numeric-collation CE32 and continue.
498
0
            ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32));
499
0
            break;
500
0
        case Collation::U0000_TAG:
501
0
            U_ASSERT(c == 0);
502
            // Fetch the normal ce32 for U+0000 and continue.
503
0
            ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0);
504
0
            break;
505
0
        case Collation::OFFSET_TAG:
506
0
            ce32 = getCE32FromOffsetCE32(fromBase, c, ce32);
507
0
            break;
508
0
        case Collation::IMPLICIT_TAG:
509
0
            return Collation::unassignedCEFromCodePoint(c);
510
0
        }
511
0
    }
512
0
    return Collation::ceFromSimpleCE32(ce32);
513
0
}
514
515
int32_t
516
67.0k
CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) {
517
67.0k
    int32_t length = ce64s.size();
518
94.6M
    for(int32_t i = 0; i < length; ++i) {
519
94.5M
        if(ce == ce64s.elementAti(i)) { return i; }
520
94.5M
    }
521
56.9k
    ce64s.addElement(ce, errorCode);
522
56.9k
    return length;
523
67.0k
}
524
525
int32_t
526
8.83k
CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) {
527
8.83k
    int32_t length = ce32s.size();
528
17.3M
    for(int32_t i = 0; i < length; ++i) {
529
17.3M
        if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; }
530
17.3M
    }
531
5.61k
    ce32s.addElement((int32_t)ce32, errorCode);  
532
5.61k
    return length;
533
8.83k
}
534
535
int32_t
536
CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32,
537
271k
                                         UErrorCode &errorCode) {
538
271k
    if(U_FAILURE(errorCode)) { return -1; }
539
271k
    U_ASSERT(!context.isEmpty());
540
271k
    int32_t index = conditionalCE32s.size();
541
271k
    if(index > Collation::MAX_INDEX) {
542
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
543
0
        return -1;
544
0
    }
545
271k
    LocalPointer<ConditionalCE32> cond(new ConditionalCE32(context, ce32), errorCode);
546
271k
    conditionalCE32s.adoptElement(cond.orphan(), errorCode);
547
271k
    if(U_FAILURE(errorCode)) {
548
0
        return -1;
549
0
    }
550
271k
    return index;
551
271k
}
552
553
void
554
CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s,
555
                          const int64_t ces[], int32_t cesLength,
556
0
                          UErrorCode &errorCode) {
557
0
    uint32_t ce32 = encodeCEs(ces, cesLength, errorCode);
558
0
    addCE32(prefix, s, ce32, errorCode);
559
0
}
560
561
void
562
CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
563
1.35M
                              uint32_t ce32, UErrorCode &errorCode) {
564
1.35M
    if(U_FAILURE(errorCode)) { return; }
565
1.35M
    if(s.isEmpty()) {
566
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
567
0
        return;
568
0
    }
569
1.35M
    if(trie == nullptr || utrie2_isFrozen(trie)) {
570
0
        errorCode = U_INVALID_STATE_ERROR;
571
0
        return;
572
0
    }
573
1.35M
    UChar32 c = s.char32At(0);
574
1.35M
    int32_t cLength = U16_LENGTH(c);
575
1.35M
    uint32_t oldCE32 = utrie2_get32(trie, c);
576
1.35M
    UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
577
578
1.35M
    if (icu4xMode) {
579
0
        if (base && c >= 0x1100 && c < 0x1200) {
580
            // Omit jamo tailorings.
581
            // TODO(https://github.com/unicode-org/icu4x/issues/1941).
582
0
        }
583
0
        const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(errorCode);
584
0
        UnicodeString sInNfd;
585
0
        nfdNormalizer->normalize(s, sInNfd, errorCode);
586
0
        if (s != sInNfd) {
587
            // s is not in NFD, so it cannot match in ICU4X, since ICU4X only
588
            // does NFD lookups.
589
            // Now check that we're only rejecting known cases.
590
0
            if (s.length() == 2) {
591
0
                char16_t second = s.charAt(1);
592
0
                if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
593
                    // Second is a special decomposing Tibetan vowel sign.
594
                    // These also get added in the decomposed form, so ignoring
595
                    // this instance is OK.
596
0
                    return;
597
0
                }
598
0
                if (c == 0xFDD1 && second == 0xAC00) {
599
                    // This strange contraction exists in the root and
600
                    // doesn't have a decomposed counterpart there.
601
                    // This won't match in ICU4X anyway and is very strange:
602
                    // Unassigned Arabic presentation form contracting with
603
                    // the very first Hangul syllable. Let's ignore this
604
                    // explicitly.
605
0
                    return;
606
0
                }
607
0
            }
608
            // Unknown case worth investigating if ever found.
609
0
            errorCode = U_UNSUPPORTED_ERROR;
610
0
            return;
611
0
        }
612
613
0
        if (!prefix.isEmpty()) {
614
0
            UnicodeString prefixInNfd;
615
0
            nfdNormalizer->normalize(prefix, prefixInNfd, errorCode);
616
0
            if (prefix != prefixInNfd) {
617
0
                errorCode = U_UNSUPPORTED_ERROR;
618
0
                return;
619
0
            }
620
621
0
            int32_t count = prefix.countChar32();
622
0
            if (count > 2) {
623
                // Prefix too long for ICU4X.
624
0
                errorCode = U_UNSUPPORTED_ERROR;
625
0
                return;
626
0
            }
627
0
            UChar32 utf32[4];
628
0
            int32_t len = prefix.toUTF32(utf32, 4, errorCode);
629
0
            if (len != count) {
630
0
                errorCode = U_INVALID_STATE_ERROR;
631
0
                return;
632
0
            }
633
0
            UChar32 c = utf32[0];
634
0
            if (u_getCombiningClass(c)) {
635
                // Prefix must start with as starter for ICU4X.
636
0
                errorCode = U_UNSUPPORTED_ERROR;
637
0
                return;
638
0
            }
639
            // XXX: Korean searchjl has jamo in prefix, so commenting out this
640
            // check for now. ICU4X currently ignores non-root jamo tables anyway.
641
            // searchjl was added in
642
            // https://unicode-org.atlassian.net/browse/CLDR-3560
643
            // Contractions were changed to prefixes in
644
            // https://unicode-org.atlassian.net/browse/CLDR-6546
645
            //
646
            // if ((c >= 0x1100 && c < 0x1200) || (c >= 0xAC00 && c < 0xD7A4)) {
647
            //     errorCode = U_UNSUPPORTED_ERROR;
648
            //     return;
649
            // }
650
0
            if ((len > 1) && !(utf32[1] == 0x3099 || utf32[1] == 0x309A)) {
651
                // Second character in prefix, if present, must be a kana voicing mark for ICU4X.
652
0
                errorCode = U_UNSUPPORTED_ERROR;
653
0
                return;
654
0
            }
655
0
        }
656
657
0
        if (s.length() > cLength) {
658
            // Check that there's no modern Hangul in contractions.
659
0
            for (int32_t i = 0; i < s.length(); ++i) {
660
0
                char16_t c = s.charAt(i);
661
0
                if ((c >= 0x1100 && c < 0x1100 + 19) || (c >= 0x1161 && c < 0x1161 + 21) || (c >= 0x11A7 && c < 0x11A7 + 28) || (c >= 0xAC00 && c < 0xD7A4)) {
662
0
                    errorCode = U_UNSUPPORTED_ERROR;
663
0
                    return;
664
0
                }
665
0
            }
666
0
        }
667
0
    }
668
669
1.35M
    if(oldCE32 == Collation::FALLBACK_CE32) {
670
        // First tailoring for c.
671
        // If c has contextual base mappings or if we add a contextual mapping,
672
        // then copy the base mappings.
673
        // Otherwise we just override the base mapping.
674
461k
        uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
675
461k
        if(hasContext || Collation::ce32HasContext(baseCE32)) {
676
14.2k
            oldCE32 = copyFromBaseCE32(c, baseCE32, true, errorCode);
677
14.2k
            utrie2_set32(trie, c, oldCE32, &errorCode);
678
14.2k
            if(U_FAILURE(errorCode)) { return; }
679
14.2k
        }
680
461k
    }
681
1.35M
    if(!hasContext) {
682
        // No prefix, no contraction.
683
1.17M
        if(!isBuilderContextCE32(oldCE32)) {
684
1.16M
            utrie2_set32(trie, c, ce32, &errorCode);
685
1.16M
        } else {
686
14.8k
            ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
687
14.8k
            cond->builtCE32 = Collation::NO_CE32;
688
14.8k
            cond->ce32 = ce32;
689
14.8k
        }
690
1.17M
    } else {
691
179k
        ConditionalCE32 *cond;
692
179k
        if(!isBuilderContextCE32(oldCE32)) {
693
            // Replace the simple oldCE32 with a builder context CE32
694
            // pointing to a new ConditionalCE32 list head.
695
15.8k
            int32_t index = addConditionalCE32(UnicodeString((char16_t)0), oldCE32, errorCode);
696
15.8k
            if(U_FAILURE(errorCode)) { return; }
697
15.8k
            uint32_t contextCE32 = makeBuilderContextCE32(index);
698
15.8k
            utrie2_set32(trie, c, contextCE32, &errorCode);
699
15.8k
            contextChars.add(c);
700
15.8k
            cond = getConditionalCE32(index);
701
163k
        } else {
702
163k
            cond = getConditionalCE32ForCE32(oldCE32);
703
163k
            cond->builtCE32 = Collation::NO_CE32;
704
163k
        }
705
179k
        UnicodeString suffix(s, cLength);
706
179k
        UnicodeString context((char16_t)prefix.length());
707
179k
        context.append(prefix).append(suffix);
708
179k
        unsafeBackwardSet.addAll(suffix);
709
5.23M
        for(;;) {
710
            // invariant: context > cond->context
711
5.23M
            int32_t next = cond->next;
712
5.23M
            if(next < 0) {
713
                // Append a new ConditionalCE32 after cond.
714
36.0k
                int32_t index = addConditionalCE32(context, ce32, errorCode);
715
36.0k
                if(U_FAILURE(errorCode)) { return; }
716
36.0k
                cond->next = index;
717
36.0k
                break;
718
36.0k
            }
719
5.19M
            ConditionalCE32 *nextCond = getConditionalCE32(next);
720
5.19M
            int8_t cmp = context.compare(nextCond->context);
721
5.19M
            if(cmp < 0) {
722
                // Insert a new ConditionalCE32 between cond and nextCond.
723
103k
                int32_t index = addConditionalCE32(context, ce32, errorCode);
724
103k
                if(U_FAILURE(errorCode)) { return; }
725
103k
                cond->next = index;
726
103k
                getConditionalCE32(index)->next = next;
727
103k
                break;
728
5.09M
            } else if(cmp == 0) {
729
                // Same context as before, overwrite its ce32.
730
39.8k
                nextCond->ce32 = ce32;
731
39.8k
                break;
732
39.8k
            }
733
5.05M
            cond = nextCond;
734
5.05M
        }
735
179k
    }
736
1.35M
    modified = true;
737
1.35M
}
738
739
uint32_t
740
2.72M
CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
741
2.72M
    uint32_t p = (uint32_t)(ce >> 32);
742
2.72M
    uint32_t lower32 = (uint32_t)ce;
743
2.72M
    uint32_t t = (uint32_t)(ce & 0xffff);
744
2.72M
    U_ASSERT((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
745
2.72M
    if((ce & INT64_C(0xffff00ff00ff)) == 0) {
746
        // normal form ppppsstt
747
2.34M
        return p | (lower32 >> 16) | (t >> 8);
748
2.34M
    } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) {
749
        // long-primary form ppppppC1
750
191k
        return Collation::makeLongPrimaryCE32(p);
751
191k
    } else if(p == 0 && (t & 0xff) == 0) {
752
        // long-secondary form ssssttC2
753
6.62k
        return Collation::makeLongSecondaryCE32(lower32);
754
6.62k
    }
755
184k
    return Collation::NO_CE32;
756
2.72M
}
757
758
uint32_t
759
617k
CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
760
    // Try to encode one CE as one CE32.
761
617k
    uint32_t ce32 = encodeOneCEAsCE32(ce);
762
617k
    if(ce32 != Collation::NO_CE32) { return ce32; }
763
67.0k
    int32_t index = addCE(ce, errorCode);
764
67.0k
    if(U_FAILURE(errorCode)) { return 0; }
765
67.0k
    if(index > Collation::MAX_INDEX) {
766
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
767
0
        return 0;
768
0
    }
769
67.0k
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1);
770
67.0k
}
771
772
uint32_t
773
CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
774
1.33M
                                UErrorCode &errorCode) {
775
1.33M
    if(U_FAILURE(errorCode)) { return 0; }
776
1.33M
    if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
777
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
778
0
        return 0;
779
0
    }
780
1.33M
    if(trie == nullptr || utrie2_isFrozen(trie)) {
781
0
        errorCode = U_INVALID_STATE_ERROR;
782
0
        return 0;
783
0
    }
784
1.33M
    if(cesLength == 0) {
785
        // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
786
        // Do this here so that callers need not do it.
787
299
        return encodeOneCEAsCE32(0);
788
1.33M
    } else if(cesLength == 1) {
789
497k
        return encodeOneCE(ces[0], errorCode);
790
839k
    } else if(cesLength == 2 && !icu4xMode) {
791
        // Try to encode two CEs as one CE32.
792
        // Turn this off for ICU4X, because without the canonical closure
793
        // these are so rare that it doesn't make sense to spend a branch
794
        // on checking this tag when using the data.
795
564k
        int64_t ce0 = ces[0];
796
564k
        int64_t ce1 = ces[1];
797
564k
        uint32_t p0 = (uint32_t)(ce0 >> 32);
798
564k
        if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE &&
799
564k
                (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE &&
800
564k
                p0 != 0) {
801
            // Latin mini expansion
802
640
            return
803
640
                p0 |
804
640
                (((uint32_t)ce0 & 0xff00u) << 8) |
805
640
                (uint32_t)(ce1 >> 16) |
806
640
                Collation::SPECIAL_CE32_LOW_BYTE |
807
640
                Collation::LATIN_EXPANSION_TAG;
808
640
        }
809
564k
    }
810
    // Try to encode two or more CEs as CE32s.
811
839k
    int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
812
2.83M
    for(int32_t i = 0;; ++i) {
813
2.83M
        if(i == cesLength) {
814
722k
            return encodeExpansion32(newCE32s, cesLength, errorCode);
815
722k
        }
816
2.11M
        uint32_t ce32 = encodeOneCEAsCE32(ces[i]);
817
2.11M
        if(ce32 == Collation::NO_CE32) { break; }
818
1.99M
        newCE32s[i] = (int32_t)ce32;
819
1.99M
    }
820
117k
    return encodeExpansion(ces, cesLength, errorCode);
821
839k
}
822
823
uint32_t
824
125k
CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
825
125k
    if(U_FAILURE(errorCode)) { return 0; }
826
    // See if this sequence of CEs has already been stored.
827
125k
    int64_t first = ces[0];
828
125k
    int32_t ce64sMax = ce64s.size() - length;
829
91.4M
    for(int32_t i = 0; i <= ce64sMax; ++i) {
830
91.3M
        if(first == ce64s.elementAti(i)) {
831
22.0M
            if(i > Collation::MAX_INDEX) {
832
0
                errorCode = U_BUFFER_OVERFLOW_ERROR;
833
0
                return 0;
834
0
            }
835
45.3M
            for(int32_t j = 1;; ++j) {
836
45.3M
                if(j == length) {
837
62.2k
                    return Collation::makeCE32FromTagIndexAndLength(
838
62.2k
                            Collation::EXPANSION_TAG, i, length);
839
62.2k
                }
840
45.3M
                if(ce64s.elementAti(i + j) != ces[j]) { break; }
841
45.3M
            }
842
22.0M
        }
843
91.3M
    }
844
    // Store the new sequence.
845
63.3k
    int32_t i = ce64s.size();
846
63.3k
    if(i > Collation::MAX_INDEX) {
847
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
848
0
        return 0;
849
0
    }
850
291k
    for(int32_t j = 0; j < length; ++j) {
851
227k
        ce64s.addElement(ces[j], errorCode);
852
227k
    }
853
63.3k
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length);
854
63.3k
}
855
856
uint32_t
857
CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
858
755k
                                        UErrorCode &errorCode) {
859
755k
    if(U_FAILURE(errorCode)) { return 0; }
860
    // See if this sequence of CE32s has already been stored.
861
755k
    int32_t first = newCE32s[0];
862
755k
    int32_t ce32sMax = ce32s.size() - length;
863
2.71G
    for(int32_t i = 0; i <= ce32sMax; ++i) {
864
2.71G
        if(first == ce32s.elementAti(i)) {
865
85.4M
            if(i > Collation::MAX_INDEX) {
866
0
                errorCode = U_BUFFER_OVERFLOW_ERROR;
867
0
                return 0;
868
0
            }
869
99.4M
            for(int32_t j = 1;; ++j) {
870
99.4M
                if(j == length) {
871
131k
                    return Collation::makeCE32FromTagIndexAndLength(
872
131k
                            Collation::EXPANSION32_TAG, i, length);
873
131k
                }
874
99.3M
                if(ce32s.elementAti(i + j) != newCE32s[j]) { break; }
875
99.3M
            }
876
85.4M
        }
877
2.71G
    }
878
    // Store the new sequence.
879
623k
    int32_t i = ce32s.size();
880
623k
    if(i > Collation::MAX_INDEX) {
881
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
882
0
        return 0;
883
0
    }
884
2.19M
    for(int32_t j = 0; j < length; ++j) {
885
1.57M
        ce32s.addElement(newCE32s[j], errorCode);
886
1.57M
    }
887
623k
    return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length);
888
623k
}
889
890
uint32_t
891
CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
892
708k
                                       UErrorCode &errorCode) {
893
708k
    if(U_FAILURE(errorCode)) { return 0; }
894
708k
    if(!Collation::isSpecialCE32(ce32)) { return ce32; }
895
577k
    switch(Collation::tagFromCE32(ce32)) {
896
289k
    case Collation::LONG_PRIMARY_TAG:
897
293k
    case Collation::LONG_SECONDARY_TAG:
898
325k
    case Collation::LATIN_EXPANSION_TAG:
899
        // copy as is
900
325k
        break;
901
19.0k
    case Collation::EXPANSION32_TAG: {
902
19.0k
        const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
903
19.0k
        int32_t length = Collation::lengthFromCE32(ce32);
904
19.0k
        ce32 = encodeExpansion32(
905
19.0k
            reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
906
19.0k
        break;
907
293k
    }
908
3.98k
    case Collation::EXPANSION_TAG: {
909
3.98k
        const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
910
3.98k
        int32_t length = Collation::lengthFromCE32(ce32);
911
3.98k
        ce32 = encodeExpansion(baseCEs, length, errorCode);
912
3.98k
        break;
913
293k
    }
914
117
    case Collation::PREFIX_TAG: {
915
        // Flatten prefixes and nested suffixes (contractions)
916
        // into a linear list of ConditionalCE32.
917
117
        const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
918
117
        ce32 = CollationData::readCE32(p);  // Default if no prefix match.
919
117
        if(!withContext) {
920
0
            return copyFromBaseCE32(c, ce32, false, errorCode);
921
0
        }
922
117
        ConditionalCE32 head;
923
117
        UnicodeString context((char16_t)0);
924
117
        int32_t index;
925
117
        if(Collation::isContractionCE32(ce32)) {
926
0
            index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
927
117
        } else {
928
117
            ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
929
117
            head.next = index = addConditionalCE32(context, ce32, errorCode);
930
117
        }
931
117
        if(U_FAILURE(errorCode)) { return 0; }
932
117
        ConditionalCE32 *cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
933
117
        UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
934
351
        while(prefixes.next(errorCode)) {
935
234
            context = prefixes.getString();
936
234
            context.reverse();
937
234
            context.insert(0, (char16_t)context.length());
938
234
            ce32 = (uint32_t)prefixes.getValue();
939
234
            if(Collation::isContractionCE32(ce32)) {
940
0
                index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
941
234
            } else {
942
234
                ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
943
234
                cond->next = index = addConditionalCE32(context, ce32, errorCode);
944
234
            }
945
234
            if(U_FAILURE(errorCode)) { return 0; }
946
234
            cond = getConditionalCE32(index);
947
234
        }
948
117
        ce32 = makeBuilderContextCE32(head.next);
949
117
        contextChars.add(c);
950
117
        break;
951
117
    }
952
1.51k
    case Collation::CONTRACTION_TAG: {
953
1.51k
        if(!withContext) {
954
84
            const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
955
84
            ce32 = CollationData::readCE32(p);  // Default if no suffix match.
956
84
            return copyFromBaseCE32(c, ce32, false, errorCode);
957
84
        }
958
1.42k
        ConditionalCE32 head;
959
1.42k
        UnicodeString context((char16_t)0);
960
1.42k
        copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
961
1.42k
        ce32 = makeBuilderContextCE32(head.next);
962
1.42k
        contextChars.add(c);
963
1.42k
        break;
964
1.51k
    }
965
0
    case Collation::HANGUL_TAG:
966
0
        errorCode = U_UNSUPPORTED_ERROR;  // We forbid tailoring of Hangul syllables.
967
0
        break;
968
214k
    case Collation::OFFSET_TAG:
969
214k
        ce32 = getCE32FromOffsetCE32(true, c, ce32);
970
214k
        break;
971
13.1k
    case Collation::IMPLICIT_TAG:
972
13.1k
        ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
973
13.1k
        break;
974
0
    default:
975
0
        UPRV_UNREACHABLE_EXIT;  // require ce32 == base->getFinalCE32(ce32)
976
577k
    }
977
577k
    return ce32;
978
577k
}
979
980
int32_t
981
CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
982
1.42k
                                                   ConditionalCE32 *cond, UErrorCode &errorCode) {
983
1.42k
    if(U_FAILURE(errorCode)) { return 0; }
984
1.42k
    const char16_t *p = base->contexts + Collation::indexFromCE32(ce32);
985
1.42k
    int32_t index;
986
1.42k
    if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
987
        // No match on the single code point.
988
        // We are underneath a prefix, and the default mapping is just
989
        // a fallback to the mappings for a shorter prefix.
990
0
        U_ASSERT(context.length() > 1);
991
0
        index = -1;
992
1.42k
    } else {
993
1.42k
        ce32 = CollationData::readCE32(p);  // Default if no suffix match.
994
1.42k
        U_ASSERT(!Collation::isContractionCE32(ce32));
995
1.42k
        ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
996
1.42k
        cond->next = index = addConditionalCE32(context, ce32, errorCode);
997
1.42k
        if(U_FAILURE(errorCode)) { return 0; }
998
1.42k
        cond = getConditionalCE32(index);
999
1.42k
    }
1000
1001
1.42k
    int32_t suffixStart = context.length();
1002
1.42k
    UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
1003
19.4k
    while(suffixes.next(errorCode)) {
1004
17.9k
        context.append(suffixes.getString());
1005
17.9k
        ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), true, errorCode);
1006
17.9k
        cond->next = index = addConditionalCE32(context, ce32, errorCode);
1007
17.9k
        if(U_FAILURE(errorCode)) { return 0; }
1008
        // No need to update the unsafeBackwardSet because the tailoring set
1009
        // is already a copy of the base set.
1010
17.9k
        cond = getConditionalCE32(index);
1011
17.9k
        context.truncate(suffixStart);
1012
17.9k
    }
1013
1.42k
    U_ASSERT(index >= 0);
1014
1.42k
    return index;
1015
1.42k
}
1016
1017
class CopyHelper {
1018
public:
1019
    CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d,
1020
               const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode)
1021
            : src(s), dest(d), modifier(m),
1022
572
              errorCode(initialErrorCode) {}
1023
1024
172k
    UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) {
1025
172k
        ce32 = copyCE32(ce32);
1026
172k
        utrie2_setRange32(dest.trie, start, end, ce32, true, &errorCode);
1027
172k
        if(CollationDataBuilder::isBuilderContextCE32(ce32)) {
1028
7.89k
            dest.contextChars.add(start, end);
1029
7.89k
        }
1030
172k
        return U_SUCCESS(errorCode);
1031
172k
    }
1032
1033
269k
    uint32_t copyCE32(uint32_t ce32) {
1034
269k
        if(!Collation::isSpecialCE32(ce32)) {
1035
113k
            int64_t ce = modifier.modifyCE32(ce32);
1036
113k
            if(ce != Collation::NO_CE) {
1037
107k
                ce32 = dest.encodeOneCE(ce, errorCode);
1038
107k
            }
1039
155k
        } else {
1040
155k
            int32_t tag = Collation::tagFromCE32(ce32);
1041
155k
            if(tag == Collation::EXPANSION32_TAG) {
1042
120k
                const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer());
1043
120k
                srcCE32s += Collation::indexFromCE32(ce32);
1044
120k
                int32_t length = Collation::lengthFromCE32(ce32);
1045
                // Inspect the source CE32s. Just copy them if none are modified.
1046
                // Otherwise copy to modifiedCEs, with modifications.
1047
120k
                UBool isModified = false;
1048
527k
                for(int32_t i = 0; i < length; ++i) {
1049
406k
                    ce32 = srcCE32s[i];
1050
406k
                    int64_t ce;
1051
406k
                    if(Collation::isSpecialCE32(ce32) ||
1052
406k
                            (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) {
1053
279k
                        if(isModified) {
1054
106k
                            modifiedCEs[i] = Collation::ceFromCE32(ce32);
1055
106k
                        }
1056
279k
                    } else {
1057
126k
                        if(!isModified) {
1058
226k
                            for(int32_t j = 0; j < i; ++j) {
1059
120k
                                modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]);
1060
120k
                            }
1061
106k
                            isModified = true;
1062
106k
                        }
1063
126k
                        modifiedCEs[i] = ce;
1064
126k
                    }
1065
406k
                }
1066
120k
                if(isModified) {
1067
106k
                    ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
1068
106k
                } else {
1069
14.0k
                    ce32 = dest.encodeExpansion32(
1070
14.0k
                        reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode);
1071
14.0k
                }
1072
120k
            } else if(tag == Collation::EXPANSION_TAG) {
1073
17.8k
                const int64_t *srcCEs = src.ce64s.getBuffer();
1074
17.8k
                srcCEs += Collation::indexFromCE32(ce32);
1075
17.8k
                int32_t length = Collation::lengthFromCE32(ce32);
1076
                // Inspect the source CEs. Just copy them if none are modified.
1077
                // Otherwise copy to modifiedCEs, with modifications.
1078
17.8k
                UBool isModified = false;
1079
193k
                for(int32_t i = 0; i < length; ++i) {
1080
175k
                    int64_t srcCE = srcCEs[i];
1081
175k
                    int64_t ce = modifier.modifyCE(srcCE);
1082
175k
                    if(ce == Collation::NO_CE) {
1083
155k
                        if(isModified) {
1084
60.1k
                            modifiedCEs[i] = srcCE;
1085
60.1k
                        }
1086
155k
                    } else {
1087
20.0k
                        if(!isModified) {
1088
100k
                            for(int32_t j = 0; j < i; ++j) {
1089
86.9k
                                modifiedCEs[j] = srcCEs[j];
1090
86.9k
                            }
1091
13.4k
                            isModified = true;
1092
13.4k
                        }
1093
20.0k
                        modifiedCEs[i] = ce;
1094
20.0k
                    }
1095
175k
                }
1096
17.8k
                if(isModified) {
1097
13.4k
                    ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
1098
13.4k
                } else {
1099
4.38k
                    ce32 = dest.encodeExpansion(srcCEs, length, errorCode);
1100
4.38k
                }
1101
17.8k
            } else if(tag == Collation::BUILDER_DATA_TAG) {
1102
                // Copy the list of ConditionalCE32.
1103
7.89k
                ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32);
1104
7.89k
                U_ASSERT(!cond->hasContext());
1105
7.89k
                int32_t destIndex = dest.addConditionalCE32(
1106
7.89k
                        cond->context, copyCE32(cond->ce32), errorCode);
1107
7.89k
                ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex);
1108
96.6k
                while(cond->next >= 0) {
1109
88.7k
                    cond = src.getConditionalCE32(cond->next);
1110
88.7k
                    ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex);
1111
88.7k
                    destIndex = dest.addConditionalCE32(
1112
88.7k
                            cond->context, copyCE32(cond->ce32), errorCode);
1113
88.7k
                    int32_t suffixStart = cond->prefixLength() + 1;
1114
88.7k
                    dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart));
1115
88.7k
                    prevDestCond->next = destIndex;
1116
88.7k
                }
1117
8.90k
            } else {
1118
                // Just copy long CEs and Latin mini expansions (and other expected values) as is,
1119
                // assuming that the modifier would not modify them.
1120
8.90k
                U_ASSERT(tag == Collation::LONG_PRIMARY_TAG ||
1121
8.90k
                        tag == Collation::LONG_SECONDARY_TAG ||
1122
8.90k
                        tag == Collation::LATIN_EXPANSION_TAG ||
1123
8.90k
                        tag == Collation::HANGUL_TAG);
1124
8.90k
            }
1125
155k
        }
1126
269k
        return ce32;
1127
269k
    }
1128
1129
    const CollationDataBuilder &src;
1130
    CollationDataBuilder &dest;
1131
    const CollationDataBuilder::CEModifier &modifier;
1132
    int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH];
1133
    UErrorCode errorCode;
1134
};
1135
1136
U_CDECL_BEGIN
1137
1138
static UBool U_CALLCONV
1139
194k
enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1140
194k
    return
1141
194k
        value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 ||
1142
194k
        ((CopyHelper *)context)->copyRangeCE32(start, end, value);
1143
194k
}
1144
1145
U_CDECL_END
1146
1147
void
1148
CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
1149
572
                               UErrorCode &errorCode) {
1150
572
    if(U_FAILURE(errorCode)) { return; }
1151
572
    if(trie == nullptr || utrie2_isFrozen(trie)) {
1152
0
        errorCode = U_INVALID_STATE_ERROR;
1153
0
        return;
1154
0
    }
1155
572
    CopyHelper helper(src, *this, modifier, errorCode);
1156
572
    utrie2_enum(src.trie, nullptr, enumRangeForCopy, &helper);
1157
572
    errorCode = helper.errorCode;
1158
    // Update the contextChars and the unsafeBackwardSet while copying,
1159
    // in case a character had conditional mappings in the source builder
1160
    // and they were removed later.
1161
572
    modified |= src.modified;
1162
572
}
1163
1164
void
1165
588
CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) {
1166
588
    if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1167
572
    UnicodeSetIterator iter(set);
1168
689k
    while(iter.next() && !iter.isString()) {
1169
689k
        UChar32 c = iter.getCodepoint();
1170
689k
        uint32_t ce32 = utrie2_get32(trie, c);
1171
689k
        if(ce32 == Collation::FALLBACK_CE32) {
1172
674k
            ce32 = base->getFinalCE32(base->getCE32(c));
1173
674k
            ce32 = copyFromBaseCE32(c, ce32, true, errorCode);
1174
674k
            utrie2_set32(trie, c, ce32, &errorCode);
1175
674k
        }
1176
689k
    }
1177
572
    modified = true;
1178
572
}
1179
1180
void
1181
107
CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) {
1182
107
    if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1183
107
    UnicodeSetIterator iter(set);
1184
321
    while(iter.next() && !iter.isString()) {
1185
214
        UChar32 c = iter.getCodepoint();
1186
214
        uint32_t ce32 = utrie2_get32(trie, c);
1187
214
        if(ce32 == Collation::FALLBACK_CE32) {
1188
84
            ce32 = base->getFinalCE32(base->getCE32(c));
1189
84
            if(Collation::ce32HasContext(ce32)) {
1190
84
                ce32 = copyFromBaseCE32(c, ce32, false /* without context */, errorCode);
1191
84
                utrie2_set32(trie, c, ce32, &errorCode);
1192
84
            }
1193
130
        } else if(isBuilderContextCE32(ce32)) {
1194
4
            ce32 = getConditionalCE32ForCE32(ce32)->ce32;
1195
            // Simply abandon the list of ConditionalCE32.
1196
            // The caller will copy this builder in the end,
1197
            // eliminating unreachable data.
1198
4
            utrie2_set32(trie, c, ce32, &errorCode);
1199
4
            contextChars.remove(c);
1200
4
        }
1201
214
    }
1202
107
    modified = true;
1203
107
}
1204
1205
UBool
1206
572
CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
1207
572
    if(U_FAILURE(errorCode)) { return false; }
1208
561
    UBool anyJamoAssigned = base == nullptr;  // always set jamoCE32s in the base data
1209
561
    UBool needToCopyFromBase = false;
1210
38.1k
    for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
1211
37.5k
        UChar32 jamo = jamoCpFromIndex(j);
1212
37.5k
        UBool fromBase = false;
1213
37.5k
        uint32_t ce32 = utrie2_get32(trie, jamo);
1214
37.5k
        anyJamoAssigned |= Collation::isAssignedCE32(ce32);
1215
        // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1216
        // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
1217
37.5k
        if(ce32 == Collation::FALLBACK_CE32) {
1218
37.2k
            fromBase = true;
1219
37.2k
            ce32 = base->getCE32(jamo);
1220
37.2k
        }
1221
37.5k
        if(Collation::isSpecialCE32(ce32)) {
1222
28
            switch(Collation::tagFromCE32(ce32)) {
1223
11
            case Collation::LONG_PRIMARY_TAG:
1224
12
            case Collation::LONG_SECONDARY_TAG:
1225
12
            case Collation::LATIN_EXPANSION_TAG:
1226
                // Copy the ce32 as-is.
1227
12
                break;
1228
12
            case Collation::EXPANSION32_TAG:
1229
16
            case Collation::EXPANSION_TAG:
1230
16
            case Collation::PREFIX_TAG:
1231
16
            case Collation::CONTRACTION_TAG:
1232
16
                if(fromBase) {
1233
                    // Defer copying until we know if anyJamoAssigned.
1234
0
                    ce32 = Collation::FALLBACK_CE32;
1235
0
                    needToCopyFromBase = true;
1236
0
                }
1237
16
                break;
1238
0
            case Collation::IMPLICIT_TAG:
1239
                // An unassigned Jamo should only occur in tests with incomplete bases.
1240
0
                U_ASSERT(fromBase);
1241
0
                ce32 = Collation::FALLBACK_CE32;
1242
0
                needToCopyFromBase = true;
1243
0
                break;
1244
0
            case Collation::OFFSET_TAG:
1245
0
                ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
1246
0
                break;
1247
0
            case Collation::FALLBACK_TAG:
1248
0
            case Collation::RESERVED_TAG_3:
1249
0
            case Collation::BUILDER_DATA_TAG:
1250
0
            case Collation::DIGIT_TAG:
1251
0
            case Collation::U0000_TAG:
1252
0
            case Collation::HANGUL_TAG:
1253
0
            case Collation::LEAD_SURROGATE_TAG:
1254
0
                errorCode = U_INTERNAL_PROGRAM_ERROR;
1255
0
                return false;
1256
28
            }
1257
28
        }
1258
37.5k
        jamoCE32s[j] = ce32;
1259
37.5k
    }
1260
561
    if(anyJamoAssigned && needToCopyFromBase) {
1261
0
        for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {
1262
0
            if(jamoCE32s[j] == Collation::FALLBACK_CE32) {
1263
0
                UChar32 jamo = jamoCpFromIndex(j);
1264
0
                jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo),
1265
0
                                                /*withContext=*/ true, errorCode);
1266
0
            }
1267
0
        }
1268
0
    }
1269
561
    return anyJamoAssigned && U_SUCCESS(errorCode);
1270
561
}
1271
1272
void
1273
572
CollationDataBuilder::setDigitTags(UErrorCode &errorCode) {
1274
572
    UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode);
1275
572
    if(U_FAILURE(errorCode)) { return; }
1276
561
    UnicodeSetIterator iter(digits);
1277
382k
    while(iter.next()) {
1278
381k
        U_ASSERT(!iter.isString());
1279
381k
        UChar32 c = iter.getCodepoint();
1280
381k
        uint32_t ce32 = utrie2_get32(trie, c);
1281
381k
        if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) {
1282
8.83k
            int32_t index = addCE32(ce32, errorCode);
1283
8.83k
            if(U_FAILURE(errorCode)) { return; }
1284
8.83k
            if(index > Collation::MAX_INDEX) {
1285
0
                errorCode = U_BUFFER_OVERFLOW_ERROR;
1286
0
                return;
1287
0
            }
1288
8.83k
            ce32 = Collation::makeCE32FromTagIndexAndLength(
1289
8.83k
                    Collation::DIGIT_TAG, index, u_charDigitValue(c));
1290
8.83k
            utrie2_set32(trie, c, ce32, &errorCode);
1291
8.83k
        }
1292
381k
    }
1293
561
}
1294
1295
U_CDECL_BEGIN
1296
1297
static UBool U_CALLCONV
1298
586k
enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1299
586k
    int32_t *pValue = (int32_t *)context;
1300
586k
    if(value == Collation::UNASSIGNED_CE32) {
1301
0
        value = Collation::LEAD_ALL_UNASSIGNED;
1302
586k
    } else if(value == Collation::FALLBACK_CE32) {
1303
585k
        value = Collation::LEAD_ALL_FALLBACK;
1304
585k
    } else {
1305
836
        *pValue = Collation::LEAD_MIXED;
1306
836
        return false;
1307
836
    }
1308
585k
    if(*pValue < 0) {
1309
585k
        *pValue = (int32_t)value;
1310
585k
    } else if(*pValue != (int32_t)value) {
1311
0
        *pValue = Collation::LEAD_MIXED;
1312
0
        return false;
1313
0
    }
1314
585k
    return true;
1315
585k
}
1316
1317
U_CDECL_END
1318
1319
void
1320
572
CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) {
1321
586k
    for(char16_t lead = 0xd800; lead < 0xdc00; ++lead) {
1322
585k
        int32_t value = -1;
1323
585k
        utrie2_enumForLeadSurrogate(trie, lead, nullptr, enumRangeLeadValue, &value);
1324
585k
        utrie2_set32ForLeadSurrogateCodeUnit(
1325
585k
            trie, lead,
1326
585k
            Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value,
1327
585k
            &errorCode);
1328
585k
    }
1329
572
}
1330
1331
void
1332
572
CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
1333
572
    buildMappings(data, errorCode);
1334
572
    if(base != nullptr) {
1335
572
        data.numericPrimary = base->numericPrimary;
1336
572
        data.compressibleBytes = base->compressibleBytes;
1337
572
        data.numScripts = base->numScripts;
1338
572
        data.scriptsIndex = base->scriptsIndex;
1339
572
        data.scriptStarts = base->scriptStarts;
1340
572
        data.scriptStartsLength = base->scriptStartsLength;
1341
572
    }
1342
572
    buildFastLatinTable(data, errorCode);
1343
572
}
1344
1345
void
1346
572
CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) {
1347
572
    if(U_FAILURE(errorCode)) { return; }
1348
572
    if(trie == nullptr || utrie2_isFrozen(trie)) {
1349
0
        errorCode = U_INVALID_STATE_ERROR;
1350
0
        return;
1351
0
    }
1352
1353
572
    buildContexts(errorCode);
1354
1355
572
    uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
1356
572
    int32_t jamoIndex = -1;
1357
572
    if(getJamoCE32s(jamoCE32s, errorCode)) {
1358
29
        jamoIndex = ce32s.size();
1359
1.97k
        for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1360
1.94k
            ce32s.addElement((int32_t)jamoCE32s[i], errorCode);
1361
1.94k
        }
1362
        // Small optimization: Use a bit in the Hangul ce32
1363
        // to indicate that none of the Jamo CE32s are isSpecialCE32()
1364
        // (as it should be in the root collator).
1365
        // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1366
        // In order to still have good trie compression and keep this code simple,
1367
        // we only set this flag if a whole block of 588 Hangul syllables starting with
1368
        // a common leading consonant (Jamo L) has this property.
1369
29
        UBool isAnyJamoVTSpecial = false;
1370
1.17k
        for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1371
1.14k
            if(Collation::isSpecialCE32(jamoCE32s[i])) {
1372
6
                isAnyJamoVTSpecial = true;
1373
6
                break;
1374
6
            }
1375
1.14k
        }
1376
29
        uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
1377
29
        UChar32 c = Hangul::HANGUL_BASE;
1378
580
        for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
1379
551
            uint32_t ce32 = hangulCE32;
1380
551
            if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
1381
422
                ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
1382
422
            }
1383
551
            UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1384
551
            utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
1385
551
            c = limit;
1386
551
        }
1387
543
    } else {
1388
        // Copy the Hangul CE32s from the base in blocks per Jamo L,
1389
        // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1390
10.8k
        for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
1391
10.3k
            uint32_t ce32 = base->getCE32(c);
1392
10.3k
            U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG));
1393
10.3k
            UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1394
10.3k
            utrie2_setRange32(trie, c, limit - 1, ce32, true, &errorCode);
1395
10.3k
            c = limit;
1396
10.3k
        }
1397
543
    }
1398
1399
572
    setDigitTags(errorCode);
1400
572
    setLeadSurrogates(errorCode);
1401
1402
572
    if (!icu4xMode) {
1403
        // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1404
572
        ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
1405
572
        utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
1406
572
    }
1407
1408
572
    utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
1409
572
    if(U_FAILURE(errorCode)) { return; }
1410
1411
    // Mark each lead surrogate as "unsafe"
1412
    // if any of its 1024 associated supplementary code points is "unsafe".
1413
561
    UChar32 c = 0x10000;
1414
575k
    for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1415
574k
        if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
1416
12.4k
            unsafeBackwardSet.add(lead);
1417
12.4k
        }
1418
574k
    }
1419
561
    unsafeBackwardSet.freeze();
1420
1421
561
    data.trie = trie;
1422
561
    data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer());
1423
561
    data.ces = ce64s.getBuffer();
1424
561
    data.contexts = contexts.getBuffer();
1425
1426
561
    data.ce32sLength = ce32s.size();
1427
561
    data.cesLength = ce64s.size();
1428
561
    data.contextsLength = contexts.length();
1429
1430
561
    data.base = base;
1431
561
    if(jamoIndex >= 0) {
1432
29
        data.jamoCE32s = data.ce32s + jamoIndex;
1433
532
    } else {
1434
532
        data.jamoCE32s = base->jamoCE32s;
1435
532
    }
1436
561
    data.unsafeBackwardSet = &unsafeBackwardSet;
1437
561
}
1438
1439
void
1440
1.14k
CollationDataBuilder::clearContexts() {
1441
1.14k
    contexts.remove();
1442
    // Incrementing the contexts build "era" invalidates all of the builtCE32
1443
    // from before this clearContexts() call.
1444
    // Simpler than finding and resetting all of those fields.
1445
1.14k
    ++contextsEra;
1446
1.14k
}
1447
1448
void
1449
572
CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
1450
572
    if(U_FAILURE(errorCode)) { return; }
1451
    // Ignore abandoned lists and the cached builtCE32,
1452
    // and build all contexts from scratch.
1453
572
    clearContexts();
1454
572
    UnicodeSetIterator iter(contextChars);
1455
8.94k
    while(U_SUCCESS(errorCode) && iter.next()) {
1456
8.36k
        U_ASSERT(!iter.isString());
1457
8.36k
        UChar32 c = iter.getCodepoint();
1458
8.36k
        uint32_t ce32 = utrie2_get32(trie, c);
1459
8.36k
        if(!isBuilderContextCE32(ce32)) {
1460
            // Impossible: No context data for c in contextChars.
1461
0
            errorCode = U_INTERNAL_PROGRAM_ERROR;
1462
0
            return;
1463
0
        }
1464
8.36k
        ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32);
1465
8.36k
        ce32 = buildContext(cond, errorCode);
1466
8.36k
        utrie2_set32(trie, c, ce32, &errorCode);
1467
8.36k
    }
1468
572
}
1469
1470
uint32_t
1471
197k
CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
1472
197k
    if(U_FAILURE(errorCode)) { return 0; }
1473
    // The list head must have no context.
1474
197k
    U_ASSERT(!head->hasContext());
1475
    // The list head must be followed by one or more nodes that all do have context.
1476
197k
    U_ASSERT(head->next >= 0);
1477
197k
    UCharsTrieBuilder prefixBuilder(errorCode);
1478
197k
    UCharsTrieBuilder contractionBuilder(errorCode);
1479
    // This outer loop goes from each prefix to the next.
1480
    // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
1481
    // If there are multiple suffixes for the same prefix,
1482
    // then an inner loop builds a contraction trie for them.
1483
599k
    for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
1484
599k
        if(U_FAILURE(errorCode)) { return 0; }  // early out for memory allocation errors
1485
        // After the list head, the prefix or suffix can be empty, but not both.
1486
599k
        U_ASSERT(cond == head || cond->hasContext());
1487
599k
        int32_t prefixLength = cond->prefixLength();
1488
599k
        UnicodeString prefix(cond->context, 0, prefixLength + 1);
1489
        // Collect all contraction suffixes for one prefix.
1490
599k
        ConditionalCE32 *firstCond = cond;
1491
599k
        ConditionalCE32 *lastCond;
1492
11.8M
        do {
1493
11.8M
            lastCond = cond;
1494
            // Clear the defaultCE32 fields as we go.
1495
            // They are left over from building a previous version of this list of contexts.
1496
            //
1497
            // One of the code paths below may copy a preceding defaultCE32
1498
            // into its emptySuffixCE32.
1499
            // If a new suffix has been inserted before what used to be
1500
            // the firstCond for its prefix, then that previous firstCond could still
1501
            // contain an outdated defaultCE32 from an earlier buildContext() and
1502
            // result in an incorrect emptySuffixCE32.
1503
            // So we reset all defaultCE32 before reading and setting new values.
1504
11.8M
            cond->defaultCE32 = Collation::NO_CE32;
1505
11.8M
        } while(cond->next >= 0 &&
1506
11.8M
                (cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
1507
599k
        uint32_t ce32;
1508
599k
        int32_t suffixStart = prefixLength + 1;  // == prefix.length()
1509
599k
        if(lastCond->context.length() == suffixStart) {
1510
            // One prefix without contraction suffix.
1511
122k
            U_ASSERT(firstCond == lastCond);
1512
122k
            ce32 = lastCond->ce32;
1513
122k
            cond = lastCond;
1514
476k
        } else {
1515
            // Build the contractions trie.
1516
476k
            contractionBuilder.clear();
1517
            // Entry for an empty suffix, to be stored before the trie.
1518
476k
            uint32_t emptySuffixCE32 = 0;
1519
476k
            uint32_t flags = 0;
1520
476k
            if(firstCond->context.length() == suffixStart) {
1521
                // There is a mapping for the prefix and the single character c. (p|c)
1522
                // If no other suffix matches, then we return this value.
1523
179k
                emptySuffixCE32 = firstCond->ce32;
1524
179k
                cond = getConditionalCE32(firstCond->next);
1525
297k
            } else {
1526
                // There is no mapping for the prefix and just the single character.
1527
                // (There is no p|c, only p|cd, p|ce etc.)
1528
297k
                flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
1529
                // When the prefix matches but none of the prefix-specific suffixes,
1530
                // then we fall back to the mappings with the next-longest prefix,
1531
                // and ultimately to mappings with no prefix.
1532
                // Each fallback might be another set of contractions.
1533
                // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1534
                // then in text "pch" we find the ch contraction.
1535
18.6M
                for(cond = head;; cond = getConditionalCE32(cond->next)) {
1536
18.6M
                    int32_t length = cond->prefixLength();
1537
18.6M
                    if(length == prefixLength) { break; }
1538
18.3M
                    if(cond->defaultCE32 != Collation::NO_CE32 &&
1539
18.3M
                            (length==0 || prefix.endsWith(cond->context, 1, length))) {
1540
378k
                        emptySuffixCE32 = cond->defaultCE32;
1541
378k
                    }
1542
18.3M
                }
1543
297k
                cond = firstCond;
1544
297k
            }
1545
            // Optimization: Set a flag when
1546
            // the first character of every contraction suffix has lccc!=0.
1547
            // Short-circuits contraction matching when a normal letter follows.
1548
476k
            flags |= Collation::CONTRACT_NEXT_CCC;
1549
            // Add all of the non-empty suffixes into the contraction trie.
1550
11.5M
            for(;;) {
1551
11.5M
                UnicodeString suffix(cond->context, suffixStart);
1552
11.5M
                uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
1553
11.5M
                if(fcd16 <= 0xff) {
1554
10.7M
                    flags &= ~Collation::CONTRACT_NEXT_CCC;
1555
10.7M
                }
1556
11.5M
                fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
1557
11.5M
                if(fcd16 > 0xff) {
1558
                    // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1559
977k
                    flags |= Collation::CONTRACT_TRAILING_CCC;
1560
977k
                }
1561
11.5M
                if (icu4xMode && (flags & Collation::CONTRACT_HAS_STARTER) == 0) {
1562
0
                    for (int32_t i = 0; i < suffix.length();) {
1563
0
                        UChar32 c = suffix.char32At(i);
1564
0
                            if (!u_getCombiningClass(c)) {
1565
0
                                flags |= Collation::CONTRACT_HAS_STARTER;
1566
0
                                break;
1567
0
                            }
1568
0
                        if (c > 0xFFFF) {
1569
0
                            i += 2;
1570
0
                        } else {
1571
0
                            ++i;
1572
0
                        }
1573
0
                    }
1574
0
                }
1575
11.5M
                contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
1576
11.5M
                if(cond == lastCond) { break; }
1577
11.0M
                cond = getConditionalCE32(cond->next);
1578
11.0M
            }
1579
476k
            int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
1580
476k
            if(U_FAILURE(errorCode)) { return 0; }
1581
476k
            if(index > Collation::MAX_INDEX) {
1582
478
                errorCode = U_BUFFER_OVERFLOW_ERROR;
1583
478
                return 0;
1584
478
            }
1585
475k
            ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
1586
475k
        }
1587
598k
        U_ASSERT(cond == lastCond);
1588
598k
        firstCond->defaultCE32 = ce32;
1589
598k
        if(prefixLength == 0) {
1590
196k
            if(cond->next < 0) {
1591
                // No non-empty prefixes, only contractions.
1592
167k
                return ce32;
1593
167k
            }
1594
401k
        } else {
1595
401k
            prefix.remove(0, 1);  // Remove the length unit.
1596
401k
            prefix.reverse();
1597
401k
            prefixBuilder.add(prefix, (int32_t)ce32, errorCode);
1598
401k
            if(cond->next < 0) { break; }
1599
401k
        }
1600
598k
    }
1601
29.0k
    U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
1602
29.0k
    int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
1603
29.0k
    if(U_FAILURE(errorCode)) { return 0; }
1604
29.0k
    if(index > Collation::MAX_INDEX) {
1605
92
        errorCode = U_BUFFER_OVERFLOW_ERROR;
1606
92
        return 0;
1607
92
    }
1608
28.9k
    return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
1609
29.0k
}
1610
1611
int32_t
1612
CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
1613
505k
                                     UErrorCode &errorCode) {
1614
505k
    UnicodeString context;
1615
505k
    context.append((char16_t)(defaultCE32 >> 16)).append((char16_t)defaultCE32);
1616
505k
    UnicodeString trieString;
1617
505k
    context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode));
1618
505k
    if(U_FAILURE(errorCode)) { return -1; }
1619
505k
    int32_t index = contexts.indexOf(context);
1620
505k
    if(index < 0) {
1621
213k
        index = contexts.length();
1622
213k
        contexts.append(context);
1623
213k
    }
1624
505k
    return index;
1625
505k
}
1626
1627
void
1628
572
CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) {
1629
572
    if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; }
1630
1631
561
    delete fastLatinBuilder;
1632
561
    fastLatinBuilder = new CollationFastLatinBuilder(errorCode);
1633
561
    if(fastLatinBuilder == nullptr) {
1634
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
1635
0
        return;
1636
0
    }
1637
561
    if(fastLatinBuilder->forData(data, errorCode)) {
1638
521
        const uint16_t *table = fastLatinBuilder->getTable();
1639
521
        int32_t length = fastLatinBuilder->lengthOfTable();
1640
521
        if(base != nullptr && length == base->fastLatinTableLength &&
1641
521
                uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) {
1642
            // Same fast Latin table as in the base, use that one instead.
1643
159
            delete fastLatinBuilder;
1644
159
            fastLatinBuilder = nullptr;
1645
159
            table = base->fastLatinTable;
1646
159
        }
1647
521
        data.fastLatinTable = table;
1648
521
        data.fastLatinTableLength = length;
1649
521
    } else {
1650
40
        delete fastLatinBuilder;
1651
40
        fastLatinBuilder = nullptr;
1652
40
    }
1653
561
}
1654
1655
int32_t
1656
1.49M
CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) {
1657
1.49M
    return getCEs(s, 0, ces, cesLength);
1658
1.49M
}
1659
1660
int32_t
1661
CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s,
1662
4.01M
                             int64_t ces[], int32_t cesLength) {
1663
4.01M
    int32_t prefixLength = prefix.length();
1664
4.01M
    if(prefixLength == 0) {
1665
3.98M
        return getCEs(s, 0, ces, cesLength);
1666
3.98M
    } else {
1667
33.2k
        return getCEs(prefix + s, prefixLength, ces, cesLength);
1668
33.2k
    }
1669
4.01M
}
1670
1671
int32_t
1672
CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start,
1673
5.51M
                             int64_t ces[], int32_t cesLength) {
1674
5.51M
    if(collIter == nullptr) {
1675
1.22k
        collIter = new DataBuilderCollationIterator(*this);
1676
1.22k
        if(collIter == nullptr) { return 0; }
1677
1.22k
    }
1678
5.51M
    return collIter->fetchCEs(s, start, ces, cesLength);
1679
5.51M
}
1680
1681
U_NAMESPACE_END
1682
1683
#endif  // !UCONFIG_NO_COLLATION