Coverage Report

Created: 2025-10-24 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/collationbuilder.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationbuilder.cpp
9
*
10
* (replaced the former ucol_bld.cpp)
11
*
12
* created on: 2013may06
13
* created by: Markus W. Scherer
14
*/
15
16
#ifdef DEBUG_COLLATION_BUILDER
17
#include <stdio.h>
18
#endif
19
20
#include "unicode/utypes.h"
21
22
#if !UCONFIG_NO_COLLATION
23
24
#include "unicode/caniter.h"
25
#include "unicode/normalizer2.h"
26
#include "unicode/tblcoll.h"
27
#include "unicode/parseerr.h"
28
#include "unicode/uchar.h"
29
#include "unicode/ucol.h"
30
#include "unicode/unistr.h"
31
#include "unicode/usetiter.h"
32
#include "unicode/utf16.h"
33
#include "unicode/uversion.h"
34
#include "cmemory.h"
35
#include "collation.h"
36
#include "collationbuilder.h"
37
#include "collationdata.h"
38
#include "collationdatabuilder.h"
39
#include "collationfastlatin.h"
40
#include "collationroot.h"
41
#include "collationrootelements.h"
42
#include "collationruleparser.h"
43
#include "collationsettings.h"
44
#include "collationtailoring.h"
45
#include "collationweights.h"
46
#include "normalizer2impl.h"
47
#include "uassert.h"
48
#include "ucol_imp.h"
49
#include "utf16collationiterator.h"
50
51
U_NAMESPACE_BEGIN
52
53
namespace {
54
55
class BundleImporter : public CollationRuleParser::Importer {
56
public:
57
7.67k
    BundleImporter() {}
58
    virtual ~BundleImporter();
59
    virtual void getRules(
60
            const char *localeID, const char *collationType,
61
            UnicodeString &rules,
62
            const char *&errorReason, UErrorCode &errorCode) override;
63
};
64
65
BundleImporter::~BundleImporter() {}
66
67
void
68
BundleImporter::getRules(
69
        const char *localeID, const char *collationType,
70
        UnicodeString &rules,
71
5.36k
        const char *& /*errorReason*/, UErrorCode &errorCode) {
72
5.36k
    CollationLoader::loadRules(localeID, collationType, rules, errorCode);
73
5.36k
}
74
75
}  // namespace
76
77
// RuleBasedCollator implementation ---------------------------------------- ***
78
79
// These methods are here, rather than in rulebasedcollator.cpp,
80
// for modularization:
81
// Most code using Collator does not need to build a Collator from rules.
82
// By moving these constructors and helper methods to a separate file,
83
// most code will not have a static dependency on the builder code.
84
85
RuleBasedCollator::RuleBasedCollator()
86
0
        : data(nullptr),
87
0
          settings(nullptr),
88
0
          tailoring(nullptr),
89
0
          cacheEntry(nullptr),
90
0
          validLocale(""),
91
0
          explicitlySetAttributes(0),
92
0
          actualLocaleIsSameAsValid(false) {
93
0
}
94
95
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
96
7.67k
        : data(nullptr),
97
7.67k
          settings(nullptr),
98
7.67k
          tailoring(nullptr),
99
7.67k
          cacheEntry(nullptr),
100
7.67k
          validLocale(""),
101
7.67k
          explicitlySetAttributes(0),
102
7.67k
          actualLocaleIsSameAsValid(false) {
103
7.67k
    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, nullptr, nullptr, errorCode);
104
7.67k
}
105
106
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
107
                                     UErrorCode &errorCode)
108
0
        : data(nullptr),
109
0
          settings(nullptr),
110
0
          tailoring(nullptr),
111
0
          cacheEntry(nullptr),
112
0
          validLocale(""),
113
0
          explicitlySetAttributes(0),
114
0
          actualLocaleIsSameAsValid(false) {
115
0
    internalBuildTailoring(rules, strength, UCOL_DEFAULT, nullptr, nullptr, errorCode);
116
0
}
117
118
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
119
                                     UColAttributeValue decompositionMode,
120
                                     UErrorCode &errorCode)
121
0
        : data(nullptr),
122
0
          settings(nullptr),
123
0
          tailoring(nullptr),
124
0
          cacheEntry(nullptr),
125
0
          validLocale(""),
126
0
          explicitlySetAttributes(0),
127
0
          actualLocaleIsSameAsValid(false) {
128
0
    internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, nullptr, nullptr, errorCode);
129
0
}
130
131
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
132
                                     ECollationStrength strength,
133
                                     UColAttributeValue decompositionMode,
134
                                     UErrorCode &errorCode)
135
0
        : data(nullptr),
136
0
          settings(nullptr),
137
0
          tailoring(nullptr),
138
0
          cacheEntry(nullptr),
139
0
          validLocale(""),
140
0
          explicitlySetAttributes(0),
141
0
          actualLocaleIsSameAsValid(false) {
142
0
    internalBuildTailoring(rules, strength, decompositionMode, nullptr, nullptr, errorCode);
143
0
}
144
145
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
146
                                     UParseError &parseError, UnicodeString &reason,
147
                                     UErrorCode &errorCode)
148
0
        : data(nullptr),
149
0
          settings(nullptr),
150
0
          tailoring(nullptr),
151
0
          cacheEntry(nullptr),
152
0
          validLocale(""),
153
0
          explicitlySetAttributes(0),
154
0
          actualLocaleIsSameAsValid(false) {
155
0
    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);
156
0
}
157
158
void
159
RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
160
                                          int32_t strength,
161
                                          UColAttributeValue decompositionMode,
162
                                          UParseError *outParseError, UnicodeString *outReason,
163
7.67k
                                          UErrorCode &errorCode) {
164
7.67k
    const CollationTailoring *base = CollationRoot::getRoot(errorCode);
165
7.67k
    if(U_FAILURE(errorCode)) { return; }
166
7.67k
    if(outReason != nullptr) { outReason->remove(); }
167
7.67k
    CollationBuilder builder(base, errorCode);
168
7.67k
    UVersionInfo noVersion = { 0, 0, 0, 0 };
169
7.67k
    BundleImporter importer;
170
7.67k
    LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,
171
7.67k
                                                             &importer,
172
7.67k
                                                             outParseError, errorCode));
173
7.67k
    if(U_FAILURE(errorCode)) {
174
3.61k
        const char *reason = builder.getErrorReason();
175
3.61k
        if(reason != nullptr && outReason != nullptr) {
176
0
            *outReason = UnicodeString(reason, -1, US_INV);
177
0
        }
178
3.61k
        return;
179
3.61k
    }
180
4.06k
    t->actualLocale.setToBogus();
181
4.06k
    adoptTailoring(t.orphan(), errorCode);
182
    // Set attributes after building the collator,
183
    // to keep the default settings consistent with the rule string.
184
4.06k
    if(strength != UCOL_DEFAULT) {
185
0
        setAttribute(UCOL_STRENGTH, static_cast<UColAttributeValue>(strength), errorCode);
186
0
    }
187
4.06k
    if(decompositionMode != UCOL_DEFAULT) {
188
0
        setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
189
0
    }
190
4.06k
}
191
192
// CollationBuilder implementation ----------------------------------------- ***
193
194
CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode)
195
7.67k
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
196
7.67k
          fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
197
7.67k
          nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
198
7.67k
          base(b),
199
7.67k
          baseData(b->data),
200
7.67k
          rootElements(b->data->rootElements, b->data->rootElementsLength),
201
7.67k
          variableTop(0),
202
7.67k
          dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)), fastLatinEnabled(true),
203
7.67k
          icu4xMode(icu4xMode),
204
7.67k
          errorReason(nullptr),
205
7.67k
          cesLength(0),
206
7.67k
          rootPrimaryIndexes(errorCode), nodes(errorCode) {
207
7.67k
    nfcImpl.ensureCanonIterData(errorCode);
208
7.67k
    if(U_FAILURE(errorCode)) {
209
0
        errorReason = "CollationBuilder fields initialization failed";
210
0
        return;
211
0
    }
212
7.67k
    if(dataBuilder == nullptr) {
213
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
214
0
        return;
215
0
    }
216
7.67k
    dataBuilder->initForTailoring(baseData, errorCode);
217
7.67k
    if(U_FAILURE(errorCode)) {
218
0
        errorReason = "CollationBuilder initialization failed";
219
0
    }
220
7.67k
}
221
222
CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
223
7.67k
  : CollationBuilder(b, false, errorCode)
224
7.67k
{}
225
226
7.67k
CollationBuilder::~CollationBuilder() {
227
7.67k
    delete dataBuilder;
228
7.67k
}
229
230
CollationTailoring *
231
CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
232
                                const UVersionInfo rulesVersion,
233
                                CollationRuleParser::Importer *importer,
234
                                UParseError *outParseError,
235
7.67k
                                UErrorCode &errorCode) {
236
7.67k
    if(U_FAILURE(errorCode)) { return nullptr; }
237
7.67k
    if(baseData->rootElements == nullptr) {
238
0
        errorCode = U_MISSING_RESOURCE_ERROR;
239
0
        errorReason = "missing root elements data, tailoring not supported";
240
0
        return nullptr;
241
0
    }
242
7.67k
    LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));
243
7.67k
    if(tailoring.isNull() || tailoring->isBogus()) {
244
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
245
0
        return nullptr;
246
0
    }
247
7.67k
    CollationRuleParser parser(baseData, errorCode);
248
7.67k
    if(U_FAILURE(errorCode)) { return nullptr; }
249
    // Note: This always bases &[last variable] and &[first regular]
250
    // on the root collator's maxVariable/variableTop.
251
    // If we wanted this to change after [maxVariable x], then we would keep
252
    // the tailoring.settings pointer here and read its variableTop when we need it.
253
    // See http://unicode.org/cldr/trac/ticket/6070
254
7.67k
    variableTop = base->settings->variableTop;
255
7.67k
    parser.setSink(this);
256
7.67k
    parser.setImporter(importer);
257
7.67k
    CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
258
7.67k
    parser.parse(ruleString, ownedSettings, outParseError, errorCode);
259
7.67k
    errorReason = parser.getErrorReason();
260
7.67k
    if(U_FAILURE(errorCode)) { return nullptr; }
261
4.09k
    if(dataBuilder->hasMappings()) {
262
3.86k
        makeTailoredCEs(errorCode);
263
3.86k
        if (!icu4xMode) {
264
3.86k
            closeOverComposites(errorCode);
265
3.86k
        }
266
3.86k
        finalizeCEs(errorCode);
267
3.86k
        if (!icu4xMode) {
268
            // Copy all of ASCII, and Latin-1 letters, into each tailoring.
269
3.86k
            optimizeSet.add(0, 0x7f);
270
3.86k
            optimizeSet.add(0xc0, 0xff);
271
            // Hangul is decomposed on the fly during collation,
272
            // and the tailoring data is always built with HANGUL_TAG specials.
273
3.86k
            optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
274
3.86k
            dataBuilder->optimize(optimizeSet, errorCode);
275
3.86k
        }
276
3.86k
        tailoring->ensureOwnedData(errorCode);
277
3.86k
        if(U_FAILURE(errorCode)) { return nullptr; }
278
3.82k
        if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
279
3.82k
        dataBuilder->build(*tailoring->ownedData, errorCode);
280
3.82k
        tailoring->builder = dataBuilder;
281
3.82k
        dataBuilder = nullptr;
282
3.82k
    } else {
283
234
        tailoring->data = baseData;
284
234
    }
285
4.06k
    if(U_FAILURE(errorCode)) { return nullptr; }
286
4.06k
    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
287
4.06k
        tailoring->data, ownedSettings,
288
4.06k
        ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
289
4.06k
    tailoring->rules = ruleString;
290
4.06k
    tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
291
4.06k
    tailoring->setVersion(base->version, rulesVersion);
292
4.06k
    return tailoring.orphan();
293
4.06k
}
294
295
void
296
CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
297
340k
                           const char *&parserErrorReason, UErrorCode &errorCode) {
298
340k
    if(U_FAILURE(errorCode)) { return; }
299
340k
    U_ASSERT(!str.isEmpty());
300
340k
    if(str.charAt(0) == CollationRuleParser::POS_LEAD) {
301
1.04k
        ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
302
1.04k
        cesLength = 1;
303
1.04k
        if(U_FAILURE(errorCode)) { return; }
304
1.04k
        U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
305
339k
    } else {
306
        // normal reset to a character or string
307
339k
        UnicodeString nfdString = nfd.normalize(str, errorCode);
308
339k
        if(U_FAILURE(errorCode)) {
309
0
            parserErrorReason = "normalizing the reset position";
310
0
            return;
311
0
        }
312
339k
        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
313
339k
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
314
63
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
315
63
            parserErrorReason = "reset position maps to too many collation elements (more than 31)";
316
63
            return;
317
63
        }
318
339k
    }
319
340k
    if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position
320
321
    // &[before strength]position
322
6.93k
    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
323
6.93k
    int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
324
6.93k
    if(U_FAILURE(errorCode)) { return; }
325
326
6.93k
    int64_t node = nodes.elementAti(index);
327
    // If the index is for a "weaker" node,
328
    // then skip backwards over this and further "weaker" nodes.
329
50.6k
    while(strengthFromNode(node) > strength) {
330
43.6k
        index = previousIndexFromNode(node);
331
43.6k
        node = nodes.elementAti(index);
332
43.6k
    }
333
334
    // Find or insert a node whose index we will put into a temporary CE.
335
6.93k
    if(strengthFromNode(node) == strength && isTailoredNode(node)) {
336
        // Reset to just before this same-strength tailored node.
337
227
        index = previousIndexFromNode(node);
338
6.71k
    } else if(strength == UCOL_PRIMARY) {
339
        // root primary node (has no previous index)
340
4.86k
        uint32_t p = weight32FromNode(node);
341
4.86k
        if(p == 0) {
342
2
            errorCode = U_UNSUPPORTED_ERROR;
343
2
            parserErrorReason = "reset primary-before ignorable not possible";
344
2
            return;
345
2
        }
346
4.86k
        if(p <= rootElements.getFirstPrimary()) {
347
            // There is no primary gap between ignorables and the space-first-primary.
348
0
            errorCode = U_UNSUPPORTED_ERROR;
349
0
            parserErrorReason = "reset primary-before first non-ignorable not supported";
350
0
            return;
351
0
        }
352
4.86k
        if(p == Collation::FIRST_TRAILING_PRIMARY) {
353
            // We do not support tailoring to an unassigned-implicit CE.
354
0
            errorCode = U_UNSUPPORTED_ERROR;
355
0
            parserErrorReason = "reset primary-before [first trailing] not supported";
356
0
            return;
357
0
        }
358
4.86k
        p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
359
4.86k
        index = findOrInsertNodeForPrimary(p, errorCode);
360
        // Go to the last node in this list:
361
        // Tailor after the last node between adjacent root nodes.
362
34.7k
        for(;;) {
363
34.7k
            node = nodes.elementAti(index);
364
34.7k
            int32_t nextIndex = nextIndexFromNode(node);
365
34.7k
            if(nextIndex == 0) { break; }
366
29.8k
            index = nextIndex;
367
29.8k
        }
368
4.86k
    } else {
369
        // &[before 2] or &[before 3]
370
1.84k
        index = findCommonNode(index, UCOL_SECONDARY);
371
1.84k
        if(strength >= UCOL_TERTIARY) {
372
275
            index = findCommonNode(index, UCOL_TERTIARY);
373
275
        }
374
        // findCommonNode() stayed on the stronger node or moved to
375
        // an explicit common-weight node of the reset-before strength.
376
1.84k
        node = nodes.elementAti(index);
377
1.84k
        if(strengthFromNode(node) == strength) {
378
            // Found a same-strength node with an explicit weight.
379
1.10k
            uint32_t weight16 = weight16FromNode(node);
380
1.10k
            if(weight16 == 0) {
381
0
                errorCode = U_UNSUPPORTED_ERROR;
382
0
                if(strength == UCOL_SECONDARY) {
383
0
                    parserErrorReason = "reset secondary-before secondary ignorable not possible";
384
0
                } else {
385
0
                    parserErrorReason = "reset tertiary-before completely ignorable not possible";
386
0
                }
387
0
                return;
388
0
            }
389
1.10k
            U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
390
            // Reset to just before this node.
391
            // Insert the preceding same-level explicit weight if it is not there already.
392
            // Which explicit weight immediately precedes this one?
393
1.10k
            weight16 = getWeight16Before(index, node, strength);
394
            // Does this preceding weight have a node?
395
1.10k
            uint32_t previousWeight16;
396
1.10k
            int32_t previousIndex = previousIndexFromNode(node);
397
7.66k
            for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
398
7.66k
                node = nodes.elementAti(i);
399
7.66k
                int32_t previousStrength = strengthFromNode(node);
400
7.66k
                if(previousStrength < strength) {
401
25
                    U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
402
                    // Either the reset element has an above-common weight and
403
                    // the parent node provides the implied common weight,
404
                    // or the reset element has a weight<=common in the node
405
                    // right after the parent, and we need to insert the preceding weight.
406
25
                    previousWeight16 = Collation::COMMON_WEIGHT16;
407
25
                    break;
408
7.64k
                } else if(previousStrength == strength && !isTailoredNode(node)) {
409
1.07k
                    previousWeight16 = weight16FromNode(node);
410
1.07k
                    break;
411
1.07k
                }
412
                // Skip weaker nodes and same-level tailored nodes.
413
7.66k
            }
414
1.10k
            if(previousWeight16 == weight16) {
415
                // The preceding weight has a node,
416
                // maybe with following weaker or tailored nodes.
417
                // Reset to the last of them.
418
1.07k
                index = previousIndex;
419
1.07k
            } else {
420
                // Insert a node with the preceding weight, reset to that.
421
25
                node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
422
25
                index = insertNodeBetween(previousIndex, index, node, errorCode);
423
25
            }
424
1.10k
        } else {
425
            // Found a stronger node with implied strength-common weight.
426
742
            uint32_t weight16 = getWeight16Before(index, node, strength);
427
742
            index = findOrInsertWeakNode(index, weight16, strength, errorCode);
428
742
        }
429
        // Strength of the temporary CE = strength of its reset position.
430
        // Code above raises an error if the before-strength is stronger.
431
1.84k
        strength = ceStrength(ces[cesLength - 1]);
432
1.84k
    }
433
6.93k
    if(U_FAILURE(errorCode)) {
434
0
        parserErrorReason = "inserting reset position for &[before n]";
435
0
        return;
436
0
    }
437
6.93k
    ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
438
6.93k
}
439
440
uint32_t
441
1.84k
CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
442
1.84k
    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));
443
    // Collect the root CE weights if this node is for a root CE.
444
    // If it is not, then return the low non-primary boundary for a tailored CE.
445
1.84k
    uint32_t t;
446
1.84k
    if(strengthFromNode(node) == UCOL_TERTIARY) {
447
55
        t = weight16FromNode(node);
448
1.79k
    } else {
449
1.79k
        t = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
450
1.79k
    }
451
2.01k
    while(strengthFromNode(node) > UCOL_SECONDARY) {
452
170
        index = previousIndexFromNode(node);
453
170
        node = nodes.elementAti(index);
454
170
    }
455
1.84k
    if(isTailoredNode(node)) {
456
324
        return Collation::BEFORE_WEIGHT16;
457
324
    }
458
1.52k
    uint32_t s;
459
1.52k
    if(strengthFromNode(node) == UCOL_SECONDARY) {
460
1.05k
        s = weight16FromNode(node);
461
1.05k
    } else {
462
463
        s = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
463
463
    }
464
10.1k
    while(strengthFromNode(node) > UCOL_PRIMARY) {
465
8.65k
        index = previousIndexFromNode(node);
466
8.65k
        node = nodes.elementAti(index);
467
8.65k
    }
468
1.52k
    if(isTailoredNode(node)) {
469
445
        return Collation::BEFORE_WEIGHT16;
470
445
    }
471
    // [p, s, t] is a root CE. Return the preceding weight for the requested level.
472
1.07k
    uint32_t p = weight32FromNode(node);
473
1.07k
    uint32_t weight16;
474
1.07k
    if(level == UCOL_SECONDARY) {
475
802
        weight16 = rootElements.getSecondaryBefore(p, s);
476
802
    } else {
477
275
        weight16 = rootElements.getTertiaryBefore(p, s, t);
478
275
        U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
479
275
    }
480
1.07k
    return weight16;
481
1.52k
}
482
483
int64_t
484
CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
485
1.04k
                                          const char *&parserErrorReason, UErrorCode &errorCode) {
486
1.04k
    U_ASSERT(str.length() == 2);
487
1.04k
    int64_t ce;
488
1.04k
    int32_t strength = UCOL_PRIMARY;
489
1.04k
    UBool isBoundary = false;
490
1.04k
    UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
491
1.04k
    U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
492
1.04k
    switch(pos) {
493
0
    case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
494
        // Quaternary CEs are not supported.
495
        // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
496
0
        return 0;
497
253
    case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
498
253
        return 0;
499
0
    case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
500
        // Look for a tailored tertiary node after [0, 0, 0].
501
0
        int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
502
0
        if(U_FAILURE(errorCode)) { return 0; }
503
0
        int64_t node = nodes.elementAti(index);
504
0
        if((index = nextIndexFromNode(node)) != 0) {
505
0
            node = nodes.elementAti(index);
506
0
            U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
507
0
            if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
508
0
                return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
509
0
            }
510
0
        }
511
0
        return rootElements.getFirstTertiaryCE();
512
        // No need to look for nodeHasAnyBefore() on a tertiary node.
513
0
    }
514
786
    case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
515
786
        ce = rootElements.getLastTertiaryCE();
516
786
        strength = UCOL_TERTIARY;
517
786
        break;
518
0
    case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
519
        // Look for a tailored secondary node after [0, 0, *].
520
0
        int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
521
0
        if(U_FAILURE(errorCode)) { return 0; }
522
0
        int64_t node = nodes.elementAti(index);
523
0
        while((index = nextIndexFromNode(node)) != 0) {
524
0
            node = nodes.elementAti(index);
525
0
            strength = strengthFromNode(node);
526
0
            if(strength < UCOL_SECONDARY) { break; }
527
0
            if(strength == UCOL_SECONDARY) {
528
0
                if(isTailoredNode(node)) {
529
0
                    if(nodeHasBefore3(node)) {
530
0
                        index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
531
0
                        U_ASSERT(isTailoredNode(nodes.elementAti(index)));
532
0
                    }
533
0
                    return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
534
0
                } else {
535
0
                    break;
536
0
                }
537
0
            }
538
0
        }
539
0
        ce = rootElements.getFirstSecondaryCE();
540
0
        strength = UCOL_SECONDARY;
541
0
        break;
542
0
    }
543
0
    case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
544
0
        ce = rootElements.getLastSecondaryCE();
545
0
        strength = UCOL_SECONDARY;
546
0
        break;
547
1
    case CollationRuleParser::FIRST_VARIABLE:
548
1
        ce = rootElements.getFirstPrimaryCE();
549
1
        isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
550
1
        break;
551
0
    case CollationRuleParser::LAST_VARIABLE:
552
0
        ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
553
0
        break;
554
0
    case CollationRuleParser::FIRST_REGULAR:
555
0
        ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
556
0
        isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
557
0
        break;
558
5
    case CollationRuleParser::LAST_REGULAR:
559
        // Use the Hani-first-primary rather than the actual last "regular" CE before it,
560
        // for backward compatibility with behavior before the introduction of
561
        // script-first-primary CEs in the root collator.
562
5
        ce = rootElements.firstCEWithPrimaryAtLeast(
563
5
            baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
564
5
        break;
565
0
    case CollationRuleParser::FIRST_IMPLICIT:
566
0
        ce = baseData->getSingleCE(0x4e00, errorCode);
567
0
        break;
568
0
    case CollationRuleParser::LAST_IMPLICIT:
569
        // We do not support tailoring to an unassigned-implicit CE.
570
0
        errorCode = U_UNSUPPORTED_ERROR;
571
0
        parserErrorReason = "reset to [last implicit] not supported";
572
0
        return 0;
573
0
    case CollationRuleParser::FIRST_TRAILING:
574
0
        ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
575
0
        isBoundary = true;  // trailing first primary (there is no mapping for it)
576
0
        break;
577
1
    case CollationRuleParser::LAST_TRAILING:
578
1
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
579
1
        parserErrorReason = "LDML forbids tailoring to U+FFFF";
580
1
        return 0;
581
0
    default:
582
0
        UPRV_UNREACHABLE_EXIT;
583
1.04k
    }
584
585
792
    int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
586
792
    if(U_FAILURE(errorCode)) { return 0; }
587
792
    int64_t node = nodes.elementAti(index);
588
792
    if((pos & 1) == 0) {
589
        // even pos = [first xyz]
590
1
        if(!nodeHasAnyBefore(node) && isBoundary) {
591
            // A <group> first primary boundary is artificially added to FractionalUCA.txt.
592
            // It is reachable via its special contraction, but is not normally used.
593
            // Find the first character tailored after the boundary CE,
594
            // or the first real root CE after it.
595
1
            if((index = nextIndexFromNode(node)) != 0) {
596
                // If there is a following node, then it must be tailored
597
                // because there are no root CEs with a boundary primary
598
                // and non-common secondary/tertiary weights.
599
0
                node = nodes.elementAti(index);
600
0
                U_ASSERT(isTailoredNode(node));
601
0
                ce = tempCEFromIndexAndStrength(index, strength);
602
1
            } else {
603
1
                U_ASSERT(strength == UCOL_PRIMARY);
604
1
                uint32_t p = static_cast<uint32_t>(ce >> 32);
605
1
                int32_t pIndex = rootElements.findPrimary(p);
606
1
                UBool isCompressible = baseData->isCompressiblePrimary(p);
607
1
                p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
608
1
                ce = Collation::makeCE(p);
609
1
                index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
610
1
                if(U_FAILURE(errorCode)) { return 0; }
611
1
                node = nodes.elementAti(index);
612
1
            }
613
1
        }
614
1
        if(nodeHasAnyBefore(node)) {
615
            // Get the first node that was tailored before this one at a weaker strength.
616
0
            if(nodeHasBefore2(node)) {
617
0
                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
618
0
                node = nodes.elementAti(index);
619
0
            }
620
0
            if(nodeHasBefore3(node)) {
621
0
                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
622
0
            }
623
0
            U_ASSERT(isTailoredNode(nodes.elementAti(index)));
624
0
            ce = tempCEFromIndexAndStrength(index, strength);
625
0
        }
626
791
    } else {
627
        // odd pos = [last xyz]
628
        // Find the last node that was tailored after the [last xyz]
629
        // at a strength no greater than the position's strength.
630
171k
        for(;;) {
631
171k
            int32_t nextIndex = nextIndexFromNode(node);
632
171k
            if(nextIndex == 0) { break; }
633
170k
            int64_t nextNode = nodes.elementAti(nextIndex);
634
170k
            if(strengthFromNode(nextNode) < strength) { break; }
635
170k
            index = nextIndex;
636
170k
            node = nextNode;
637
170k
        }
638
        // Do not make a temporary CE for a root node.
639
        // This last node might be the node for the root CE itself,
640
        // or a node with a common secondary or tertiary weight.
641
791
        if(isTailoredNode(node)) {
642
628
            ce = tempCEFromIndexAndStrength(index, strength);
643
628
        }
644
791
    }
645
792
    return ce;
646
792
}
647
648
void
649
CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
650
                              const UnicodeString &str, const UnicodeString &extension,
651
4.93M
                              const char *&parserErrorReason, UErrorCode &errorCode) {
652
4.93M
    if(U_FAILURE(errorCode)) { return; }
653
4.93M
    UnicodeString nfdPrefix;
654
4.93M
    if(!prefix.isEmpty()) {
655
4.90k
        nfd.normalize(prefix, nfdPrefix, errorCode);
656
4.90k
        if(U_FAILURE(errorCode)) {
657
0
            parserErrorReason = "normalizing the relation prefix";
658
0
            return;
659
0
        }
660
4.90k
    }
661
4.93M
    UnicodeString nfdString = nfd.normalize(str, errorCode);
662
4.93M
    if(U_FAILURE(errorCode)) {
663
0
        parserErrorReason = "normalizing the relation string";
664
0
        return;
665
0
    }
666
667
    // The runtime code decomposes Hangul syllables on the fly,
668
    // with recursive processing but without making the Jamo pieces visible for matching.
669
    // It does not work with certain types of contextual mappings.
670
4.93M
    int32_t nfdLength = nfdString.length();
671
4.93M
    if(nfdLength >= 2) {
672
855k
        char16_t c = nfdString.charAt(0);
673
855k
        if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {
674
            // While handling a Hangul syllable, contractions starting with Jamo L or V
675
            // would not see the following Jamo of that syllable.
676
9
            errorCode = U_UNSUPPORTED_ERROR;
677
9
            parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";
678
9
            return;
679
9
        }
680
855k
        c = nfdString.charAt(nfdLength - 1);
681
855k
        if(Hangul::isJamoL(c) ||
682
855k
                (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {
683
            // A contraction ending with Jamo L or L+V would require
684
            // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
685
            // or decomposing a following Hangul syllable on the fly, during contraction matching.
686
5
            errorCode = U_UNSUPPORTED_ERROR;
687
5
            parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";
688
5
            return;
689
5
        }
690
        // A Hangul syllable completely inside a contraction is ok.
691
855k
    }
692
    // Note: If there is a prefix, then the parser checked that
693
    // both the prefix and the string begin with NFC boundaries (not Jamo V or T).
694
    // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
695
    // (While handling a Hangul syllable, prefixes on Jamo V or T
696
    // would not see the previous Jamo of that syllable.)
697
698
4.93M
    if(strength != UCOL_IDENTICAL) {
699
        // Find the node index after which we insert the new tailored node.
700
3.53M
        int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
701
3.53M
        U_ASSERT(cesLength > 0);
702
3.53M
        int64_t ce = ces[cesLength - 1];
703
3.53M
        if (strength == UCOL_PRIMARY && !isTempCE(ce) && static_cast<uint32_t>(ce >> 32) == 0) {
704
            // There is no primary gap between ignorables and the space-first-primary.
705
7
            errorCode = U_UNSUPPORTED_ERROR;
706
7
            parserErrorReason = "tailoring primary after ignorables not supported";
707
7
            return;
708
7
        }
709
3.53M
        if(strength == UCOL_QUATERNARY && ce == 0) {
710
            // The CE data structure does not support non-zero quaternary weights
711
            // on tertiary ignorables.
712
1
            errorCode = U_UNSUPPORTED_ERROR;
713
1
            parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";
714
1
            return;
715
1
        }
716
        // Insert the new tailored node.
717
3.53M
        index = insertTailoredNodeAfter(index, strength, errorCode);
718
3.53M
        if(U_FAILURE(errorCode)) {
719
6
            parserErrorReason = "modifying collation elements";
720
6
            return;
721
6
        }
722
        // Strength of the temporary CE:
723
        // The new relation may yield a stronger CE but not a weaker one.
724
3.53M
        int32_t tempStrength = ceStrength(ce);
725
3.53M
        if(strength < tempStrength) { tempStrength = strength; }
726
3.53M
        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
727
3.53M
    }
728
729
4.93M
    setCaseBits(nfdString, parserErrorReason, errorCode);
730
4.93M
    if(U_FAILURE(errorCode)) { return; }
731
732
4.93M
    int32_t cesLengthBeforeExtension = cesLength;
733
4.93M
    if(!extension.isEmpty()) {
734
1.05k
        UnicodeString nfdExtension = nfd.normalize(extension, errorCode);
735
1.05k
        if(U_FAILURE(errorCode)) {
736
0
            parserErrorReason = "normalizing the relation extension";
737
0
            return;
738
0
        }
739
1.05k
        cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);
740
1.05k
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
741
1
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
742
1
            parserErrorReason =
743
1
                "extension string adds too many collation elements (more than 31 total)";
744
1
            return;
745
1
        }
746
1.05k
    }
747
4.93M
    uint32_t ce32 = Collation::UNASSIGNED_CE32;
748
4.93M
    if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) &&
749
21.3k
            !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
750
        // Map from the original input to the CEs.
751
        // We do this in case the canonical closure is incomplete,
752
        // so that it is possible to explicitly provide the missing mappings.
753
20.0k
        ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
754
20.0k
    }
755
4.93M
    if (!icu4xMode) {
756
4.93M
        addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
757
4.93M
    } else {
758
0
        addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
759
0
    }
760
4.93M
    if(U_FAILURE(errorCode)) {
761
307
        parserErrorReason = "writing collation elements";
762
307
        return;
763
307
    }
764
4.93M
    cesLength = cesLengthBeforeExtension;
765
4.93M
}
766
767
int32_t
768
CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
769
3.54M
                                         UErrorCode &errorCode) {
770
3.54M
    if(U_FAILURE(errorCode)) { return 0; }
771
3.54M
    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);
772
773
    // Find the last CE that is at least as "strong" as the requested difference.
774
    // Note: Stronger is smaller (UCOL_PRIMARY=0).
775
3.54M
    int64_t ce;
776
3.54M
    for(;; --cesLength) {
777
3.54M
        if(cesLength == 0) {
778
858
            ce = ces[0] = 0;
779
858
            cesLength = 1;
780
858
            break;
781
3.54M
        } else {
782
3.54M
            ce = ces[cesLength - 1];
783
3.54M
        }
784
3.54M
        if(ceStrength(ce) <= strength) { break; }
785
3.54M
    }
786
787
3.54M
    if(isTempCE(ce)) {
788
        // No need to findCommonNode() here for lower levels
789
        // because insertTailoredNodeAfter() will do that anyway.
790
3.49M
        return indexFromTempCE(ce);
791
3.49M
    }
792
793
    // root CE
794
45.1k
    if (static_cast<uint8_t>(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
795
7
        errorCode = U_UNSUPPORTED_ERROR;
796
7
        parserErrorReason = "tailoring relative to an unassigned code point not supported";
797
7
        return 0;
798
7
    }
799
45.1k
    return findOrInsertNodeForRootCE(ce, strength, errorCode);
800
45.1k
}
801
802
int32_t
803
45.9k
CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {
804
45.9k
    if(U_FAILURE(errorCode)) { return 0; }
805
45.9k
    U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);
806
807
    // Find or insert the node for each of the root CE's weights,
808
    // down to the requested level/strength.
809
    // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
810
45.9k
    U_ASSERT((ce & 0xc0) == 0);
811
45.9k
    int32_t index = findOrInsertNodeForPrimary(static_cast<uint32_t>(ce >> 32), errorCode);
812
45.9k
    if(strength >= UCOL_SECONDARY) {
813
30.1k
        uint32_t lower32 = static_cast<uint32_t>(ce);
814
30.1k
        index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
815
30.1k
        if(strength >= UCOL_TERTIARY) {
816
12.9k
            index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
817
12.9k
                                         UCOL_TERTIARY, errorCode);
818
12.9k
        }
819
30.1k
    }
820
45.9k
    return index;
821
45.9k
}
822
823
namespace {
824
825
/**
826
 * Like Java Collections.binarySearch(List, key, Comparator).
827
 *
828
 * @return the index>=0 where the item was found,
829
 *         or the index<0 for inserting the string at ~index in sorted order
830
 *         (index into rootPrimaryIndexes)
831
 */
832
int32_t
833
binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
834
50.8k
                               const int64_t *nodes, uint32_t p) {
835
50.8k
    if(length == 0) { return ~0; }
836
46.5k
    int32_t start = 0;
837
46.5k
    int32_t limit = length;
838
195k
    for (;;) {
839
195k
        int32_t i = (start + limit) / 2;
840
195k
        int64_t node = nodes[rootPrimaryIndexes[i]];
841
195k
        uint32_t nodePrimary = static_cast<uint32_t>(node >> 32); // weight32FromNode(node)
842
195k
        if (p == nodePrimary) {
843
28.7k
            return i;
844
166k
        } else if (p < nodePrimary) {
845
65.1k
            if (i == start) {
846
1.94k
                return ~start;  // insert s before i
847
1.94k
            }
848
63.1k
            limit = i;
849
101k
        } else {
850
101k
            if (i == start) {
851
15.7k
                return ~(start + 1);  // insert s after i
852
15.7k
            }
853
85.3k
            start = i;
854
85.3k
        }
855
195k
    }
856
46.5k
}
857
858
}  // namespace
859
860
int32_t
861
50.8k
CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {
862
50.8k
    if(U_FAILURE(errorCode)) { return 0; }
863
864
50.8k
    int32_t rootIndex = binarySearchForRootPrimaryNode(
865
50.8k
        rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
866
50.8k
    if(rootIndex >= 0) {
867
28.7k
        return rootPrimaryIndexes.elementAti(rootIndex);
868
28.7k
    } else {
869
        // Start a new list of nodes with this primary.
870
22.0k
        int32_t index = nodes.size();
871
22.0k
        nodes.addElement(nodeFromWeight32(p), errorCode);
872
22.0k
        rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);
873
22.0k
        return index;
874
22.0k
    }
875
50.8k
}
876
877
int32_t
878
43.8k
CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
879
43.8k
    if(U_FAILURE(errorCode)) { return 0; }
880
43.8k
    U_ASSERT(0 <= index && index < nodes.size());
881
43.8k
    U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);
882
883
43.8k
    if(weight16 == Collation::COMMON_WEIGHT16) {
884
36.9k
        return findCommonNode(index, level);
885
36.9k
    }
886
887
    // If this will be the first below-common weight for the parent node,
888
    // then we will also need to insert a common weight after it.
889
6.91k
    int64_t node = nodes.elementAti(index);
890
6.91k
    U_ASSERT(strengthFromNode(node) < level);  // parent node is stronger
891
6.91k
    if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
892
1.09k
        int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
893
1.09k
        if((node & hasThisLevelBefore) == 0) {
894
            // The parent node has an implied level-common weight.
895
901
            int64_t commonNode =
896
901
                nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
897
901
            if(level == UCOL_SECONDARY) {
898
                // Move the HAS_BEFORE3 flag from the parent node
899
                // to the new secondary common node.
900
522
                commonNode |= node & HAS_BEFORE3;
901
522
                node &= ~static_cast<int64_t>(HAS_BEFORE3);
902
522
            }
903
901
            nodes.setElementAt(node | hasThisLevelBefore, index);
904
            // Insert below-common-weight node.
905
901
            int32_t nextIndex = nextIndexFromNode(node);
906
901
            node = nodeFromWeight16(weight16) | nodeFromStrength(level);
907
901
            index = insertNodeBetween(index, nextIndex, node, errorCode);
908
            // Insert common-weight node.
909
901
            insertNodeBetween(index, nextIndex, commonNode, errorCode);
910
            // Return index of below-common-weight node.
911
901
            return index;
912
901
        }
913
1.09k
    }
914
915
    // Find the root CE's weight for this level.
916
    // Postpone insertion if not found:
917
    // Insert the new root node before the next stronger node,
918
    // or before the next root node with the same strength and a larger weight.
919
6.01k
    int32_t nextIndex;
920
25.9k
    while((nextIndex = nextIndexFromNode(node)) != 0) {
921
23.5k
        node = nodes.elementAti(nextIndex);
922
23.5k
        int32_t nextStrength = strengthFromNode(node);
923
23.5k
        if(nextStrength <= level) {
924
            // Insert before a stronger node.
925
7.83k
            if(nextStrength < level) { break; }
926
            // nextStrength == level
927
7.68k
            if(!isTailoredNode(node)) {
928
4.45k
                uint32_t nextWeight16 = weight16FromNode(node);
929
4.45k
                if(nextWeight16 == weight16) {
930
                    // Found the node for the root CE up to this level.
931
3.10k
                    return nextIndex;
932
3.10k
                }
933
                // Insert before a node with a larger same-strength weight.
934
1.34k
                if(nextWeight16 > weight16) { break; }
935
1.34k
            }
936
7.68k
        }
937
        // Skip the next node.
938
19.9k
        index = nextIndex;
939
19.9k
    }
940
2.90k
    node = nodeFromWeight16(weight16) | nodeFromStrength(level);
941
2.90k
    return insertNodeBetween(index, nextIndex, node, errorCode);
942
6.01k
}
943
944
int32_t
945
3.53M
CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
946
3.53M
    if(U_FAILURE(errorCode)) { return 0; }
947
3.53M
    U_ASSERT(0 <= index && index < nodes.size());
948
3.53M
    if(strength >= UCOL_SECONDARY) {
949
818k
        index = findCommonNode(index, UCOL_SECONDARY);
950
818k
        if(strength >= UCOL_TERTIARY) {
951
164k
            index = findCommonNode(index, UCOL_TERTIARY);
952
164k
        }
953
818k
    }
954
    // Postpone insertion:
955
    // Insert the new node before the next one with a strength at least as strong.
956
3.53M
    int64_t node = nodes.elementAti(index);
957
3.53M
    int32_t nextIndex;
958
3.56M
    while((nextIndex = nextIndexFromNode(node)) != 0) {
959
515k
        node = nodes.elementAti(nextIndex);
960
515k
        if(strengthFromNode(node) <= strength) { break; }
961
        // Skip the next node which has a weaker (larger) strength than the new one.
962
24.7k
        index = nextIndex;
963
24.7k
    }
964
3.53M
    node = IS_TAILORED | nodeFromStrength(strength);
965
3.53M
    return insertNodeBetween(index, nextIndex, node, errorCode);
966
3.53M
}
967
968
int32_t
969
CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
970
3.54M
                                    UErrorCode &errorCode) {
971
3.54M
    if(U_FAILURE(errorCode)) { return 0; }
972
3.54M
    U_ASSERT(previousIndexFromNode(node) == 0);
973
3.54M
    U_ASSERT(nextIndexFromNode(node) == 0);
974
3.54M
    U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
975
    // Append the new node and link it to the existing nodes.
976
3.54M
    int32_t newIndex = nodes.size();
977
3.54M
    node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
978
3.54M
    nodes.addElement(node, errorCode);
979
3.54M
    if(U_FAILURE(errorCode)) { return 0; }
980
    // nodes[index].nextIndex = newIndex
981
3.54M
    node = nodes.elementAti(index);
982
3.54M
    nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
983
    // nodes[nextIndex].previousIndex = newIndex
984
3.54M
    if(nextIndex != 0) {
985
491k
        node = nodes.elementAti(nextIndex);
986
491k
        nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
987
491k
    }
988
3.54M
    return newIndex;
989
3.54M
}
990
991
int32_t
992
1.02M
CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
993
1.02M
    U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
994
1.02M
    int64_t node = nodes.elementAti(index);
995
1.02M
    if(strengthFromNode(node) >= strength) {
996
        // The current node is no stronger.
997
927k
        return index;
998
927k
    }
999
94.0k
    if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
1000
        // The current node implies the strength-common weight.
1001
92.4k
        return index;
1002
92.4k
    }
1003
1.57k
    index = nextIndexFromNode(node);
1004
1.57k
    node = nodes.elementAti(index);
1005
1.57k
    U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
1006
1.57k
            weight16FromNode(node) < Collation::COMMON_WEIGHT16);
1007
    // Skip to the explicit common node.
1008
9.30k
    do {
1009
9.30k
        index = nextIndexFromNode(node);
1010
9.30k
        node = nodes.elementAti(index);
1011
9.30k
        U_ASSERT(strengthFromNode(node) >= strength);
1012
9.30k
    } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
1013
1.62k
            weight16FromNode(node) < Collation::COMMON_WEIGHT16);
1014
1.57k
    U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
1015
1.57k
    return index;
1016
94.0k
}
1017
1018
void
1019
CollationBuilder::setCaseBits(const UnicodeString &nfdString,
1020
4.93M
                              const char *&parserErrorReason, UErrorCode &errorCode) {
1021
4.93M
    if(U_FAILURE(errorCode)) { return; }
1022
4.93M
    int32_t numTailoredPrimaries = 0;
1023
14.2M
    for(int32_t i = 0; i < cesLength; ++i) {
1024
9.34M
        if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
1025
9.34M
    }
1026
    // We should not be able to get too many case bits because
1027
    // cesLength<=31==MAX_EXPANSION_LENGTH.
1028
    // 31 pairs of case bits fit into an int64_t without setting its sign bit.
1029
4.93M
    U_ASSERT(numTailoredPrimaries <= 31);
1030
1031
4.93M
    int64_t cases = 0;
1032
4.93M
    if(numTailoredPrimaries > 0) {
1033
4.74M
        const char16_t *s = nfdString.getBuffer();
1034
4.74M
        UTF16CollationIterator baseCEs(baseData, false, s, s, s + nfdString.length());
1035
4.74M
        int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;
1036
4.74M
        if(U_FAILURE(errorCode)) {
1037
0
            parserErrorReason = "fetching root CEs for tailored string";
1038
0
            return;
1039
0
        }
1040
4.74M
        U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);
1041
1042
4.74M
        uint32_t lastCase = 0;
1043
4.74M
        int32_t numBasePrimaries = 0;
1044
10.2M
        for(int32_t i = 0; i < baseCEsLength; ++i) {
1045
5.47M
            int64_t ce = baseCEs.getCE(i);
1046
5.47M
            if((ce >> 32) != 0) {
1047
5.36M
                ++numBasePrimaries;
1048
5.36M
                uint32_t c = (static_cast<uint32_t>(ce) >> 14) & 3;
1049
5.36M
                U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
1050
5.36M
                if(numBasePrimaries < numTailoredPrimaries) {
1051
1.13M
                    cases |= static_cast<int64_t>(c) << ((numBasePrimaries - 1) * 2);
1052
4.23M
                } else if(numBasePrimaries == numTailoredPrimaries) {
1053
4.07M
                    lastCase = c;
1054
4.07M
                } else if(c != lastCase) {
1055
                    // There are more base primary CEs than tailored primaries.
1056
                    // Set mixed case if the case bits of the remainder differ.
1057
6.90k
                    lastCase = 1;
1058
                    // Nothing more can change.
1059
6.90k
                    break;
1060
6.90k
                }
1061
5.36M
            }
1062
5.47M
        }
1063
4.74M
        if(numBasePrimaries >= numTailoredPrimaries) {
1064
4.07M
            cases |= static_cast<int64_t>(lastCase) << ((numTailoredPrimaries - 1) * 2);
1065
4.07M
        }
1066
4.74M
    }
1067
1068
14.2M
    for(int32_t i = 0; i < cesLength; ++i) {
1069
9.34M
        int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits
1070
9.34M
        int32_t strength = ceStrength(ce);
1071
9.34M
        if(strength == UCOL_PRIMARY) {
1072
8.95M
            ce |= (cases & 3) << 14;
1073
8.95M
            cases >>= 2;
1074
8.95M
        } else if(strength == UCOL_TERTIARY) {
1075
            // Tertiary CEs must have uppercase bits.
1076
            // See the LDML spec, and comments in class CollationCompare.
1077
68.0k
            ce |= 0x8000;
1078
68.0k
        }
1079
        // Tertiary ignorable CEs must have 0 case bits.
1080
        // We set 0 case bits for secondary CEs too
1081
        // since currently only U+0345 is cased and maps to a secondary CE,
1082
        // and it is lowercase. Other secondaries are uncased.
1083
        // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
1084
9.34M
        ces[i] = ce;
1085
9.34M
    }
1086
4.93M
}
1087
1088
void
1089
CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
1090
250
                                       UErrorCode &errorCode) {
1091
250
    if(U_FAILURE(errorCode)) { return; }
1092
250
    dataBuilder->suppressContractions(set, errorCode);
1093
250
    if(U_FAILURE(errorCode)) {
1094
0
        parserErrorReason = "application of [suppressContractions [set]] failed";
1095
0
    }
1096
250
}
1097
1098
void
1099
CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,
1100
94
                           UErrorCode &errorCode) {
1101
94
    if(U_FAILURE(errorCode)) { return; }
1102
94
    optimizeSet.addAll(set);
1103
94
}
1104
1105
uint32_t
1106
CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
1107
                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
1108
4.93M
                                 UErrorCode &errorCode) {
1109
    // Map from the NFD input to the CEs.
1110
4.93M
    ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
1111
4.93M
    ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
1112
4.93M
    addTailComposites(nfdPrefix, nfdString, errorCode);
1113
4.93M
    return ce32;
1114
4.93M
}
1115
1116
// ICU-22517
1117
// This constant defines a limit for the addOnlyClosure to return
1118
// error, to avoid taking a long time for canonical closure expansion.
1119
// Please let us know if you have a reasonable use case that needed
1120
// for a practical Collation rule that needs to increase this limit.
1121
// This value is needed for compiling a rule with eight Hangul syllables such as
1122
// "&a=b쫊쫊쫊쫊쫊쫊쫊" without error, which should be more than realistic
1123
// usage.
1124
static constexpr int32_t kClosureLoopLimit = 3000;
1125
1126
uint32_t
1127
CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
1128
                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
1129
5.78M
                                 UErrorCode &errorCode) {
1130
5.78M
    if(U_FAILURE(errorCode)) { return ce32; }
1131
1132
5.78M
    int32_t loop = 0;
1133
    // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
1134
5.78M
    if(nfdPrefix.isEmpty()) {
1135
5.77M
        CanonicalIterator stringIter(nfdString, errorCode);
1136
5.77M
        if(U_FAILURE(errorCode)) { return ce32; }
1137
5.77M
        UnicodeString prefix;
1138
14.3M
        for(;;) {
1139
14.3M
            UnicodeString str = stringIter.next();
1140
14.3M
            if(str.isBogus()) { break; }
1141
8.56M
            if (loop++ > kClosureLoopLimit) {
1142
                // To avoid hang as in ICU-22517, return with error.
1143
101
                errorCode = U_INPUT_TOO_LONG_ERROR;
1144
101
                return ce32;
1145
101
            }
1146
8.56M
            if(ignoreString(str, errorCode) || str == nfdString) { continue; }
1147
1.79M
            ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
1148
1.79M
            if(U_FAILURE(errorCode)) { return ce32; }
1149
1.79M
        }
1150
5.77M
    } else {
1151
7.26k
        CanonicalIterator prefixIter(nfdPrefix, errorCode);
1152
7.26k
        CanonicalIterator stringIter(nfdString, errorCode);
1153
7.26k
        if(U_FAILURE(errorCode)) { return ce32; }
1154
69.3k
        for(;;) {
1155
69.3k
            UnicodeString prefix = prefixIter.next();
1156
69.3k
            if(prefix.isBogus()) { break; }
1157
62.1k
            if(ignorePrefix(prefix, errorCode)) { continue; }
1158
50.9k
            UBool samePrefix = prefix == nfdPrefix;
1159
373k
            for(;;) {
1160
373k
                UnicodeString str = stringIter.next();
1161
373k
                if(str.isBogus()) { break; }
1162
322k
                if (loop++ > kClosureLoopLimit) {
1163
                    // To avoid hang as in ICU-22517, return with error.
1164
52
                    errorCode = U_INPUT_TOO_LONG_ERROR;
1165
52
                    return ce32;
1166
52
                }
1167
322k
                if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
1168
143k
                ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
1169
143k
                if(U_FAILURE(errorCode)) { return ce32; }
1170
143k
            }
1171
50.8k
            stringIter.reset();
1172
50.8k
        }
1173
7.24k
    }
1174
5.78M
    return ce32;
1175
5.78M
}
1176
1177
void
1178
CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
1179
4.93M
                                    UErrorCode &errorCode) {
1180
4.93M
    if(U_FAILURE(errorCode)) { return; }
1181
1182
    // Look for the last starter in the NFD string.
1183
4.93M
    UChar32 lastStarter;
1184
4.93M
    int32_t indexAfterLastStarter = nfdString.length();
1185
4.99M
    for(;;) {
1186
4.99M
        if(indexAfterLastStarter == 0) { return; }  // no starter at all
1187
4.95M
        lastStarter = nfdString.char32At(indexAfterLastStarter - 1);
1188
4.95M
        if(nfd.getCombiningClass(lastStarter) == 0) { break; }
1189
58.3k
        indexAfterLastStarter -= U16_LENGTH(lastStarter);
1190
58.3k
    }
1191
    // No closure to Hangul syllables since we decompose them on the fly.
1192
4.89M
    if(Hangul::isJamoL(lastStarter)) { return; }
1193
1194
    // Are there any composites whose decomposition starts with the lastStarter?
1195
    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
1196
    // We might find some more equivalent mappings here if it did.
1197
4.89M
    UnicodeSet composites;
1198
4.89M
    if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
1199
1200
228k
    UnicodeString decomp;
1201
228k
    UnicodeString newNFDString, newString;
1202
228k
    int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
1203
228k
    UnicodeSetIterator iter(composites);
1204
1.52M
    while(iter.next()) {
1205
1.29M
        U_ASSERT(!iter.isString());
1206
1.29M
        UChar32 composite = iter.getCodepoint();
1207
1.29M
        nfd.getDecomposition(composite, decomp);
1208
1.29M
        if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
1209
1.29M
                                     newNFDString, newString, errorCode)) {
1210
273k
            continue;
1211
273k
        }
1212
1.01M
        int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);
1213
1.01M
        if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {
1214
            // Ignore mappings that we cannot store.
1215
3.91k
            continue;
1216
3.91k
        }
1217
        // Note: It is possible that the newCEs do not make use of the mapping
1218
        // for which we are adding the tail composites, in which case we might be adding
1219
        // unnecessary mappings.
1220
        // For example, when we add tail composites for ae^ (^=combining circumflex),
1221
        // UCA discontiguous-contraction matching does not find any matches
1222
        // for ae_^ (_=any combining diacritic below) *unless* there is also
1223
        // a contraction mapping for ae.
1224
        // Thus, if there is no ae contraction, then the ae^ mapping is ignored
1225
        // while fetching the newCEs for ae_^.
1226
        // TODO: Try to detect this effectively.
1227
        // (Alternatively, print a warning when prefix contractions are missing.)
1228
1229
        // We do not need an explicit mapping for the NFD strings.
1230
        // It is fine if the NFD input collates like this via a sequence of mappings.
1231
        // It also saves a little bit of space, and may reduce the set of characters with contractions.
1232
1.01M
        uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
1233
1.01M
                                       newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);
1234
1.01M
        if(ce32 != Collation::UNASSIGNED_CE32) {
1235
            // was different, was added
1236
850k
            addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
1237
850k
        }
1238
1.01M
    }
1239
228k
}
1240
1241
UBool
1242
CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
1243
                                           int32_t indexAfterLastStarter,
1244
                                           UChar32 composite, const UnicodeString &decomp,
1245
                                           UnicodeString &newNFDString, UnicodeString &newString,
1246
1.29M
                                           UErrorCode &errorCode) const {
1247
1.29M
    if(U_FAILURE(errorCode)) { return false; }
1248
1.29M
    U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
1249
1.29M
    int32_t lastStarterLength = decomp.moveIndex32(0, 1);
1250
1.29M
    if(lastStarterLength == decomp.length()) {
1251
        // Singleton decompositions should be found by addWithClosure()
1252
        // and the CanonicalIterator, so we can ignore them here.
1253
117k
        return false;
1254
117k
    }
1255
1.17M
    if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
1256
1.17M
                         decomp, lastStarterLength, 0x7fffffff) == 0) {
1257
        // same strings, nothing new to be found here
1258
14.5k
        return false;
1259
14.5k
    }
1260
1261
    // Make new FCD strings that combine a composite, or its decomposition,
1262
    // into the nfdString's last starter and the combining marks following it.
1263
    // Make an NFD version, and a version with the composite.
1264
1.16M
    newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
1265
1.16M
    newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);
1266
1267
    // The following is related to discontiguous contraction matching,
1268
    // but builds only FCD strings (or else returns false).
1269
1.16M
    int32_t sourceIndex = indexAfterLastStarter;
1270
1.16M
    int32_t decompIndex = lastStarterLength;
1271
    // Small optimization: We keep the source character across loop iterations
1272
    // because we do not always consume it,
1273
    // and then need not fetch it again nor look up its combining class again.
1274
1.16M
    UChar32 sourceChar = U_SENTINEL;
1275
    // The cc variables need to be declared before the loop so that at the end
1276
    // they are set to the last combining classes seen.
1277
1.16M
    uint8_t sourceCC = 0;
1278
1.16M
    uint8_t decompCC = 0;
1279
1.21M
    for(;;) {
1280
1.21M
        if(sourceChar < 0) {
1281
1.17M
            if(sourceIndex >= nfdString.length()) { break; }
1282
180k
            sourceChar = nfdString.char32At(sourceIndex);
1283
180k
            sourceCC = nfd.getCombiningClass(sourceChar);
1284
180k
            U_ASSERT(sourceCC != 0);
1285
180k
        }
1286
        // We consume a decomposition character in each iteration.
1287
226k
        if(decompIndex >= decomp.length()) { break; }
1288
198k
        UChar32 decompChar = decomp.char32At(decompIndex);
1289
198k
        decompCC = nfd.getCombiningClass(decompChar);
1290
        // Compare the two characters and their combining classes.
1291
198k
        if(decompCC == 0) {
1292
            // Unable to merge because the source contains a non-zero combining mark
1293
            // but the composite's decomposition contains another starter.
1294
            // The strings would not be equivalent.
1295
415
            return false;
1296
197k
        } else if(sourceCC < decompCC) {
1297
            // Composite + sourceChar would not be FCD.
1298
23.7k
            return false;
1299
174k
        } else if(decompCC < sourceCC) {
1300
46.2k
            newNFDString.append(decompChar);
1301
46.2k
            decompIndex += U16_LENGTH(decompChar);
1302
127k
        } else if(decompChar != sourceChar) {
1303
            // Blocked because same combining class.
1304
117k
            return false;
1305
117k
        } else {  // match: decompChar == sourceChar
1306
10.0k
            newNFDString.append(decompChar);
1307
10.0k
            decompIndex += U16_LENGTH(decompChar);
1308
10.0k
            sourceIndex += U16_LENGTH(decompChar);
1309
10.0k
            sourceChar = U_SENTINEL;
1310
10.0k
        }
1311
198k
    }
1312
    // We are at the end of at least one of the two inputs.
1313
1.01M
    if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
1314
28.3k
        if(sourceCC < decompCC) {
1315
            // Appending the next source character to the composite would not be FCD.
1316
0
            return false;
1317
0
        }
1318
28.3k
        newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
1319
28.3k
        newString.append(nfdString, sourceIndex, 0x7fffffff);
1320
991k
    } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
1321
989k
        newNFDString.append(decomp, decompIndex, 0x7fffffff);
1322
989k
    }
1323
1.01M
    U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
1324
1.01M
    U_ASSERT(fcd.isNormalized(newString, errorCode));
1325
1.01M
    U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
1326
1.01M
    return true;
1327
1.01M
}
1328
1329
UBool
1330
83.4k
CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
1331
    // Do not map non-FCD prefixes.
1332
83.4k
    return !isFCD(s, errorCode);
1333
83.4k
}
1334
1335
UBool
1336
8.90M
CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
1337
    // Do not map non-FCD strings.
1338
    // Do not map strings that start with Hangul syllables: We decompose those on the fly.
1339
8.90M
    return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
1340
8.90M
}
1341
1342
UBool
1343
8.99M
CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
1344
8.99M
    return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
1345
8.99M
}
1346
1347
void
1348
3.86k
CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
1349
3.86k
    UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
1350
3.86k
    if(U_FAILURE(errorCode)) { return; }
1351
    // Hangul is decomposed on the fly during collation.
1352
3.82k
    composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
1353
3.82k
    UnicodeString prefix;  // empty
1354
3.82k
    UnicodeString nfdString;
1355
3.82k
    UnicodeSetIterator iter(composites);
1356
7.96M
    while(iter.next()) {
1357
7.96M
        U_ASSERT(!iter.isString());
1358
7.96M
        nfd.getDecomposition(iter.getCodepoint(), nfdString);
1359
7.96M
        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
1360
7.96M
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
1361
            // Too many CEs from the decomposition (unusual), ignore this composite.
1362
            // We could add a capacity parameter to getCEs() and reallocate if necessary.
1363
            // However, this can only really happen in contrived cases.
1364
2.74k
            continue;
1365
2.74k
        }
1366
7.96M
        const UnicodeString &composite(iter.getString());
1367
7.96M
        addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
1368
7.96M
    }
1369
3.82k
}
1370
1371
uint32_t
1372
CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
1373
                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
1374
15.8M
                                 UErrorCode &errorCode) {
1375
15.8M
    if(U_FAILURE(errorCode)) { return ce32; }
1376
15.8M
    int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];
1377
15.8M
    int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);
1378
15.8M
    if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
1379
5.84M
        if(ce32 == Collation::UNASSIGNED_CE32) {
1380
5.43M
            ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);
1381
5.43M
        }
1382
5.84M
        dataBuilder->addCE32(prefix, str, ce32, errorCode);
1383
5.84M
    }
1384
15.8M
    return ce32;
1385
15.8M
}
1386
1387
UBool
1388
CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
1389
15.8M
                          const int64_t ces2[], int32_t ces2Length) {
1390
15.8M
    if(ces1Length != ces2Length) {
1391
1.11M
        return false;
1392
1.11M
    }
1393
14.7M
    U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
1394
34.6M
    for(int32_t i = 0; i < ces1Length; ++i) {
1395
24.6M
        if(ces1[i] != ces2[i]) { return false; }
1396
24.6M
    }
1397
10.0M
    return true;
1398
14.7M
}
1399
1400
#ifdef DEBUG_COLLATION_BUILDER
1401
1402
uint32_t
1403
alignWeightRight(uint32_t w) {
1404
    if(w != 0) {
1405
        while((w & 0xff) == 0) { w >>= 8; }
1406
    }
1407
    return w;
1408
}
1409
1410
#endif
1411
1412
void
1413
3.86k
CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
1414
3.86k
    if(U_FAILURE(errorCode)) { return; }
1415
1416
3.86k
    CollationWeights primaries, secondaries, tertiaries;
1417
3.86k
    int64_t *nodesArray = nodes.getBuffer();
1418
#ifdef DEBUG_COLLATION_BUILDER
1419
        puts("\nCollationBuilder::makeTailoredCEs()");
1420
#endif
1421
1422
18.1k
    for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
1423
14.3k
        int32_t i = rootPrimaryIndexes.elementAti(rpi);
1424
14.3k
        int64_t node = nodesArray[i];
1425
14.3k
        uint32_t p = weight32FromNode(node);
1426
14.3k
        uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;
1427
14.3k
        uint32_t t = s;
1428
14.3k
        uint32_t q = 0;
1429
14.3k
        UBool pIsTailored = false;
1430
14.3k
        UBool sIsTailored = false;
1431
14.3k
        UBool tIsTailored = false;
1432
#ifdef DEBUG_COLLATION_BUILDER
1433
        printf("\nprimary     %lx\n", (long)alignWeightRight(p));
1434
#endif
1435
14.3k
        int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
1436
14.3k
        int32_t nextIndex = nextIndexFromNode(node);
1437
2.36M
        while(nextIndex != 0) {
1438
2.34M
            i = nextIndex;
1439
2.34M
            node = nodesArray[i];
1440
2.34M
            nextIndex = nextIndexFromNode(node);
1441
2.34M
            int32_t strength = strengthFromNode(node);
1442
2.34M
            if(strength == UCOL_QUATERNARY) {
1443
544
                U_ASSERT(isTailoredNode(node));
1444
#ifdef DEBUG_COLLATION_BUILDER
1445
                printf("      quat+     ");
1446
#endif
1447
544
                if(q == 3) {
1448
11
                    errorCode = U_BUFFER_OVERFLOW_ERROR;
1449
11
                    errorReason = "quaternary tailoring gap too small";
1450
11
                    return;
1451
11
                }
1452
533
                ++q;
1453
2.34M
            } else {
1454
2.34M
                if(strength == UCOL_TERTIARY) {
1455
72.8k
                    if(isTailoredNode(node)) {
1456
#ifdef DEBUG_COLLATION_BUILDER
1457
                        printf("    ter+        ");
1458
#endif
1459
71.8k
                        if(!tIsTailored) {
1460
                            // First tailored tertiary node for [p, s].
1461
7.17k
                            int32_t tCount = countTailoredNodes(nodesArray, nextIndex,
1462
7.17k
                                                                UCOL_TERTIARY) + 1;
1463
7.17k
                            uint32_t tLimit;
1464
7.17k
                            if(t == 0) {
1465
                                // Gap at the beginning of the tertiary CE range.
1466
162
                                t = rootElements.getTertiaryBoundary() - 0x100;
1467
162
                                tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
1468
7.01k
                            } else if(!pIsTailored && !sIsTailored) {
1469
                                // p and s are root weights.
1470
2.04k
                                tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
1471
4.96k
                            } else if(t == Collation::BEFORE_WEIGHT16) {
1472
0
                                tLimit = Collation::COMMON_WEIGHT16;
1473
4.96k
                            } else {
1474
                                // [p, s] is tailored.
1475
4.96k
                                U_ASSERT(t == Collation::COMMON_WEIGHT16);
1476
4.96k
                                tLimit = rootElements.getTertiaryBoundary();
1477
4.96k
                            }
1478
7.17k
                            U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);
1479
7.17k
                            tertiaries.initForTertiary();
1480
7.17k
                            if(!tertiaries.allocWeights(t, tLimit, tCount)) {
1481
9
                                errorCode = U_BUFFER_OVERFLOW_ERROR;
1482
9
                                errorReason = "tertiary tailoring gap too small";
1483
9
                                return;
1484
9
                            }
1485
7.16k
                            tIsTailored = true;
1486
7.16k
                        }
1487
71.8k
                        t = tertiaries.nextWeight();
1488
71.8k
                        U_ASSERT(t != 0xffffffff);
1489
71.8k
                    } else {
1490
1.02k
                        t = weight16FromNode(node);
1491
1.02k
                        tIsTailored = false;
1492
#ifdef DEBUG_COLLATION_BUILDER
1493
                        printf("    ter     %lx\n", (long)alignWeightRight(t));
1494
#endif
1495
1.02k
                    }
1496
2.27M
                } else {
1497
2.27M
                    if(strength == UCOL_SECONDARY) {
1498
183k
                        if(isTailoredNode(node)) {
1499
#ifdef DEBUG_COLLATION_BUILDER
1500
                            printf("  sec+          ");
1501
#endif
1502
182k
                            if(!sIsTailored) {
1503
                                // First tailored secondary node for p.
1504
5.73k
                                int32_t sCount = countTailoredNodes(nodesArray, nextIndex,
1505
5.73k
                                                                    UCOL_SECONDARY) + 1;
1506
5.73k
                                uint32_t sLimit;
1507
5.73k
                                if(s == 0) {
1508
                                    // Gap at the beginning of the secondary CE range.
1509
54
                                    s = rootElements.getSecondaryBoundary() - 0x100;
1510
54
                                    sLimit = rootElements.getFirstSecondaryCE() >> 16;
1511
5.68k
                                } else if(!pIsTailored) {
1512
                                    // p is a root primary.
1513
3.99k
                                    sLimit = rootElements.getSecondaryAfter(pIndex, s);
1514
3.99k
                                } else if(s == Collation::BEFORE_WEIGHT16) {
1515
69
                                    sLimit = Collation::COMMON_WEIGHT16;
1516
1.61k
                                } else {
1517
                                    // p is a tailored primary.
1518
1.61k
                                    U_ASSERT(s == Collation::COMMON_WEIGHT16);
1519
1.61k
                                    sLimit = rootElements.getSecondaryBoundary();
1520
1.61k
                                }
1521
5.73k
                                if(s == Collation::COMMON_WEIGHT16) {
1522
                                    // Do not tailor into the getSortKey() range of
1523
                                    // compressed common secondaries.
1524
4.44k
                                    s = rootElements.getLastCommonSecondary();
1525
4.44k
                                }
1526
5.73k
                                secondaries.initForSecondary();
1527
5.73k
                                if(!secondaries.allocWeights(s, sLimit, sCount)) {
1528
2
                                    errorCode = U_BUFFER_OVERFLOW_ERROR;
1529
2
                                    errorReason = "secondary tailoring gap too small";
1530
#ifdef DEBUG_COLLATION_BUILDER
1531
                                    printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
1532
                                           (long)alignWeightRight(s), (long)alignWeightRight(sLimit),
1533
                                           (long)alignWeightRight(sCount));
1534
#endif
1535
2
                                    return;
1536
2
                                }
1537
5.73k
                                sIsTailored = true;
1538
5.73k
                            }
1539
182k
                            s = secondaries.nextWeight();
1540
182k
                            U_ASSERT(s != 0xffffffff);
1541
182k
                        } else {
1542
1.89k
                            s = weight16FromNode(node);
1543
1.89k
                            sIsTailored = false;
1544
#ifdef DEBUG_COLLATION_BUILDER
1545
                            printf("  sec       %lx\n", (long)alignWeightRight(s));
1546
#endif
1547
1.89k
                        }
1548
2.09M
                    } else /* UCOL_PRIMARY */ {
1549
2.09M
                        U_ASSERT(isTailoredNode(node));
1550
#ifdef DEBUG_COLLATION_BUILDER
1551
                        printf("pri+            ");
1552
#endif
1553
2.09M
                        if(!pIsTailored) {
1554
                            // First tailored primary node in this list.
1555
7.46k
                            int32_t pCount = countTailoredNodes(nodesArray, nextIndex,
1556
7.46k
                                                                UCOL_PRIMARY) + 1;
1557
7.46k
                            UBool isCompressible = baseData->isCompressiblePrimary(p);
1558
7.46k
                            uint32_t pLimit =
1559
7.46k
                                rootElements.getPrimaryAfter(p, pIndex, isCompressible);
1560
7.46k
                            primaries.initForPrimary(isCompressible);
1561
7.46k
                            if(!primaries.allocWeights(p, pLimit, pCount)) {
1562
10
                                errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?
1563
10
                                errorReason = "primary tailoring gap too small";
1564
10
                                return;
1565
10
                            }
1566
7.45k
                            pIsTailored = true;
1567
7.45k
                        }
1568
2.09M
                        p = primaries.nextWeight();
1569
2.09M
                        U_ASSERT(p != 0xffffffff);
1570
2.09M
                        s = Collation::COMMON_WEIGHT16;
1571
2.09M
                        sIsTailored = false;
1572
2.09M
                    }
1573
2.27M
                    t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;
1574
2.27M
                    tIsTailored = false;
1575
2.27M
                }
1576
2.34M
                q = 0;
1577
2.34M
            }
1578
2.34M
            if(isTailoredNode(node)) {
1579
2.34M
                nodesArray[i] = Collation::makeCE(p, s, t, q);
1580
#ifdef DEBUG_COLLATION_BUILDER
1581
                printf("%016llx\n", (long long)nodesArray[i]);
1582
#endif
1583
2.34M
            }
1584
2.34M
        }
1585
14.3k
    }
1586
3.86k
}
1587
1588
int32_t
1589
20.3k
CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {
1590
20.3k
    int32_t count = 0;
1591
2.82M
    for(;;) {
1592
2.82M
        if(i == 0) { break; }
1593
2.81M
        int64_t node = nodesArray[i];
1594
2.81M
        if(strengthFromNode(node) < strength) { break; }
1595
2.80M
        if(strengthFromNode(node) == strength) {
1596
2.55M
            if(isTailoredNode(node)) {
1597
2.55M
                ++count;
1598
2.55M
            } else {
1599
714
                break;
1600
714
            }
1601
2.55M
        }
1602
2.80M
        i = nextIndexFromNode(node);
1603
2.80M
    }
1604
20.3k
    return count;
1605
20.3k
}
1606
1607
class CEFinalizer : public CollationDataBuilder::CEModifier {
1608
public:
1609
3.82k
    CEFinalizer(const int64_t *ces) : finalCEs(ces) {}
1610
    virtual ~CEFinalizer();
1611
2.47M
    virtual int64_t modifyCE32(uint32_t ce32) const override {
1612
2.47M
        U_ASSERT(!Collation::isSpecialCE32(ce32));
1613
2.47M
        if(CollationBuilder::isTempCE32(ce32)) {
1614
            // retain case bits
1615
1.55M
            return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
1616
1.55M
        } else {
1617
922k
            return Collation::NO_CE;
1618
922k
        }
1619
2.47M
    }
1620
656k
    virtual int64_t modifyCE(int64_t ce) const override {
1621
656k
        if(CollationBuilder::isTempCE(ce)) {
1622
            // retain case bits
1623
54.3k
            return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);
1624
601k
        } else {
1625
601k
            return Collation::NO_CE;
1626
601k
        }
1627
656k
    }
1628
1629
private:
1630
    const int64_t *finalCEs;
1631
};
1632
1633
3.82k
CEFinalizer::~CEFinalizer() {}
1634
1635
void
1636
3.86k
CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
1637
3.86k
    if(U_FAILURE(errorCode)) { return; }
1638
3.82k
    LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode);
1639
3.82k
    if(U_FAILURE(errorCode)) {
1640
0
        return;
1641
0
    }
1642
3.82k
    newBuilder->initForTailoring(baseData, errorCode);
1643
3.82k
    CEFinalizer finalizer(nodes.getBuffer());
1644
3.82k
    newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);
1645
3.82k
    if(U_FAILURE(errorCode)) { return; }
1646
3.82k
    delete dataBuilder;
1647
3.82k
    dataBuilder = newBuilder.orphan();
1648
3.82k
}
1649
1650
int32_t
1651
25.7M
CollationBuilder::ceStrength(int64_t ce) {
1652
25.7M
    return
1653
25.7M
        isTempCE(ce) ? strengthFromTempCE(ce) :
1654
25.7M
        (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :
1655
10.6M
        (static_cast<uint32_t>(ce) & 0xff000000) != 0 ? UCOL_SECONDARY :
1656
616k
        ce != 0 ? UCOL_TERTIARY :
1657
1.98k
        UCOL_IDENTICAL;
1658
25.7M
}
1659
1660
U_NAMESPACE_END
1661
1662
U_NAMESPACE_USE
1663
1664
U_CAPI UCollator * U_EXPORT2
1665
ucol_openRules(const char16_t *rules, int32_t rulesLength,
1666
               UColAttributeValue normalizationMode, UCollationStrength strength,
1667
0
               UParseError *parseError, UErrorCode *pErrorCode) {
1668
0
    if(U_FAILURE(*pErrorCode)) { return nullptr; }
1669
0
    if(rules == nullptr && rulesLength != 0) {
1670
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1671
0
        return nullptr;
1672
0
    }
1673
0
    RuleBasedCollator *coll = new RuleBasedCollator();
1674
0
    if(coll == nullptr) {
1675
0
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
1676
0
        return nullptr;
1677
0
    }
1678
0
    UnicodeString r(rulesLength < 0, rules, rulesLength);
1679
0
    coll->internalBuildTailoring(r, strength, normalizationMode, parseError, nullptr, *pErrorCode);
1680
0
    if(U_FAILURE(*pErrorCode)) {
1681
0
        delete coll;
1682
0
        return nullptr;
1683
0
    }
1684
0
    return coll->toUCollator();
1685
0
}
1686
1687
static const int32_t internalBufferSize = 512;
1688
1689
// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp
1690
// because it calls UnicodeSet "builder" code that depends on all Unicode properties,
1691
// and the rest of the collation "runtime" code only depends on normalization.
1692
// This function is not related to the collation builder,
1693
// but it did not seem worth moving it into its own .cpp file,
1694
// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.
1695
U_CAPI int32_t U_EXPORT2
1696
ucol_getUnsafeSet( const UCollator *coll,
1697
                  USet *unsafe,
1698
                  UErrorCode *status)
1699
0
{
1700
0
    char16_t buffer[internalBufferSize];
1701
0
    int32_t len = 0;
1702
1703
0
    uset_clear(unsafe);
1704
1705
    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
1706
0
    static const char16_t cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
1707
0
                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
1708
1709
    // add chars that fail the fcd check
1710
0
    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
1711
1712
    // add lead/trail surrogates
1713
    // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,
1714
    // not when testing code *points*)
1715
0
    uset_addRange(unsafe, 0xd800, 0xdfff);
1716
1717
0
    USet *contractions = uset_open(0,0);
1718
1719
0
    int32_t i = 0, j = 0;
1720
0
    ucol_getContractionsAndExpansions(coll, contractions, nullptr, false, status);
1721
0
    int32_t contsSize = uset_size(contractions);
1722
0
    UChar32 c = 0;
1723
    // Contraction set consists only of strings
1724
    // to get unsafe code points, we need to
1725
    // break the strings apart and add them to the unsafe set
1726
0
    for(i = 0; i < contsSize; i++) {
1727
0
        len = uset_getItem(contractions, i, nullptr, nullptr, buffer, internalBufferSize, status);
1728
0
        if(len > 0) {
1729
0
            j = 0;
1730
0
            while(j < len) {
1731
0
                U16_NEXT(buffer, j, len, c);
1732
0
                if(j < len) {
1733
0
                    uset_add(unsafe, c);
1734
0
                }
1735
0
            }
1736
0
        }
1737
0
    }
1738
1739
0
    uset_close(contractions);
1740
1741
0
    return uset_size(unsafe);
1742
0
}
1743
1744
#endif  // !UCONFIG_NO_COLLATION