Coverage Report

Created: 2026-02-05 06:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/collationruleparser.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationruleparser.cpp
9
*
10
* (replaced the former ucol_tok.cpp)
11
*
12
* created on: 2013apr10
13
* created by: Markus W. Scherer
14
*/
15
16
#include "unicode/utypes.h"
17
18
#if !UCONFIG_NO_COLLATION
19
20
#include "unicode/normalizer2.h"
21
#include "unicode/parseerr.h"
22
#include "unicode/uchar.h"
23
#include "unicode/ucol.h"
24
#include "unicode/uloc.h"
25
#include "unicode/unistr.h"
26
#include "unicode/utf16.h"
27
#include "charstr.h"
28
#include "cmemory.h"
29
#include "collation.h"
30
#include "collationdata.h"
31
#include "collationruleparser.h"
32
#include "collationsettings.h"
33
#include "collationtailoring.h"
34
#include "cstring.h"
35
#include "patternprops.h"
36
#include "uassert.h"
37
#include "ulocimp.h"
38
#include "uvectr32.h"
39
40
U_NAMESPACE_BEGIN
41
42
namespace {

// The UTF-16 text "[before", which introduces the &[before n] reset form.
constexpr char16_t BEFORE[] = u"[before";
// Number of code units in BEFORE, not counting the terminating NUL.
constexpr int32_t BEFORE_LENGTH =
    static_cast<int32_t>(sizeof(BEFORE) / sizeof(BEFORE[0])) - 1;

}  // namespace
48
49
7.53k
// Out-of-line destructor of the Sink interface; it has nothing to release.
CollationRuleParser::Sink::~Sink() = default;
50
51
void
52
0
CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
53
54
void
55
0
CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
56
57
7.53k
// Out-of-line destructor of the Importer interface; it has nothing to release.
CollationRuleParser::Importer::~Importer() = default;
58
59
// Binds the NFD and NFC normalizer instances and remembers the base collation
// data. Every other pointer member starts out null and ruleIndex starts at 0.
// NOTE(review): the normalizer getters are dereferenced unconditionally;
// confirm they cannot return nullptr when errorCode reports a failure.
CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
          nfc(*Normalizer2::getNFCInstance(errorCode)),
          rules(nullptr),
          baseData(base),
          settings(nullptr),
          parseError(nullptr),
          errorReason(nullptr),
          sink(nullptr),
          importer(nullptr),
          ruleIndex(0) {}
67
68
7.53k
// The destructor body is empty: no cleanup is performed here.
CollationRuleParser::~CollationRuleParser() = default;
70
71
void
72
CollationRuleParser::parse(const UnicodeString &ruleString,
73
                           CollationSettings &outSettings,
74
                           UParseError *outParseError,
75
7.53k
                           UErrorCode &errorCode) {
76
7.53k
    if(U_FAILURE(errorCode)) { return; }
77
7.53k
    settings = &outSettings;
78
7.53k
    parseError = outParseError;
79
7.53k
    if(parseError != nullptr) {
80
0
        parseError->line = 0;
81
0
        parseError->offset = -1;
82
0
        parseError->preContext[0] = 0;
83
0
        parseError->postContext[0] = 0;
84
0
    }
85
7.53k
    errorReason = nullptr;
86
7.53k
    parse(ruleString, errorCode);
87
7.53k
}
88
89
void
90
12.9k
CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
91
12.9k
    if(U_FAILURE(errorCode)) { return; }
92
12.9k
    rules = &ruleString;
93
12.9k
    ruleIndex = 0;
94
95
383k
    while(ruleIndex < rules->length()) {
96
374k
        char16_t c = rules->charAt(ruleIndex);
97
374k
        if(PatternProps::isWhiteSpace(c)) {
98
408
            ++ruleIndex;
99
408
            continue;
100
408
        }
101
373k
        switch(c) {
102
357k
        case 0x26:  // '&'
103
357k
            parseRuleChain(errorCode);
104
357k
            break;
105
13.5k
        case 0x5b:  // '['
106
13.5k
            parseSetting(errorCode);
107
13.5k
            break;
108
1.32k
        case 0x23:  // '#' starts a comment, until the end of the line
109
1.32k
            ruleIndex = skipComment(ruleIndex + 1);
110
1.32k
            break;
111
548
        case 0x40:  // '@' is equivalent to [backwards 2]
112
548
            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
113
548
                              UCOL_ON, 0, errorCode);
114
548
            ++ruleIndex;
115
548
            break;
116
416
        case 0x21:  // '!' used to turn on Thai/Lao character reversal
117
            // Accept but ignore. The root collator has contractions
118
            // that are equivalent to the character reversal, where appropriate.
119
416
            ++ruleIndex;
120
416
            break;
121
265
        default:
122
265
            setParseError("expected a reset or setting or comment", errorCode);
123
265
            break;
124
373k
        }
125
373k
        if(U_FAILURE(errorCode)) { return; }
126
373k
    }
127
12.9k
}
128
129
void
130
357k
CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
131
357k
    int32_t resetStrength = parseResetAndPosition(errorCode);
132
357k
    UBool isFirstRelation = true;
133
1.10M
    for(;;) {
134
1.10M
        int32_t result = parseRelationOperator(errorCode);
135
1.10M
        if(U_FAILURE(errorCode)) { return; }
136
1.10M
        if(result < 0) {
137
358k
            if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
138
                // '#' starts a comment, until the end of the line
139
1.66k
                ruleIndex = skipComment(ruleIndex + 1);
140
1.66k
                continue;
141
1.66k
            }
142
357k
            if(isFirstRelation) {
143
456
                setParseError("reset not followed by a relation", errorCode);
144
456
            }
145
357k
            return;
146
358k
        }
147
747k
        int32_t strength = result & STRENGTH_MASK;
148
747k
        if(resetStrength < UCOL_IDENTICAL) {
149
            // reset-before rule chain
150
17.7k
            if(isFirstRelation) {
151
6.99k
                if(strength != resetStrength) {
152
0
                    setParseError("reset-before strength differs from its first relation", errorCode);
153
0
                    return;
154
0
                }
155
10.7k
            } else {
156
10.7k
                if(strength < resetStrength) {
157
0
                    setParseError("reset-before strength followed by a stronger relation", errorCode);
158
0
                    return;
159
0
                }
160
10.7k
            }
161
17.7k
        }
162
747k
        int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
163
747k
        if((result & STARRED_FLAG) == 0) {
164
735k
            parseRelationStrings(strength, i, errorCode);
165
735k
        } else {
166
12.2k
            parseStarredCharacters(strength, i, errorCode);
167
12.2k
        }
168
747k
        if(U_FAILURE(errorCode)) { return; }
169
747k
        isFirstRelation = false;
170
747k
    }
171
357k
}
172
173
// Parses "&", an optional "[before n]" (n = 1, 2 or 3) and the reset position,
// which is either a bracketed special position or a tailoring string.
// Reports the reset to the sink and advances ruleIndex past the position.
//
// Returns the reset strength: UCOL_PRIMARY..UCOL_TERTIARY for [before n],
// UCOL_IDENTICAL for a plain reset, or UCOL_DEFAULT if errorCode was already
// a failure or the position is missing.
int32_t
CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    int32_t pos = skipWhiteSpace(ruleIndex + 1);  // +1 skips the '&'
    int32_t resetStrength = UCOL_IDENTICAL;
    if(rules->compare(pos, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0) {
        int32_t digitIndex = pos + BEFORE_LENGTH;
        if(digitIndex < rules->length() &&
                PatternProps::isWhiteSpace(rules->charAt(digitIndex))) {
            digitIndex = skipWhiteSpace(digitIndex + 1);
            if(digitIndex + 1 < rules->length()) {
                const char16_t digit = rules->charAt(digitIndex);
                if(0x31 <= digit && digit <= 0x33 &&
                        rules->charAt(digitIndex + 1) == 0x5d) {
                    // &[before n] with n=1 or 2 or 3
                    resetStrength = UCOL_PRIMARY + (digit - 0x31);
                    pos = skipWhiteSpace(digitIndex + 2);
                }
            }
        }
    }
    if(pos >= rules->length()) {
        setParseError("reset without position", errorCode);
        return UCOL_DEFAULT;
    }
    UnicodeString position;
    if(rules->charAt(pos) == 0x5b) {  // '['
        pos = parseSpecialPosition(pos, position, errorCode);
    } else {
        pos = parseTailoringString(pos, position, errorCode);
    }
    sink->addReset(resetStrength, position, errorReason, errorCode);
    if(U_FAILURE(errorCode)) { setErrorContext(); }
    ruleIndex = pos;
    return resetStrength;
}
207
208
// Recognizes a relation operator at the next non-white-space position and
// leaves ruleIndex pointing at its first character.
//
// Operators: "<", "<<", "<<<", "<<<<", ";" (same as <<), "," (same as <<<)
// and "="; the '<' forms and '=' may be followed by '*' (starred).
//
// Returns UCOL_DEFAULT if there is no operator (or errorCode is a failure).
// Otherwise returns the strength, combined with STARRED_FLAG when starred and
// with the operator length stored above OFFSET_SHIFT.
int32_t
CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    ruleIndex = skipWhiteSpace(ruleIndex);
    const int32_t limit = rules->length();
    if(ruleIndex >= limit) { return UCOL_DEFAULT; }

    // Strength selected by the number of consecutive '<' characters (1..4).
    static const int32_t LESS_THAN_STRENGTHS[] = {
        UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY
    };

    int32_t end = ruleIndex + 1;  // index just past the operator so far
    int32_t strength;
    bool mayBeStarred = false;
    switch(rules->charAt(ruleIndex)) {
    case 0x3c: {  // '<'
        int32_t extra = 0;  // additional '<' after the first one, at most 3
        while(extra < 3 && end < limit && rules->charAt(end) == 0x3c) {
            ++end;
            ++extra;
        }
        strength = LESS_THAN_STRENGTHS[extra];
        mayBeStarred = true;
        break;
    }
    case 0x3b:  // ';' same as <<
        strength = UCOL_SECONDARY;
        break;
    case 0x2c:  // ',' same as <<<
        strength = UCOL_TERTIARY;
        break;
    case 0x3d:  // '='
        strength = UCOL_IDENTICAL;
        mayBeStarred = true;
        break;
    default:
        return UCOL_DEFAULT;
    }
    if(mayBeStarred && end < limit && rules->charAt(end) == 0x2a) {  // '*'
        ++end;
        strength |= STARRED_FLAG;
    }
    return ((end - ruleIndex) << OFFSET_SHIFT) | strength;
}
257
258
void
259
735k
CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
260
    // Parse
261
    //     prefix | str / extension
262
    // where prefix and extension are optional.
263
735k
    UnicodeString prefix, str, extension;
264
735k
    i = parseTailoringString(i, str, errorCode);
265
735k
    if(U_FAILURE(errorCode)) { return; }
266
735k
    char16_t next = (i < rules->length()) ? rules->charAt(i) : 0;
267
735k
    if(next == 0x7c) {  // '|' separates the context prefix from the string.
268
4.98k
        prefix = str;
269
4.98k
        i = parseTailoringString(i + 1, str, errorCode);
270
4.98k
        if(U_FAILURE(errorCode)) { return; }
271
4.98k
        next = (i < rules->length()) ? rules->charAt(i) : 0;
272
4.98k
    }
273
735k
    if(next == 0x2f) {  // '/' separates the string from the extension.
274
1.04k
        i = parseTailoringString(i + 1, extension, errorCode);
275
1.04k
    }
276
735k
    if(!prefix.isEmpty()) {
277
4.98k
        UChar32 prefix0 = prefix.char32At(0);
278
4.98k
        UChar32 c = str.char32At(0);
279
4.98k
        if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
280
8
            setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
281
8
                          errorCode);
282
8
            return;
283
8
        }
284
4.98k
    }
285
735k
    sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
286
735k
    if(U_FAILURE(errorCode)) { setErrorContext(); }
287
735k
    ruleIndex = i;
288
735k
}
289
290
void
291
12.2k
CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
292
12.2k
    UnicodeString empty, raw;
293
12.2k
    i = parseString(skipWhiteSpace(i), raw, errorCode);
294
12.2k
    if(U_FAILURE(errorCode)) { return; }
295
12.2k
    if(raw.isEmpty()) {
296
15
        setParseError("missing starred-relation string", errorCode);
297
15
        return;
298
15
    }
299
12.2k
    UChar32 prev = -1;
300
12.2k
    int32_t j = 0;
301
18.8k
    for(;;) {
302
209k
        while(j < raw.length()) {
303
190k
            UChar32 c = raw.char32At(j);
304
190k
            if(!nfd.isInert(c)) {
305
24
                setParseError("starred-relation string is not all NFD-inert", errorCode);
306
24
                return;
307
24
            }
308
190k
            sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
309
190k
            if(U_FAILURE(errorCode)) {
310
2
                setErrorContext();
311
2
                return;
312
2
            }
313
190k
            j += U16_LENGTH(c);
314
190k
            prev = c;
315
190k
        }
316
18.8k
        if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
317
12.1k
            break;
318
12.1k
        }
319
6.70k
        if(prev < 0) {
320
3
            setParseError("range without start in starred-relation string", errorCode);
321
3
            return;
322
3
        }
323
6.70k
        i = parseString(i + 1, raw, errorCode);
324
6.70k
        if(U_FAILURE(errorCode)) { return; }
325
6.70k
        if(raw.isEmpty()) {
326
4
            setParseError("range without end in starred-relation string", errorCode);
327
4
            return;
328
4
        }
329
6.70k
        UChar32 c = raw.char32At(0);
330
6.70k
        if(c < prev) {
331
2
            setParseError("range start greater than end in starred-relation string", errorCode);
332
2
            return;
333
2
        }
334
        // range prev-c
335
6.69k
        UnicodeString s;
336
4.11M
        while(++prev <= c) {
337
4.10M
            if(!nfd.isInert(prev)) {
338
27
                setParseError("starred-relation string range is not all NFD-inert", errorCode);
339
27
                return;
340
27
            }
341
4.10M
            if(U_IS_SURROGATE(prev)) {
342
1
                setParseError("starred-relation string range contains a surrogate", errorCode);
343
1
                return;
344
1
            }
345
4.10M
            if(0xfffd <= prev && prev <= 0xffff) {
346
1
                setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
347
1
                return;
348
1
            }
349
4.10M
            s.setTo(prev);
350
4.10M
            sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
351
4.10M
            if(U_FAILURE(errorCode)) {
352
0
                setErrorContext();
353
0
                return;
354
0
            }
355
4.10M
        }
356
6.67k
        prev = -1;
357
6.67k
        j = U16_LENGTH(c);
358
6.67k
    }
359
12.1k
    ruleIndex = skipWhiteSpace(i);
360
12.1k
}
361
362
// Skips white space, parses one string into raw, and skips trailing white
// space. An empty result is reported as a parse error (unless errorCode is
// already a failure). Returns the index after the trailing white space.
int32_t
CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    const int32_t start = skipWhiteSpace(i);
    const int32_t end = parseString(start, raw, errorCode);
    if(U_SUCCESS(errorCode) && raw.isEmpty()) {
        setParseError("missing relation string", errorCode);
    }
    return skipWhiteSpace(end);
}
370
371
// Reads one string starting at index i into raw (which is cleared first).
//
// The string ends at unquoted white space, at any syntax character other
// than apostrophe or backslash, or at the end of the rules.
//   ''      encodes a single apostrophe, inside and outside of quotes
//   '...'   quotes literal text
//   \c      takes the following code point literally
// After reading, the result is rejected if it contains an unpaired surrogate
// or U+FFFD..U+FFFF.
//
// Returns the index of the first unconsumed code unit; returns i unchanged if
// errorCode is already a failure.
int32_t
CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return i; }
    raw.remove();
    const int32_t limit = rules->length();
    while(i < limit) {
        const char16_t unit = rules->charAt(i);
        if(!isSyntaxChar(unit)) {
            if(PatternProps::isWhiteSpace(unit)) {
                // Unquoted white space terminates a string.
                break;
            }
            raw.append(unit);
            ++i;
            continue;
        }
        if(unit == 0x27) {  // apostrophe
            ++i;
            if(i < limit && rules->charAt(i) == 0x27) {
                // Double apostrophe, encodes a single one.
                raw.append(static_cast<char16_t>(0x27));
                ++i;
                continue;
            }
            // Quote literal text until the next single apostrophe.
            for(;;) {
                if(i == limit) {
                    setParseError("quoted literal text missing terminating apostrophe", errorCode);
                    return i;
                }
                const char16_t quoted = rules->charAt(i++);
                if(quoted == 0x27) {
                    if(i < limit && rules->charAt(i) == 0x27) {
                        // Double apostrophe inside quoted literal text,
                        // still encodes a single apostrophe.
                        ++i;
                    } else {
                        break;
                    }
                }
                raw.append(quoted);
            }
        } else if(unit == 0x5c) {  // backslash
            ++i;
            if(i == limit) {
                setParseError("backslash escape at the end of the rule string", errorCode);
                return i;
            }
            const UChar32 escaped = rules->char32At(i);
            raw.append(escaped);
            i += U16_LENGTH(escaped);
        } else {
            // Any other syntax character terminates a string.
            break;
        }
    }
    int32_t offset = 0;
    while(offset < raw.length()) {
        const UChar32 cp = raw.char32At(offset);
        if(U_IS_SURROGATE(cp)) {
            setParseError("string contains an unpaired surrogate", errorCode);
            return i;
        }
        if(0xfffd <= cp && cp <= 0xffff) {
            setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
            return i;
        }
        offset += U16_LENGTH(cp);
    }
    return i;
}
438
439
namespace {

// Names of the special reset positions that may appear in brackets after '&'.
// A name's index in this array is the offset that gets added to POS_BASE
// when the position is encoded, so the order must not change.
constexpr const char* positions[] = {
    "first tertiary ignorable",   //  0
    "last tertiary ignorable",    //  1
    "first secondary ignorable",  //  2
    "last secondary ignorable",   //  3
    "first primary ignorable",    //  4
    "last primary ignorable",     //  5
    "first variable",             //  6
    "last variable",              //  7
    "first regular",              //  8
    "last regular",               //  9
    "first implicit",             // 10
    "last implicit",              // 11
    "first trailing",             // 12
    "last trailing"               // 13
};

}  // namespace
459
460
// Parses a bracketed special reset position such as "[first regular]",
// starting at the '[' at index i, and encodes it into str as POS_LEAD
// followed by POS_BASE plus the position offset.
//
// Besides the names in positions[], "top" (encoded as LAST_REGULAR) and
// "variable top" (encoded as LAST_VARIABLE) are accepted.
//
// Returns the index after the closing ']' on success. Returns i after
// reporting a parse error, or 0 if errorCode is already a failure.
int32_t
CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    UnicodeString words;
    const int32_t wordsEnd = readWords(i + 1, words);
    // Stores the encoded position and yields the index after the ']'.
    const auto encode = [&](int32_t offset) -> int32_t {
        str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + offset));
        return wordsEnd + 1;
    };
    // words end with ]
    if(wordsEnd > i && rules->charAt(wordsEnd) == 0x5d && !words.isEmpty()) {
        for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
            if(words == UnicodeString(positions[pos], -1, US_INV)) {
                return encode(pos);
            }
        }
        if(words == UNICODE_STRING_SIMPLE("top")) {
            return encode(LAST_REGULAR);
        }
        if(words == UNICODE_STRING_SIMPLE("variable top")) {
            return encode(LAST_VARIABLE);
        }
    }
    setParseError("not a valid special reset position", errorCode);
    return i;
}
485
486
void
487
13.5k
CollationRuleParser::parseSetting(UErrorCode &errorCode) {
488
13.5k
    if(U_FAILURE(errorCode)) { return; }
489
13.5k
    UnicodeString raw;
490
13.5k
    int32_t i = ruleIndex + 1;
491
13.5k
    int32_t j = readWords(i, raw);
492
13.5k
    if(j <= i || raw.isEmpty()) {
493
257
        setParseError("expected a setting/option at '['", errorCode);
494
257
    }
495
13.5k
    if(rules->charAt(j) == 0x5d) {  // words end with ]
496
11.5k
        ++j;
497
11.5k
        if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
498
3.37k
                (raw.length() == 7 || raw.charAt(7) == 0x20)) {
499
3.37k
            parseReordering(raw, errorCode);
500
3.37k
            ruleIndex = j;
501
3.37k
            return;
502
3.37k
        }
503
8.13k
        if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
504
10
            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
505
10
                              UCOL_ON, 0, errorCode);
506
10
            ruleIndex = j;
507
10
            return;
508
10
        }
509
8.12k
        UnicodeString v;
510
8.12k
        int32_t valueIndex = raw.lastIndexOf(static_cast<char16_t>(0x20));
511
8.12k
        if(valueIndex >= 0) {
512
7.98k
            v.setTo(raw, valueIndex + 1);
513
7.98k
            raw.truncate(valueIndex);
514
7.98k
        }
515
8.12k
        if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
516
286
            int32_t value = UCOL_DEFAULT;
517
286
            char16_t c = v.charAt(0);
518
286
            if(0x31 <= c && c <= 0x34) {  // 1..4
519
44
                value = UCOL_PRIMARY + (c - 0x31);
520
242
            } else if(c == 0x49) {  // 'I'
521
241
                value = UCOL_IDENTICAL;
522
241
            }
523
286
            if(value != UCOL_DEFAULT) {
524
285
                settings->setStrength(value, 0, errorCode);
525
285
                ruleIndex = j;
526
285
                return;
527
285
            }
528
7.84k
        } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
529
514
            UColAttributeValue value = UCOL_DEFAULT;
530
514
            if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
531
0
                value = UCOL_NON_IGNORABLE;
532
514
            } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
533
513
                value = UCOL_SHIFTED;
534
513
            }
535
514
            if(value != UCOL_DEFAULT) {
536
513
                settings->setAlternateHandling(value, 0, errorCode);
537
513
                ruleIndex = j;
538
513
                return;
539
513
            }
540
7.32k
        } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
541
0
            int32_t value = UCOL_DEFAULT;
542
0
            if(v == UNICODE_STRING_SIMPLE("space")) {
543
0
                value = CollationSettings::MAX_VAR_SPACE;
544
0
            } else if(v == UNICODE_STRING_SIMPLE("punct")) {
545
0
                value = CollationSettings::MAX_VAR_PUNCT;
546
0
            } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
547
0
                value = CollationSettings::MAX_VAR_SYMBOL;
548
0
            } else if(v == UNICODE_STRING_SIMPLE("currency")) {
549
0
                value = CollationSettings::MAX_VAR_CURRENCY;
550
0
            }
551
0
            if(value != UCOL_DEFAULT) {
552
0
                settings->setMaxVariable(value, 0, errorCode);
553
0
                settings->variableTop = baseData->getLastPrimaryForGroup(
554
0
                    UCOL_REORDER_CODE_FIRST + value);
555
0
                U_ASSERT(settings->variableTop != 0);
556
0
                ruleIndex = j;
557
0
                return;
558
0
            }
559
7.32k
        } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
560
122
            UColAttributeValue value = UCOL_DEFAULT;
561
122
            if(v == UNICODE_STRING_SIMPLE("off")) {
562
0
                value = UCOL_OFF;
563
122
            } else if(v == UNICODE_STRING_SIMPLE("lower")) {
564
0
                value = UCOL_LOWER_FIRST;
565
122
            } else if(v == UNICODE_STRING_SIMPLE("upper")) {
566
121
                value = UCOL_UPPER_FIRST;
567
121
            }
568
122
            if(value != UCOL_DEFAULT) {
569
121
                settings->setCaseFirst(value, 0, errorCode);
570
121
                ruleIndex = j;
571
121
                return;
572
121
            }
573
7.20k
        } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
574
1
            UColAttributeValue value = getOnOffValue(v);
575
1
            if(value != UCOL_DEFAULT) {
576
0
                settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
577
0
                ruleIndex = j;
578
0
                return;
579
0
            }
580
7.20k
        } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
581
1.47k
            UColAttributeValue value = getOnOffValue(v);
582
1.47k
            if(value != UCOL_DEFAULT) {
583
1.47k
                settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
584
1.47k
                ruleIndex = j;
585
1.47k
                return;
586
1.47k
            }
587
5.72k
        } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
588
0
            UColAttributeValue value = getOnOffValue(v);
589
0
            if(value != UCOL_DEFAULT) {
590
0
                settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
591
0
                ruleIndex = j;
592
0
                return;
593
0
            }
594
5.72k
        } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
595
1
            UColAttributeValue value = getOnOffValue(v);
596
1
            if(value != UCOL_DEFAULT) {
597
0
                if(value == UCOL_ON) {
598
0
                    setParseError("[hiraganaQ on] is not supported", errorCode);
599
0
                }
600
0
                ruleIndex = j;
601
0
                return;
602
0
            }
603
5.72k
        } else if(raw == UNICODE_STRING_SIMPLE("import")) {
604
5.65k
            CharString lang;
605
5.65k
            lang.appendInvariantChars(v, errorCode);
606
5.65k
            if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
607
            // BCP 47 language tag -> ICU locale ID
608
5.65k
            int32_t parsedLength;
609
5.65k
            CharString localeID = ulocimp_forLanguageTag(lang.data(), -1, &parsedLength, errorCode);
610
5.65k
            if(U_FAILURE(errorCode) || parsedLength != lang.length()) {
611
165
                errorCode = U_ZERO_ERROR;
612
165
                setParseError("expected language tag in [import langTag]", errorCode);
613
165
                return;
614
165
            }
615
            // localeID minus all keywords
616
5.48k
            CharString baseID = ulocimp_getBaseName(localeID.toStringPiece(), errorCode);
617
5.48k
            if (U_FAILURE(errorCode)) {
618
0
                errorCode = U_ZERO_ERROR;
619
0
                setParseError("expected language tag in [import langTag]", errorCode);
620
0
                return;
621
0
            }
622
5.48k
            if (baseID.isEmpty()) {
623
160
                baseID.copyFrom("root", errorCode);
624
5.32k
            } else if (baseID[0] == '_') {
625
                // CharString doesn't have any insert() method, only append().
626
21
                constexpr char und[] = "und";
627
21
                constexpr int32_t length = sizeof und - 1;
628
21
                int32_t dummy;
629
21
                char* tail = baseID.getAppendBuffer(length, length, dummy, errorCode);
630
21
                char* head = baseID.data();
631
21
                uprv_memmove(head + length, head, baseID.length());
632
21
                uprv_memcpy(head, und, length);
633
21
                baseID.append(tail, length, errorCode);
634
21
            }
635
            // @collation=type, or length=0 if not specified
636
5.48k
            CharString collationType = ulocimp_getKeywordValue(localeID.data(), "collation", errorCode);
637
5.48k
            if(U_FAILURE(errorCode)) {
638
0
                errorCode = U_ZERO_ERROR;
639
0
                setParseError("expected language tag in [import langTag]", errorCode);
640
0
                return;
641
0
            }
642
5.48k
            if(importer == nullptr) {
643
0
                setParseError("[import langTag] is not supported", errorCode);
644
5.48k
            } else {
645
5.48k
                UnicodeString importedRules;
646
5.48k
                importer->getRules(baseID.data(),
647
5.48k
                                   !collationType.isEmpty() ? collationType.data() : "standard",
648
5.48k
                                   importedRules, errorReason, errorCode);
649
5.48k
                if(U_FAILURE(errorCode)) {
650
66
                    if(errorReason == nullptr) {
651
66
                        errorReason = "[import langTag] failed";
652
66
                    }
653
66
                    setErrorContext();
654
66
                    return;
655
66
                }
656
5.41k
                const UnicodeString *outerRules = rules;
657
5.41k
                int32_t outerRuleIndex = ruleIndex;
658
5.41k
                parse(importedRules, errorCode);
659
5.41k
                if(U_FAILURE(errorCode)) {
660
17
                    if(parseError != nullptr) {
661
0
                        parseError->offset = outerRuleIndex;
662
0
                    }
663
17
                }
664
5.41k
                rules = outerRules;
665
5.41k
                ruleIndex = j;
666
5.41k
            }
667
5.41k
            return;
668
5.48k
        }
669
8.12k
    } else if(rules->charAt(j) == 0x5b) {  // words end with [
670
1.89k
        UnicodeSet set;
671
1.89k
        j = parseUnicodeSet(j, set, errorCode);
672
1.89k
        if(U_FAILURE(errorCode)) { return; }
673
337
        if(raw == UNICODE_STRING_SIMPLE("optimize")) {
674
92
            sink->optimize(set, errorReason, errorCode);
675
92
            if(U_FAILURE(errorCode)) { setErrorContext(); }
676
92
            ruleIndex = j;
677
92
            return;
678
245
        } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
679
214
            sink->suppressContractions(set, errorReason, errorCode);
680
214
            if(U_FAILURE(errorCode)) { setErrorContext(); }
681
214
            ruleIndex = j;
682
214
            return;
683
214
        }
684
337
    }
685
214
    setParseError("not a valid setting/option", errorCode);
686
214
}
687
688
void
689
3.37k
CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
690
3.37k
    if(U_FAILURE(errorCode)) { return; }
691
3.37k
    int32_t i = 7;  // after "reorder"
692
3.37k
    if(i == raw.length()) {
693
        // empty [reorder] with no codes
694
0
        settings->resetReordering();
695
0
        return;
696
0
    }
697
    // Parse the codes in [reorder aa bb cc].
698
3.37k
    UVector32 reorderCodes(errorCode);
699
3.37k
    if(U_FAILURE(errorCode)) { return; }
700
3.37k
    CharString word;
701
12.5k
    while(i < raw.length()) {
702
9.19k
        ++i;  // skip the word-separating space
703
9.19k
        int32_t limit = raw.indexOf(static_cast<char16_t>(0x20), i);
704
9.19k
        if(limit < 0) { limit = raw.length(); }
705
9.19k
        word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
706
9.19k
        if(U_FAILURE(errorCode)) { return; }
707
9.19k
        int32_t code = getReorderCode(word.data());
708
9.19k
        if(code < 0) {
709
0
            setParseError("unknown script or reorder code", errorCode);
710
0
            return;
711
0
        }
712
9.19k
        reorderCodes.addElement(code, errorCode);
713
9.19k
        if(U_FAILURE(errorCode)) { return; }
714
9.19k
        i = limit;
715
9.19k
    }
716
3.37k
    settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
717
3.37k
}
718
719
// Names of the special reorder groups that getReorderCode() compares against
// with uprv_stricmp(). The position of a name in this array is the offset that
// getReorderCode() adds to UCOL_REORDER_CODE_FIRST, so the order matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
722
723
// Maps one word of a reorder setting to a reorder code.
// Lookup order: the special group names in gSpecialReorderCodes, then script
// property value names, then the keyword "others".
// Returns -1 if the word matches none of them.
int32_t
CollationRuleParser::getReorderCode(const char *word) {
    int32_t specialOffset = 0;
    for(const char *specialName : gSpecialReorderCodes) {
        if(uprv_stricmp(word, specialName) == 0) {
            return UCOL_REORDER_CODE_FIRST + specialOffset;
        }
        ++specialOffset;
    }
    const int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
    if(script >= 0) {
        return script;
    }
    if(uprv_stricmp(word, "others") != 0) {
        return -1;
    }
    return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
}
739
740
// Translates the setting value "on" or "off" into UCOL_ON or UCOL_OFF.
// Any other string yields UCOL_DEFAULT.
UColAttributeValue
CollationRuleParser::getOnOffValue(const UnicodeString &s) {
    if(s == UNICODE_STRING_SIMPLE("on")) {
        return UCOL_ON;
    }
    if(s == UNICODE_STRING_SIMPLE("off")) {
        return UCOL_OFF;
    }
    return UCOL_DEFAULT;
}
750
751
// Reads a UnicodeSet pattern starting at rules index i, applies it to set, and
// then requires the ']' that terminates the enclosing option.
// The pattern extends from i through the ']' that brings the bracket nesting
// count back to zero (i is presumably the index of the opening '[').
// Returns the index after the option-terminating ']' on success; on a parse
// error, sets the error and returns the index where scanning stopped.
int32_t
CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
    // Find the end of the pattern by counting bracket nesting.
    int32_t depth = 0;
    int32_t pos = i;
    bool closed = false;
    while(!closed) {
        if(pos == rules->length()) {
            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
            return pos;
        }
        switch(rules->charAt(pos++)) {
        case 0x5b:  // '['
            ++depth;
            break;
        case 0x5d:  // ']'
            closed = (--depth == 0);
            break;
        default:
            break;
        }
    }
    set.applyPattern(rules->tempSubStringBetween(i, pos), errorCode);
    if(U_FAILURE(errorCode)) {
        // Replace the applyPattern() failure with the parser's own error code and reason.
        errorCode = U_ZERO_ERROR;
        setParseError("not a valid UnicodeSet pattern", errorCode);
        return pos;
    }
    pos = skipWhiteSpace(pos);
    if(pos != rules->length() && rules->charAt(pos) == 0x5d) {
        return pos + 1;  // step over the option-terminating ']'
    }
    setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
    return pos;
}
781
782
// Collects the words starting at rules index i into raw.
// Leading white space is skipped, each later run of white space becomes one
// U+0020, and a trailing space is dropped. Collection stops at the first
// syntax character other than '-' and '_'.
// Returns the index of that syntax character, or 0 if the rules end first.
int32_t
CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
    const char16_t space = 0x20;
    raw.remove();
    int32_t pos = skipWhiteSpace(i);
    while(pos < rules->length()) {
        const char16_t c = rules->charAt(pos);
        if(c != 0x2d && c != 0x5f && isSyntaxChar(c)) {  // syntax except -_
            if(!raw.isEmpty() && raw.endsWith(&space, 1)) {
                // Drop the space that was appended for white space before this character.
                raw.truncate(raw.length() - 1);
            }
            return pos;
        }
        if(PatternProps::isWhiteSpace(c)) {
            // Collapse the whole run of white space into a single space.
            raw.append(space);
            pos = skipWhiteSpace(pos + 1);
        } else {
            raw.append(c);
            ++pos;
        }
    }
    return 0;
}
806
807
// Advances from rules index i to just past the next line terminator.
// Returns the index after that terminator, or the end of the rules if there is none.
int32_t
CollationRuleParser::skipComment(int32_t i) const {
    while(i < rules->length()) {
        switch(rules->charAt(i++)) {
        // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
        // NLF (new line function) = CR or LF or CR+LF or NEL.
        // CR+LF needs no special handling because a following LF will be ignored anyway.
        case 0xa:     // LF
        case 0xc:     // FF
        case 0xd:     // CR
        case 0x85:    // NEL
        case 0x2028:  // LS
        case 0x2029:  // PS
            return i;
        default:
            break;
        }
    }
    return i;
}
822
823
void
824
3.25k
CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
825
3.25k
    if(U_FAILURE(errorCode)) { return; }
826
    // Error code consistent with the old parser (from ca. 2001),
827
    // rather than U_PARSE_ERROR;
828
3.09k
    errorCode = U_INVALID_FORMAT_ERROR;
829
3.09k
    errorReason = reason;
830
3.09k
    if(parseError != nullptr) { setErrorContext(); }
831
3.09k
}
832
833
void
834
546
CollationRuleParser::setErrorContext() {
835
546
    if(parseError == nullptr) { return; }
836
837
    // Note: This relies on the calling code maintaining the ruleIndex
838
    // at a position that is useful for debugging.
839
    // For example, at the beginning of a reset or relation etc.
840
0
    parseError->offset = ruleIndex;
841
0
    parseError->line = 0;  // We are not counting line numbers.
842
843
    // before ruleIndex
844
0
    int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
845
0
    if(start < 0) {
846
0
        start = 0;
847
0
    } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
848
0
        ++start;
849
0
    }
850
0
    int32_t length = ruleIndex - start;
851
0
    rules->extract(start, length, parseError->preContext);
852
0
    parseError->preContext[length] = 0;
853
854
    // starting from ruleIndex
855
0
    length = rules->length() - ruleIndex;
856
0
    if(length >= U_PARSE_CONTEXT_LEN) {
857
0
        length = U_PARSE_CONTEXT_LEN - 1;
858
0
        if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
859
0
            --length;
860
0
        }
861
0
    }
862
0
    rules->extract(ruleIndex, length, parseError->postContext);
863
0
    parseError->postContext[length] = 0;
864
0
}
865
866
// Returns true for the code points in 0x21..0x7e that lie outside the ranges
// 0x30..0x39, 0x41..0x5a and 0x61..0x7a (ASCII digits and letters).
UBool
CollationRuleParser::isSyntaxChar(UChar32 c) {
    if(c < 0x21 || 0x7e < c) { return false; }
    if(c <= 0x2f) { return true; }   // 0x21..0x2f
    if(c < 0x3a) { return false; }   // 0x30..0x39
    if(c <= 0x40) { return true; }   // 0x3a..0x40
    if(c < 0x5b) { return false; }   // 0x41..0x5a
    if(c <= 0x60) { return true; }   // 0x5b..0x60
    return 0x7b <= c;                // 0x61..0x7a are not syntax, 0x7b..0x7e are
}
872
873
// Returns the index of the first character at or after rules index i that is
// not Pattern_White_Space, or the end of the rules if there is none.
int32_t
CollationRuleParser::skipWhiteSpace(int32_t i) const {
    for(; i < rules->length(); ++i) {
        if(!PatternProps::isWhiteSpace(rules->charAt(i))) {
            break;
        }
    }
    return i;
}
880
881
U_NAMESPACE_END
882
883
#endif  // !UCONFIG_NO_COLLATION