Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/i18n/collationruleparser.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationruleparser.cpp
9
*
10
* (replaced the former ucol_tok.cpp)
11
*
12
* created on: 2013apr10
13
* created by: Markus W. Scherer
14
*/
15
16
#include "unicode/utypes.h"
17
18
#if !UCONFIG_NO_COLLATION
19
20
#include "unicode/normalizer2.h"
21
#include "unicode/parseerr.h"
22
#include "unicode/uchar.h"
23
#include "unicode/ucol.h"
24
#include "unicode/uloc.h"
25
#include "unicode/unistr.h"
26
#include "unicode/utf16.h"
27
#include "charstr.h"
28
#include "cmemory.h"
29
#include "collation.h"
30
#include "collationdata.h"
31
#include "collationruleparser.h"
32
#include "collationsettings.h"
33
#include "collationtailoring.h"
34
#include "cstring.h"
35
#include "patternprops.h"
36
#include "uassert.h"
37
#include "uvectr32.h"
38
39
U_NAMESPACE_BEGIN
40
41
namespace {
42
43
static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
44
const int32_t BEFORE_LENGTH = 7;
45
46
}  // namespace
47
48
0
// Sink destructor, defined out of line; it has nothing to release.
CollationRuleParser::Sink::~Sink() {
}
49
50
void
51
0
CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52
53
void
54
0
CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55
56
0
// Importer destructor, defined out of line; it has nothing to release.
CollationRuleParser::Importer::~Importer() {
}
57
58
// Creates a parser that works against the given base collation data.
//
// The NFD and NFC normalizer instances are looked up with errorCode.
// Every other pointer member starts out NULL and the rule index starts at 0.
//
// NOTE(review): the results of getNFDInstance()/getNFCInstance() are
// dereferenced without a NULL check -- confirm that callers check errorCode
// before they use the parser.
CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
          nfc(*Normalizer2::getNFCInstance(errorCode)),
          rules(NULL),
          baseData(base),
          settings(NULL),
          parseError(NULL),
          errorReason(NULL),
          sink(NULL),
          importer(NULL),
          ruleIndex(0) {
}
66
67
0
// Destructor; the body is empty, so nothing is released explicitly here.
CollationRuleParser::~CollationRuleParser() {}
69
70
void
71
CollationRuleParser::parse(const UnicodeString &ruleString,
72
                           CollationSettings &outSettings,
73
                           UParseError *outParseError,
74
0
                           UErrorCode &errorCode) {
75
0
    if(U_FAILURE(errorCode)) { return; }
76
0
    settings = &outSettings;
77
0
    parseError = outParseError;
78
0
    if(parseError != NULL) {
79
0
        parseError->line = 0;
80
0
        parseError->offset = -1;
81
0
        parseError->preContext[0] = 0;
82
0
        parseError->postContext[0] = 0;
83
0
    }
84
0
    errorReason = NULL;
85
0
    parse(ruleString, errorCode);
86
0
}
87
88
void
89
0
CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90
0
    if(U_FAILURE(errorCode)) { return; }
91
0
    rules = &ruleString;
92
0
    ruleIndex = 0;
93
0
94
0
    while(ruleIndex < rules->length()) {
95
0
        UChar c = rules->charAt(ruleIndex);
96
0
        if(PatternProps::isWhiteSpace(c)) {
97
0
            ++ruleIndex;
98
0
            continue;
99
0
        }
100
0
        switch(c) {
101
0
        case 0x26:  // '&'
102
0
            parseRuleChain(errorCode);
103
0
            break;
104
0
        case 0x5b:  // '['
105
0
            parseSetting(errorCode);
106
0
            break;
107
0
        case 0x23:  // '#' starts a comment, until the end of the line
108
0
            ruleIndex = skipComment(ruleIndex + 1);
109
0
            break;
110
0
        case 0x40:  // '@' is equivalent to [backwards 2]
111
0
            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112
0
                              UCOL_ON, 0, errorCode);
113
0
            ++ruleIndex;
114
0
            break;
115
0
        case 0x21:  // '!' used to turn on Thai/Lao character reversal
116
0
            // Accept but ignore. The root collator has contractions
117
0
            // that are equivalent to the character reversal, where appropriate.
118
0
            ++ruleIndex;
119
0
            break;
120
0
        default:
121
0
            setParseError("expected a reset or setting or comment", errorCode);
122
0
            break;
123
0
        }
124
0
        if(U_FAILURE(errorCode)) { return; }
125
0
    }
126
0
}
127
128
void
129
0
CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130
0
    int32_t resetStrength = parseResetAndPosition(errorCode);
131
0
    UBool isFirstRelation = TRUE;
132
0
    for(;;) {
133
0
        int32_t result = parseRelationOperator(errorCode);
134
0
        if(U_FAILURE(errorCode)) { return; }
135
0
        if(result < 0) {
136
0
            if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137
0
                // '#' starts a comment, until the end of the line
138
0
                ruleIndex = skipComment(ruleIndex + 1);
139
0
                continue;
140
0
            }
141
0
            if(isFirstRelation) {
142
0
                setParseError("reset not followed by a relation", errorCode);
143
0
            }
144
0
            return;
145
0
        }
146
0
        int32_t strength = result & STRENGTH_MASK;
147
0
        if(resetStrength < UCOL_IDENTICAL) {
148
0
            // reset-before rule chain
149
0
            if(isFirstRelation) {
150
0
                if(strength != resetStrength) {
151
0
                    setParseError("reset-before strength differs from its first relation", errorCode);
152
0
                    return;
153
0
                }
154
0
            } else {
155
0
                if(strength < resetStrength) {
156
0
                    setParseError("reset-before strength followed by a stronger relation", errorCode);
157
0
                    return;
158
0
                }
159
0
            }
160
0
        }
161
0
        int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
162
0
        if((result & STARRED_FLAG) == 0) {
163
0
            parseRelationStrings(strength, i, errorCode);
164
0
        } else {
165
0
            parseStarredCharacters(strength, i, errorCode);
166
0
        }
167
0
        if(U_FAILURE(errorCode)) { return; }
168
0
        isFirstRelation = FALSE;
169
0
    }
170
0
}
171
172
// Parses the reset that starts at ruleIndex (the character at ruleIndex is
// skipped as the reset operator), including an optional "[before n]"
// modifier with n=1..3, and reports it to the sink via addReset().
//
// Returns the reset strength: UCOL_PRIMARY + (n-1) for "[before n]",
// otherwise UCOL_IDENTICAL. Returns UCOL_DEFAULT if errorCode was already
// a failure or if no position follows the reset.
// On return ruleIndex is set to the index after the reset position.
int32_t
CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    int32_t i = skipWhiteSpace(ruleIndex + 1);
    int32_t resetStrength = UCOL_IDENTICAL;
    if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0) {
        int32_t j = i + BEFORE_LENGTH;
        // "[before" must be followed by white space, one digit 1..3, and ']'.
        if(j < rules->length() && PatternProps::isWhiteSpace(rules->charAt(j))) {
            j = skipWhiteSpace(j + 1);
            if((j + 1) < rules->length()) {
                const UChar digit = rules->charAt(j);
                if(0x31 <= digit && digit <= 0x33 && rules->charAt(j + 1) == 0x5d) {
                    // &[before n] with n=1 or 2 or 3
                    resetStrength = UCOL_PRIMARY + (digit - 0x31);
                    i = skipWhiteSpace(j + 2);
                }
            }
        }
    }
    if(i >= rules->length()) {
        setParseError("reset without position", errorCode);
        return UCOL_DEFAULT;
    }
    UnicodeString str;
    // '[' introduces a special position such as [first variable];
    // anything else is an ordinary tailoring string.
    i = (rules->charAt(i) == 0x5b) ?
            parseSpecialPosition(i, str, errorCode) :
            parseTailoringString(i, str, errorCode);
    sink->addReset(resetStrength, str, errorReason, errorCode);
    if(U_FAILURE(errorCode)) { setErrorContext(); }
    ruleIndex = i;
    return resetStrength;
}
206
207
// Recognizes a relation operator at the next non-white-space position.
//
// Operators: '<', '<<', '<<<', '<<<<' (primary..quaternary),
// ';' (same as '<<'), ',' (same as '<<<'), '=' (identical).
// The '<' forms and '=' may be followed by '*' for the starred variant.
//
// ruleIndex is advanced past leading white space but not past the operator.
// Returns UCOL_DEFAULT if errorCode is a failure, at the end of the rules,
// or if there is no operator. Otherwise returns the strength, optionally
// combined with STARRED_FLAG, with the operator length stored in the bits
// above OFFSET_SHIFT.
int32_t
CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    ruleIndex = skipWhiteSpace(ruleIndex);
    if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
    int32_t i = ruleIndex;
    const UChar op = rules->charAt(i++);
    int32_t strength;
    UBool mayBeStarred;
    if(op == 0x3c) {  // '<'
        // Strength by number of consecutive '<' characters (1..4).
        const int32_t byCount[] = {
            UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY
        };
        int32_t count = 1;
        while(count < 4 && i < rules->length() && rules->charAt(i) == 0x3c) {
            ++i;
            ++count;
        }
        strength = byCount[count - 1];
        mayBeStarred = TRUE;
    } else if(op == 0x3b) {  // ';' same as <<
        strength = UCOL_SECONDARY;
        mayBeStarred = FALSE;
    } else if(op == 0x2c) {  // ',' same as <<<
        strength = UCOL_TERTIARY;
        mayBeStarred = FALSE;
    } else if(op == 0x3d) {  // '='
        strength = UCOL_IDENTICAL;
        mayBeStarred = TRUE;
    } else {
        return UCOL_DEFAULT;
    }
    if(mayBeStarred && i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
        ++i;
        strength |= STARRED_FLAG;
    }
    return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
}
256
257
void
258
0
CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259
0
    // Parse
260
0
    //     prefix | str / extension
261
0
    // where prefix and extension are optional.
262
0
    UnicodeString prefix, str, extension;
263
0
    i = parseTailoringString(i, str, errorCode);
264
0
    if(U_FAILURE(errorCode)) { return; }
265
0
    UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266
0
    if(next == 0x7c) {  // '|' separates the context prefix from the string.
267
0
        prefix = str;
268
0
        i = parseTailoringString(i + 1, str, errorCode);
269
0
        if(U_FAILURE(errorCode)) { return; }
270
0
        next = (i < rules->length()) ? rules->charAt(i) : 0;
271
0
    }
272
0
    if(next == 0x2f) {  // '/' separates the string from the extension.
273
0
        i = parseTailoringString(i + 1, extension, errorCode);
274
0
    }
275
0
    if(!prefix.isEmpty()) {
276
0
        UChar32 prefix0 = prefix.char32At(0);
277
0
        UChar32 c = str.char32At(0);
278
0
        if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279
0
            setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280
0
                          errorCode);
281
0
            return;
282
0
        }
283
0
    }
284
0
    sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285
0
    if(U_FAILURE(errorCode)) { setErrorContext(); }
286
0
    ruleIndex = i;
287
0
}
288
289
void
290
0
CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291
0
    UnicodeString empty, raw;
292
0
    i = parseString(skipWhiteSpace(i), raw, errorCode);
293
0
    if(U_FAILURE(errorCode)) { return; }
294
0
    if(raw.isEmpty()) {
295
0
        setParseError("missing starred-relation string", errorCode);
296
0
        return;
297
0
    }
298
0
    UChar32 prev = -1;
299
0
    int32_t j = 0;
300
0
    for(;;) {
301
0
        while(j < raw.length()) {
302
0
            UChar32 c = raw.char32At(j);
303
0
            if(!nfd.isInert(c)) {
304
0
                setParseError("starred-relation string is not all NFD-inert", errorCode);
305
0
                return;
306
0
            }
307
0
            sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308
0
            if(U_FAILURE(errorCode)) {
309
0
                setErrorContext();
310
0
                return;
311
0
            }
312
0
            j += U16_LENGTH(c);
313
0
            prev = c;
314
0
        }
315
0
        if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
316
0
            break;
317
0
        }
318
0
        if(prev < 0) {
319
0
            setParseError("range without start in starred-relation string", errorCode);
320
0
            return;
321
0
        }
322
0
        i = parseString(i + 1, raw, errorCode);
323
0
        if(U_FAILURE(errorCode)) { return; }
324
0
        if(raw.isEmpty()) {
325
0
            setParseError("range without end in starred-relation string", errorCode);
326
0
            return;
327
0
        }
328
0
        UChar32 c = raw.char32At(0);
329
0
        if(c < prev) {
330
0
            setParseError("range start greater than end in starred-relation string", errorCode);
331
0
            return;
332
0
        }
333
0
        // range prev-c
334
0
        UnicodeString s;
335
0
        while(++prev <= c) {
336
0
            if(!nfd.isInert(prev)) {
337
0
                setParseError("starred-relation string range is not all NFD-inert", errorCode);
338
0
                return;
339
0
            }
340
0
            if(U_IS_SURROGATE(prev)) {
341
0
                setParseError("starred-relation string range contains a surrogate", errorCode);
342
0
                return;
343
0
            }
344
0
            if(0xfffd <= prev && prev <= 0xffff) {
345
0
                setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346
0
                return;
347
0
            }
348
0
            s.setTo(prev);
349
0
            sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350
0
            if(U_FAILURE(errorCode)) {
351
0
                setErrorContext();
352
0
                return;
353
0
            }
354
0
        }
355
0
        prev = -1;
356
0
        j = U16_LENGTH(c);
357
0
    }
358
0
    ruleIndex = skipWhiteSpace(i);
359
0
}
360
361
// Parses a non-empty string at index i after skipping leading white space.
// The string is stored in raw; an empty result is a parse error.
// Returns the index after the string and any trailing white space.
int32_t
CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    const int32_t start = skipWhiteSpace(i);
    const int32_t limit = parseString(start, raw, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(raw.isEmpty()) {
            setParseError("missing relation string", errorCode);
        }
    }
    return skipWhiteSpace(limit);
}
369
370
// Reads one string from the rules, starting at index i, into raw
// (raw is cleared first).
//
// Handling of input characters:
//   ''            a doubled apostrophe encodes a single apostrophe
//   '...'         quoted literal text, up to the next single apostrophe
//   \x            backslash escapes the following code point
//   other syntax  terminates the string (not consumed)
//   white space   terminates the string (not consumed)
//   anything else is appended as is
// The result must not contain unpaired surrogates or U+FFFD..U+FFFF.
//
// Returns the index after the string. If errorCode is already a failure,
// returns i unchanged without touching raw.
int32_t
CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return i; }
    raw.remove();
    while(i < rules->length()) {
        UChar32 c = rules->charAt(i++);
        if(!isSyntaxChar(c)) {
            if(PatternProps::isWhiteSpace(c)) {
                // Unquoted white space terminates a string.
                --i;
                break;
            }
            raw.append((UChar)c);
            continue;
        }
        if(c == 0x27) {  // apostrophe
            if(i < rules->length() && rules->charAt(i) == 0x27) {
                // Double apostrophe, encodes a single one.
                raw.append((UChar)0x27);
                ++i;
                continue;
            }
            // Quote literal text until the next single apostrophe.
            for(;;) {
                if(i == rules->length()) {
                    setParseError("quoted literal text missing terminating apostrophe", errorCode);
                    return i;
                }
                c = rules->charAt(i++);
                if(c == 0x27) {
                    const UBool doubled =
                        i < rules->length() && rules->charAt(i) == 0x27;
                    if(!doubled) {
                        break;
                    }
                    // Double apostrophe inside quoted literal text,
                    // still encodes a single apostrophe.
                    ++i;
                }
                raw.append((UChar)c);
            }
        } else if(c == 0x5c) {  // backslash
            if(i == rules->length()) {
                setParseError("backslash escape at the end of the rule string", errorCode);
                return i;
            }
            // The escaped item is a whole code point, possibly supplementary.
            const UChar32 escaped = rules->char32At(i);
            raw.append(escaped);
            i += U16_LENGTH(escaped);
        } else {
            // Any other syntax character terminates a string.
            --i;
            break;
        }
    }
    // Validate the collected string code point by code point.
    int32_t j = 0;
    while(j < raw.length()) {
        const UChar32 cp = raw.char32At(j);
        if(U_IS_SURROGATE(cp)) {
            setParseError("string contains an unpaired surrogate", errorCode);
            return i;
        }
        if(0xfffd <= cp && cp <= 0xffff) {
            setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
            return i;
        }
        j += U16_LENGTH(cp);
    }
    return i;
}
437
438
namespace {

// Names of the special reset positions, as written inside [brackets].
// The array index of a name is the value that parseSpecialPosition() adds
// to POS_BASE, so the order of these entries must not change.
const char *const positions[] = {
    "first tertiary ignorable", "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable", "last primary ignorable",
    "first variable", "last variable",
    "first regular", "last regular",
    "first implicit", "last implicit",
    "first trailing", "last trailing"
};

}  // namespace
458
459
// Parses a special reset position such as "[first variable]" whose '['
// is at index i.
//
// On success, str is set to POS_LEAD followed by POS_BASE plus the position
// number, and the index after the closing ']' is returned.
// "top" and "variable top" are accepted as aliases for LAST_REGULAR and
// LAST_VARIABLE. Otherwise a parse error is set and i is returned.
// Returns 0 if errorCode is already a failure.
int32_t
CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    UnicodeString raw;
    const int32_t wordsLimit = readWords(i + 1, raw);
    // The words must be non-empty and end with ']'.
    if(wordsLimit <= i || rules->charAt(wordsLimit) != 0x5d || raw.isEmpty()) {
        setParseError("not a valid special reset position", errorCode);
        return i;
    }
    UBool found = FALSE;
    int32_t position = 0;
    for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
        if(raw == UnicodeString(positions[pos], -1, US_INV)) {
            position = pos;
            found = TRUE;
            break;
        }
    }
    if(!found) {
        if(raw == UNICODE_STRING_SIMPLE("top")) {
            position = LAST_REGULAR;
            found = TRUE;
        } else if(raw == UNICODE_STRING_SIMPLE("variable top")) {
            position = LAST_VARIABLE;
            found = TRUE;
        }
    }
    if(!found) {
        setParseError("not a valid special reset position", errorCode);
        return i;
    }
    str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + position));
    return wordsLimit + 1;  // skip the ']'
}
484
485
void
486
0
CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487
0
    if(U_FAILURE(errorCode)) { return; }
488
0
    UnicodeString raw;
489
0
    int32_t i = ruleIndex + 1;
490
0
    int32_t j = readWords(i, raw);
491
0
    if(j <= i || raw.isEmpty()) {
492
0
        setParseError("expected a setting/option at '['", errorCode);
493
0
    }
494
0
    if(rules->charAt(j) == 0x5d) {  // words end with ]
495
0
        ++j;
496
0
        if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497
0
                (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498
0
            parseReordering(raw, errorCode);
499
0
            ruleIndex = j;
500
0
            return;
501
0
        }
502
0
        if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503
0
            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504
0
                              UCOL_ON, 0, errorCode);
505
0
            ruleIndex = j;
506
0
            return;
507
0
        }
508
0
        UnicodeString v;
509
0
        int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510
0
        if(valueIndex >= 0) {
511
0
            v.setTo(raw, valueIndex + 1);
512
0
            raw.truncate(valueIndex);
513
0
        }
514
0
        if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515
0
            int32_t value = UCOL_DEFAULT;
516
0
            UChar c = v.charAt(0);
517
0
            if(0x31 <= c && c <= 0x34) {  // 1..4
518
0
                value = UCOL_PRIMARY + (c - 0x31);
519
0
            } else if(c == 0x49) {  // 'I'
520
0
                value = UCOL_IDENTICAL;
521
0
            }
522
0
            if(value != UCOL_DEFAULT) {
523
0
                settings->setStrength(value, 0, errorCode);
524
0
                ruleIndex = j;
525
0
                return;
526
0
            }
527
0
        } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528
0
            UColAttributeValue value = UCOL_DEFAULT;
529
0
            if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530
0
                value = UCOL_NON_IGNORABLE;
531
0
            } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532
0
                value = UCOL_SHIFTED;
533
0
            }
534
0
            if(value != UCOL_DEFAULT) {
535
0
                settings->setAlternateHandling(value, 0, errorCode);
536
0
                ruleIndex = j;
537
0
                return;
538
0
            }
539
0
        } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540
0
            int32_t value = UCOL_DEFAULT;
541
0
            if(v == UNICODE_STRING_SIMPLE("space")) {
542
0
                value = CollationSettings::MAX_VAR_SPACE;
543
0
            } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544
0
                value = CollationSettings::MAX_VAR_PUNCT;
545
0
            } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546
0
                value = CollationSettings::MAX_VAR_SYMBOL;
547
0
            } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548
0
                value = CollationSettings::MAX_VAR_CURRENCY;
549
0
            }
550
0
            if(value != UCOL_DEFAULT) {
551
0
                settings->setMaxVariable(value, 0, errorCode);
552
0
                settings->variableTop = baseData->getLastPrimaryForGroup(
553
0
                    UCOL_REORDER_CODE_FIRST + value);
554
0
                U_ASSERT(settings->variableTop != 0);
555
0
                ruleIndex = j;
556
0
                return;
557
0
            }
558
0
        } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559
0
            UColAttributeValue value = UCOL_DEFAULT;
560
0
            if(v == UNICODE_STRING_SIMPLE("off")) {
561
0
                value = UCOL_OFF;
562
0
            } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563
0
                value = UCOL_LOWER_FIRST;
564
0
            } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565
0
                value = UCOL_UPPER_FIRST;
566
0
            }
567
0
            if(value != UCOL_DEFAULT) {
568
0
                settings->setCaseFirst(value, 0, errorCode);
569
0
                ruleIndex = j;
570
0
                return;
571
0
            }
572
0
        } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573
0
            UColAttributeValue value = getOnOffValue(v);
574
0
            if(value != UCOL_DEFAULT) {
575
0
                settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576
0
                ruleIndex = j;
577
0
                return;
578
0
            }
579
0
        } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580
0
            UColAttributeValue value = getOnOffValue(v);
581
0
            if(value != UCOL_DEFAULT) {
582
0
                settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583
0
                ruleIndex = j;
584
0
                return;
585
0
            }
586
0
        } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587
0
            UColAttributeValue value = getOnOffValue(v);
588
0
            if(value != UCOL_DEFAULT) {
589
0
                settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590
0
                ruleIndex = j;
591
0
                return;
592
0
            }
593
0
        } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594
0
            UColAttributeValue value = getOnOffValue(v);
595
0
            if(value != UCOL_DEFAULT) {
596
0
                if(value == UCOL_ON) {
597
0
                    setParseError("[hiraganaQ on] is not supported", errorCode);
598
0
                }
599
0
                ruleIndex = j;
600
0
                return;
601
0
            }
602
0
        } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603
0
            CharString lang;
604
0
            lang.appendInvariantChars(v, errorCode);
605
0
            if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606
0
            // BCP 47 language tag -> ICU locale ID
607
0
            char localeID[ULOC_FULLNAME_CAPACITY];
608
0
            int32_t parsedLength;
609
0
            int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610
0
                                                 &parsedLength, &errorCode);
611
0
            if(U_FAILURE(errorCode) ||
612
0
                    parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613
0
                errorCode = U_ZERO_ERROR;
614
0
                setParseError("expected language tag in [import langTag]", errorCode);
615
0
                return;
616
0
            }
617
0
            // localeID minus all keywords
618
0
            char baseID[ULOC_FULLNAME_CAPACITY];
619
0
            length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620
0
            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621
0
                errorCode = U_ZERO_ERROR;
622
0
                setParseError("expected language tag in [import langTag]", errorCode);
623
0
                return;
624
0
            }
625
0
            if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
626
0
                uprv_strcpy(baseID, "root");
627
0
            }
628
0
            // @collation=type, or length=0 if not specified
629
0
            char collationType[ULOC_KEYWORDS_CAPACITY];
630
0
            length = uloc_getKeywordValue(localeID, "collation",
631
0
                                          collationType, ULOC_KEYWORDS_CAPACITY,
632
0
                                          &errorCode);
633
0
            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
634
0
                errorCode = U_ZERO_ERROR;
635
0
                setParseError("expected language tag in [import langTag]", errorCode);
636
0
                return;
637
0
            }
638
0
            if(importer == NULL) {
639
0
                setParseError("[import langTag] is not supported", errorCode);
640
0
            } else {
641
0
                UnicodeString importedRules;
642
0
                importer->getRules(baseID, length > 0 ? collationType : "standard",
643
0
                                   importedRules, errorReason, errorCode);
644
0
                if(U_FAILURE(errorCode)) {
645
0
                    if(errorReason == NULL) {
646
0
                        errorReason = "[import langTag] failed";
647
0
                    }
648
0
                    setErrorContext();
649
0
                    return;
650
0
                }
651
0
                const UnicodeString *outerRules = rules;
652
0
                int32_t outerRuleIndex = ruleIndex;
653
0
                parse(importedRules, errorCode);
654
0
                if(U_FAILURE(errorCode)) {
655
0
                    if(parseError != NULL) {
656
0
                        parseError->offset = outerRuleIndex;
657
0
                    }
658
0
                }
659
0
                rules = outerRules;
660
0
                ruleIndex = j;
661
0
            }
662
0
            return;
663
0
        }
664
0
    } else if(rules->charAt(j) == 0x5b) {  // words end with [
665
0
        UnicodeSet set;
666
0
        j = parseUnicodeSet(j, set, errorCode);
667
0
        if(U_FAILURE(errorCode)) { return; }
668
0
        if(raw == UNICODE_STRING_SIMPLE("optimize")) {
669
0
            sink->optimize(set, errorReason, errorCode);
670
0
            if(U_FAILURE(errorCode)) { setErrorContext(); }
671
0
            ruleIndex = j;
672
0
            return;
673
0
        } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
674
0
            sink->suppressContractions(set, errorReason, errorCode);
675
0
            if(U_FAILURE(errorCode)) { setErrorContext(); }
676
0
            ruleIndex = j;
677
0
            return;
678
0
        }
679
0
    }
680
0
    setParseError("not a valid setting/option", errorCode);
681
0
}
682
683
void
684
0
CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
685
0
    if(U_FAILURE(errorCode)) { return; }
686
0
    int32_t i = 7;  // after "reorder"
687
0
    if(i == raw.length()) {
688
0
        // empty [reorder] with no codes
689
0
        settings->resetReordering();
690
0
        return;
691
0
    }
692
0
    // Parse the codes in [reorder aa bb cc].
693
0
    UVector32 reorderCodes(errorCode);
694
0
    if(U_FAILURE(errorCode)) { return; }
695
0
    CharString word;
696
0
    while(i < raw.length()) {
697
0
        ++i;  // skip the word-separating space
698
0
        int32_t limit = raw.indexOf((UChar)0x20, i);
699
0
        if(limit < 0) { limit = raw.length(); }
700
0
        word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
701
0
        if(U_FAILURE(errorCode)) { return; }
702
0
        int32_t code = getReorderCode(word.data());
703
0
        if(code < 0) {
704
0
            setParseError("unknown script or reorder code", errorCode);
705
0
            return;
706
0
        }
707
0
        reorderCodes.addElement(code, errorCode);
708
0
        if(U_FAILURE(errorCode)) { return; }
709
0
        i = limit;
710
0
    }
711
0
    settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
712
0
}
713
714
// Names of the special (non-script) reorder groups.
// getReorderCode() maps the entry at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of the entries is significant.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
717
718
// Maps a reorder code name to its numeric value.
// Lookup order: the special group names in gSpecialReorderCodes (case-insensitive),
// then Unicode script property value names, then the keyword "others".
// Returns -1 if the word matches none of these.
int32_t
CollationRuleParser::getReorderCode(const char *word) {
    const int32_t specialCount = UPRV_LENGTHOF(gSpecialReorderCodes);
    int32_t index = 0;
    while(index < specialCount) {
        if(uprv_stricmp(word, gSpecialReorderCodes[index]) == 0) {
            return UCOL_REORDER_CODE_FIRST + index;
        }
        ++index;
    }
    const int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
    if(script >= 0) {
        return script;
    }
    // "others" is the same as Zzzz = USCRIPT_UNKNOWN.
    return uprv_stricmp(word, "others") == 0 ? UCOL_REORDER_CODE_OTHERS : -1;
}
734
735
// Translates the setting value "on" or "off" into UCOL_ON or UCOL_OFF.
// Any other string yields UCOL_DEFAULT.
UColAttributeValue
CollationRuleParser::getOnOffValue(const UnicodeString &s) {
    if(s == UNICODE_STRING_SIMPLE("on")) { return UCOL_ON; }
    if(s == UNICODE_STRING_SIMPLE("off")) { return UCOL_OFF; }
    return UCOL_DEFAULT;
}
745
746
// Reads a UnicodeSet pattern that starts at rules index i and applies it to set,
// then expects the ']' that terminates the enclosing option.
// Returns the index after that terminating ']' on success.
// On failure, sets a parse error and returns the index where scanning stopped.
int32_t
CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
    // Find the end of the pattern by matching its [brackets].
    int32_t depth = 0;
    int32_t pos = i;
    for(bool closed = false; !closed;) {
        if(pos == rules->length()) {
            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
            return pos;
        }
        const UChar c = rules->charAt(pos++);
        if(c == 0x5d) {  // ']'
            // Done only when this ']' closes the outermost '['.
            closed = (--depth == 0);
        } else if(c == 0x5b) {  // '['
            ++depth;
        }
    }
    set.applyPattern(rules->tempSubStringBetween(i, pos), errorCode);
    if(U_FAILURE(errorCode)) {
        // Clear the failure first: setParseError() does nothing if errorCode is already a failure.
        errorCode = U_ZERO_ERROR;
        setParseError("not a valid UnicodeSet pattern", errorCode);
        return pos;
    }
    pos = skipWhiteSpace(pos);
    if(pos != rules->length() && rules->charAt(pos) == 0x5d) {
        return pos + 1;
    }
    setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
    return pos;
}
776
777
// Copies words from the rules, starting at index i, into raw.
// Leading white space is skipped, each inner run of white space becomes one U+0020,
// and a trailing space is removed.
// Reading stops at the first syntax character other than '-' and '_';
// the index of that character is returned.
// Returns 0 if the end of the rules is reached before such a character.
int32_t
CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
    static const UChar kSpace = 0x20;
    raw.remove();
    int32_t pos = skipWhiteSpace(i);
    while(pos < rules->length()) {
        const UChar c = rules->charAt(pos);
        // '-' (0x2d) and '_' (0x5f) are allowed inside words although they are syntax characters.
        const bool isTerminator = c != 0x2d && c != 0x5f && isSyntaxChar(c);
        if(isTerminator) {
            if(!raw.isEmpty() && raw.endsWith(&kSpace, 1)) {
                // Drop the space that was appended for white space before the terminator.
                raw.truncate(raw.length() - 1);
            }
            return pos;
        }
        if(PatternProps::isWhiteSpace(c)) {
            raw.append(kSpace);
            pos = skipWhiteSpace(pos + 1);
        } else {
            raw.append(c);
            ++pos;
        }
    }
    return 0;
}
801
802
// Advances from rules index i to just past the next line terminator.
// Returns the index after the terminator, or the rules length if there is none.
int32_t
CollationRuleParser::skipComment(int32_t i) const {
    const int32_t length = rules->length();
    while(i < length) {
        // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
        // NLF (new line function) = CR or LF or CR+LF or NEL.
        // CR+LF need not be consumed as a pair because a following LF will be ignored anyway.
        switch(rules->charAt(i++)) {
        case 0xa:     // LF
        case 0xc:     // FF
        case 0xd:     // CR
        case 0x85:    // NEL
        case 0x2028:  // LS
        case 0x2029:  // PS
            return i;
        default:
            break;
        }
    }
    return i;
}
817
818
void
819
0
CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
820
0
    if(U_FAILURE(errorCode)) { return; }
821
0
    // Error code consistent with the old parser (from ca. 2001),
822
0
    // rather than U_PARSE_ERROR;
823
0
    errorCode = U_INVALID_FORMAT_ERROR;
824
0
    errorReason = reason;
825
0
    if(parseError != NULL) { setErrorContext(); }
826
0
}
827
828
void
829
0
CollationRuleParser::setErrorContext() {
830
0
    if(parseError == NULL) { return; }
831
0
832
0
    // Note: This relies on the calling code maintaining the ruleIndex
833
0
    // at a position that is useful for debugging.
834
0
    // For example, at the beginning of a reset or relation etc.
835
0
    parseError->offset = ruleIndex;
836
0
    parseError->line = 0;  // We are not counting line numbers.
837
0
838
0
    // before ruleIndex
839
0
    int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
840
0
    if(start < 0) {
841
0
        start = 0;
842
0
    } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
843
0
        ++start;
844
0
    }
845
0
    int32_t length = ruleIndex - start;
846
0
    rules->extract(start, length, parseError->preContext);
847
0
    parseError->preContext[length] = 0;
848
0
849
0
    // starting from ruleIndex
850
0
    length = rules->length() - ruleIndex;
851
0
    if(length >= U_PARSE_CONTEXT_LEN) {
852
0
        length = U_PARSE_CONTEXT_LEN - 1;
853
0
        if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
854
0
            --length;
855
0
        }
856
0
    }
857
0
    rules->extract(ruleIndex, length, parseError->postContext);
858
0
    parseError->postContext[length] = 0;
859
0
}
860
861
// Returns true for the ASCII characters U+0021..U+007E
// that are neither digits nor letters.
UBool
CollationRuleParser::isSyntaxChar(UChar32 c) {
    const bool isDigit = 0x30 <= c && c <= 0x39;
    const bool isUpper = 0x41 <= c && c <= 0x5a;
    const bool isLower = 0x61 <= c && c <= 0x7a;
    return 0x21 <= c && c <= 0x7e && !(isDigit || isUpper || isLower);
}
867
868
// Returns the index of the first character at or after rules index i
// that is not Pattern_White_Space, or the rules length if there is none.
int32_t
CollationRuleParser::skipWhiteSpace(int32_t i) const {
    const int32_t length = rules->length();
    for(; i < length; ++i) {
        if(!PatternProps::isWhiteSpace(rules->charAt(i))) {
            break;
        }
    }
    return i;
}
875
876
U_NAMESPACE_END
877
878
#endif  // !UCONFIG_NO_COLLATION