Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/messageformat2_parser.cpp
Line
Count
Source
1
// © 2024 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_NORMALIZATION
7
8
#if !UCONFIG_NO_FORMATTING
9
10
#if !UCONFIG_NO_MF2
11
12
#include "unicode/uniset.h"
13
#include "messageformat2_errors.h"
14
#include "messageformat2_macros.h"
15
#include "messageformat2_parser.h"
16
#include "ucln_in.h"
17
#include "umutex.h"
18
#include "uvector.h" // U_ASSERT
19
20
U_NAMESPACE_BEGIN
21
22
namespace message2 {
23
24
using namespace pluralimpl;
25
26
using namespace data_model;
27
28
/*
29
    The `ERROR()` macro sets a syntax error in the context
30
    and sets the offset in `parseError` to `index`. It does not alter control flow.
31
*/
32
// Relies on `errors`, `parseError` and `index` being visible at the expansion
// site. Does nothing if a syntax error has already been recorded.
#define ERROR(errorCode)                  \
    if (!errors.hasSyntaxError()) {       \
        setParseError(parseError, index); \
        errors.addSyntaxError(errorCode); \
    }
37
38
// Same as `ERROR()`, except that the reported position is `i` rather than
// the current `index`.
#define ERROR_AT(errorCode, i)            \
    if (!errors.hasSyntaxError()) {       \
        setParseError(parseError, i);     \
        errors.addSyntaxError(errorCode); \
    }
43
44
// Increments the line number and updates the "characters seen before
45
// current line" count in `parseError`, iff `peek()` is a newline
46
11.7M
void Parser::maybeAdvanceLine() {
47
11.7M
    if (peek() == LF) {
48
79.5k
        parseError.line++;
49
        // add 1 to index to get the number of characters seen so far
50
        // (including the newline)
51
79.5k
        parseError.lengthBeforeCurrentLine = index + 1;
52
79.5k
    }
53
11.7M
}
54
55
/*
56
    Signals a syntax error and returns from the enclosing function
57
    if `index` is out of bounds for the string `source`
58
*/
59
// Expands to a bare `return;`, so it can only be used inside functions
// returning void.
#define CHECK_BOUNDS(errorCode) \
    if (!inBounds()) {          \
        ERROR(errorCode);       \
        return;                 \
    }
64
// Variant of `CHECK_BOUNDS()` that tests `inBounds(1)` and reports the error
// at `index + 1`. Also expands to a bare `return;`.
#define CHECK_BOUNDS_1(errorCode)       \
    if (!inBounds(1)) {                 \
        ERROR_AT(errorCode, index + 1); \
        return;                         \
    }
69
70
// -------------------------------------
71
// Helper functions
72
73
13.6k
// Copies `in` to `out` up to and including the first NUL, or all
// U_PARSE_CONTEXT_LEN elements if `in` contains no NUL.
static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
    int32_t pos = 0;
    while (pos < U_PARSE_CONTEXT_LEN) {
        const UChar unit = in[pos];
        out[pos] = unit;
        if (unit == '\0') {
            // The terminator has been copied; nothing after it matters.
            break;
        }
        ++pos;
    }
}
81
82
6.80k
/* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
83
6.80k
    parseError.line = messageParseError.line;
84
6.80k
    parseError.offset = messageParseError.offset;
85
6.80k
    copyContext(messageParseError.preContext, parseError.preContext);
86
6.80k
    copyContext(messageParseError.postContext, parseError.postContext);
87
6.80k
}
88
89
6.62k
/* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
    // TODO: Fill this in with actual pre and post-context
    parseError.postContext[0] = 0;
    parseError.preContext[0] = 0;

    // `index` counts every character seen so far; subtracting the characters
    // that precede the current line turns it into an offset within that line.
    parseError.offset = index - parseError.lengthBeforeCurrentLine;
}
97
98
// -------------------------------------
99
// Initialization of UnicodeSets
100
101
namespace unisets {
102
103
UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {};
104
105
71.2k
inline UnicodeSet* getImpl(Key key) {
106
71.2k
    return gUnicodeSets[key];
107
71.2k
}
108
109
icu::UInitOnce gMF2ParseUniSetsInitOnce {};
110
}
111
112
1
// Builds the frozen set of `content-char` code points.
// Returns nullptr if `status` is already a failure or allocation fails
// (setting U_MEMORY_ALLOCATION_ERROR in the latter case).
UnicodeSet* initContentChars(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // Inclusive ranges added after the initial 0x0001..0x0008 range.
    static constexpr UChar32 kFurtherRanges[][2] = {
        {0x000B, 0x000C},   // Omit CR
        {0x000E, 0x001F},   // Omit SP
        {0x0021, 0x002D},   // Omit '.'
        {0x002F, 0x003F},   // Omit '@'
        {0x0041, 0x005B},   // Omit '\'
        {0x005D, 0x007A},   // Omit { | }
        {0x007E, 0x2FFF},   // Omit IDEOGRAPHIC_SPACE
        {0x3001, 0x10FFFF}, // Allowing surrogates is intentional
    };

    UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    for (const auto& range : kFurtherRanges) {
        result->add(range[0], range[1]);
    }
    result->freeze();
    return result;
}
133
134
1
// Builds the frozen set containing SPACE, HTAB, CR, LF and IDEOGRAPHIC_SPACE.
// Returns nullptr if `status` is already a failure or allocation fails.
UnicodeSet* initWhitespace(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet();
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    result->add(SPACE)
        .add(HTAB)
        .add(CR)
        .add(LF)
        .add(IDEOGRAPHIC_SPACE);
    result->freeze();
    return result;
}
152
153
1
// Builds the frozen set of bidi control characters:
// U+061C, U+200E..U+200F and U+2066..U+2069.
//
// Returns nullptr, with `status` set to a failure code, if `status` is
// already a failure on entry, if allocation fails, or if the UnicodeSet
// constructor reports an error.
UnicodeSet* initBidiControls(UErrorCode& status) {
    // Don't allocate anything when there is already an error to report.
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status);
    if (result == nullptr) {
        // Allocation failed, so the constructor never ran and `status` is
        // untouched; report the failure instead of dereferencing null below.
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // The set was allocated but could not be built; don't leak it.
        delete result;
        return nullptr;
    }
    result->add(0x200E, 0x200F);
    result->add(0x2066, 0x2069);
    result->freeze();
    return result;
}
163
164
1
// Builds the frozen set described by the pattern "[:letter:]".
//
// Returns nullptr, with `status` set to a failure code, if `status` is
// already a failure on entry, if allocation fails, or if the UnicodeSet
// constructor reports an error.
UnicodeSet* initAlpha(UErrorCode& status) {
    // Don't allocate anything when there is already an error to report.
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status);
    if (result == nullptr) {
        // Allocation failed, so the constructor never ran and `status` is
        // untouched; report the failure instead of dereferencing null below.
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // The set was allocated but could not be built; don't leak it.
        delete result;
        return nullptr;
    }
    result->freeze();
    return result;
}
172
173
1
// Builds the frozen set described by the pattern "[:number:]".
//
// Returns nullptr, with `status` set to a failure code, if `status` is
// already a failure on entry, if allocation fails, or if the UnicodeSet
// constructor reports an error.
UnicodeSet* initDigits(UErrorCode& status) {
    // Don't allocate anything when there is already an error to report.
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status);
    if (result == nullptr) {
        // Allocation failed, so the constructor never ran and `status` is
        // untouched; report the failure instead of dereferencing null below.
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // The set was allocated but could not be built; don't leak it.
        delete result;
        return nullptr;
    }
    result->freeze();
    return result;
}
181
182
1
// Builds the frozen set of `name-start` code points. As a side effect, the
// ALPHA slot of `unisets::gUnicodeSets` is filled in with the result of
// `initAlpha()`.
// Returns nullptr if `status` is (or becomes) a failure.
UnicodeSet* initNameStartChars(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    UnicodeSet* result = new UnicodeSet();
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    // Inclusive ranges within the Basic Multilingual Plane.
    static constexpr UChar32 kBmpRanges[][2] = {
        {0x00A1, 0x061B},
        {0x061D, 0x167F},
        {0x1681, 0x1FFF},
        {0x200B, 0x200D},
        {0x2010, 0x2027},
        {0x2030, 0x205E},
        {0x2060, 0x2065},
        {0x206A, 0x2FFF},
        {0x3001, 0xD7FF},
        {0xE000, 0xFDCF},
        {0xFDF0, 0xFFFD},
    };

    result->addAll(*isAlpha);
    result->add(0x002B);
    result->add(0x005F);
    for (const auto& range : kBmpRanges) {
        result->add(range[0], range[1]);
    }
    // Planes 1 through 16: everything except the last two code points of
    // each plane, i.e. 0x10000..0x1FFFD up to 0x100000..0x10FFFD.
    for (UChar32 planeStart = 0x10000; planeStart <= 0x100000; planeStart += 0x10000) {
        result->add(planeStart, planeStart + 0xFFFD);
    }
    result->freeze();
    return result;
}
230
231
1
// Builds the frozen set of `name-char` code points: everything in the
// NAME_START and DIGIT sets, plus HYPHEN and PERIOD.
//
// As a side effect, fills in the NAME_START and DIGIT slots of
// `unisets::gUnicodeSets`.
//
// Returns nullptr, with `status` set to a failure code, if `status` is
// already a failure on entry, if either dependency cannot be built, or if
// allocation fails.
UnicodeSet* initNameChars(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status);
    // Check after each dependency so that no further set is constructed
    // once an error has been recorded.
    if (U_FAILURE(status)) {
        return nullptr;
    }
    UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    UnicodeSet* result = new UnicodeSet();
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    result->addAll(*nameStart);
    result->addAll(*digit);
    result->add(HYPHEN);
    result->add(PERIOD);
    result->freeze();
    return result;
}
253
254
1
// Builds the frozen set of `text-char` code points: the CONTENT and
// WHITESPACE sets plus PERIOD, AT and PIPE. As a side effect, the CONTENT and
// WHITESPACE slots of `unisets::gUnicodeSets` are filled in.
// Returns nullptr if `status` is (or becomes) a failure.
UnicodeSet* initTextChars(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status);
    UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet();
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    result->addAll(*content)
        .addAll(*whitespace)
        .add(PERIOD)
        .add(AT)
        .add(PIPE);
    result->freeze();
    return result;
}
277
278
1
// Builds the frozen set of `quoted-char` code points: the CONTENT and
// WHITESPACE sets plus PERIOD, AT, LEFT_CURLY_BRACE and RIGHT_CURLY_BRACE.
//
// As a side effect, fills in the TEXT slot of `unisets::gUnicodeSets`
// (and, through `initTextChars()`, the CONTENT and WHITESPACE slots).
//
// Returns nullptr, with `status` set to a failure code, if `status` is
// already a failure on entry, if a dependency is missing, or if allocation
// fails.
UnicodeSet* initQuotedChars(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // content and whitespace were initialized by `initTextChars()`.
    // Look both up before allocating the result, so that the early returns
    // below have nothing to release.
    UnicodeSet* content = unisets::getImpl(unisets::CONTENT);
    UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE);
    if (content == nullptr || whitespace == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet();
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    result->addAll(*content);
    result->addAll(*whitespace);
    result->add(PERIOD);
    result->add(AT);
    result->add(LEFT_CURLY_BRACE);
    result->add(RIGHT_CURLY_BRACE);
    result->freeze();
    return result;
}
312
313
1
// Builds the frozen set containing PIPE, BACKSLASH, LEFT_CURLY_BRACE and
// RIGHT_CURLY_BRACE.
// Returns nullptr if `status` is already a failure or allocation fails.
UnicodeSet* initEscapableChars(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeSet* result = new UnicodeSet();
    if (result == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    result->add(PIPE)
        .add(BACKSLASH)
        .add(LEFT_CURLY_BRACE)
        .add(RIGHT_CURLY_BRACE);
    result->freeze();
    return result;
}
330
331
namespace unisets {
332
333
0
UBool U_CALLCONV cleanupMF2ParseUniSets() {
334
0
    for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
335
0
        delete gUnicodeSets[i];
336
0
        gUnicodeSets[i] = nullptr;
337
0
    }
338
0
    gMF2ParseUniSetsInitOnce.reset();
339
0
    return true;
340
0
}
341
342
1
void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) {
343
1
    ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets);
344
    /*
345
      Each of the init functions initializes the UnicodeSets
346
      that it depends on.
347
348
      initBidiControls (no dependencies)
349
350
      initEscapableChars (no dependencies)
351
352
      initNameChars depends on
353
         initDigits
354
         initNameStartChars depends on
355
           initAlpha
356
357
      initQuotedChars depends on
358
         initTextChars depends on
359
            initContentChars
360
            initWhitespace
361
     */
362
1
    gUnicodeSets[unisets::BIDI] = initBidiControls(status);
363
1
    gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status);
364
1
    gUnicodeSets[unisets::QUOTED] = initQuotedChars(status);
365
1
    gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status);
366
367
1
    if (U_FAILURE(status)) {
368
0
        cleanupMF2ParseUniSets();
369
0
    }
370
1
}
371
372
71.2k
const UnicodeSet* get(Key key, UErrorCode& status) {
373
71.2k
    umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status);
374
71.2k
    if (U_FAILURE(status)) {
375
0
        return nullptr;
376
0
    }
377
71.2k
    UnicodeSet* result = getImpl(key);
378
71.2k
    if (result == nullptr) {
379
0
        status = U_MEMORY_ALLOCATION_ERROR;
380
0
    }
381
71.2k
    return result;
382
71.2k
}
383
384
}
385
386
// -------------------------------------
387
// Predicates
388
389
/*
390
  The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
391
392
  `isContentChar()`   : `content-char`
393
  `isTextChar()`      : `text-char`
394
  `isAlpha()`         : `ALPHA`
395
  `isDigit()`         : `DIGIT`
396
  `isNameStart()`     : `name-start`
397
  `isNameChar()`      : `name-char`
398
  `isUnquotedStart()` : `unquoted-start`
399
  `isQuotedChar()`    : `quoted-char`
400
  `isWhitespace()`    : `s`
401
*/
402
403
0
bool Parser::isContentChar(UChar32 c) const {
    // Membership test against the parser's `contentChars` set.
    const bool member = contentChars->contains(c);
    return member;
}
406
407
// See `bidi` in the MF2 grammar
408
13.4M
bool Parser::isBidiControl(UChar32 c) const {
    // Membership test against the parser's `bidiControlChars` set.
    const bool member = bidiControlChars->contains(c);
    return member;
}
411
412
// See `ws` in the MessageFormat 2 grammar
413
13.4M
bool Parser::isWhitespace(UChar32 c) const {
    // Membership test against the parser's `whitespaceChars` set.
    const bool member = whitespaceChars->contains(c);
    return member;
}
416
417
11.5M
// See `text-char` in the MessageFormat 2 grammar
bool Parser::isTextChar(UChar32 c) const {
    const bool member = textChars->contains(c);
    return member;
}
420
421
0
// See `ALPHA` in the MessageFormat 2 grammar
bool Parser::isAlpha(UChar32 c) const {
    const bool member = alphaChars->contains(c);
    return member;
}
424
425
79
// See `DIGIT` in the MessageFormat 2 grammar
bool Parser::isDigit(UChar32 c) const {
    const bool member = digitChars->contains(c);
    return member;
}
428
429
390k
// See `name-start` in the MessageFormat 2 grammar
bool Parser::isNameStart(UChar32 c) const {
    const bool member = nameStartChars->contains(c);
    return member;
}
432
433
23.8M
// See `name-char` in the MessageFormat 2 grammar
bool Parser::isNameChar(UChar32 c) const {
    const bool member = nameChars->contains(c);
    return member;
}
436
437
12.3k
bool Parser::isUnquotedStart(UChar32 c) const {
    // Answered by the `name-char` test.
    const bool startsUnquoted = isNameChar(c);
    return startsUnquoted;
}
440
441
119k
// See `quoted-char` in the MessageFormat 2 grammar
bool Parser::isQuotedChar(UChar32 c) const {
    const bool member = quotedChars->contains(c);
    return member;
}
444
445
11.6M
bool Parser::isEscapableChar(UChar32 c) const {
    // Membership test against the parser's `escapableChars` set.
    const bool member = escapableChars->contains(c);
    return member;
}
448
449
// Returns true iff `c` can begin a `function` nonterminal
450
176k
static bool isFunctionStart(UChar32 c) {
    // COLON is the only character accepted here.
    return c == COLON;
}
460
461
// Returns true iff `c` can begin an `annotation` nonterminal
462
68.6k
static bool isAnnotationStart(UChar32 c) {
    // Answered by the `function` start test.
    const bool startsAnnotation = isFunctionStart(c);
    return startsAnnotation;
}
465
466
// Returns true iff `c` can begin a `literal` nonterminal
467
157
bool Parser::isLiteralStart(UChar32 c) const {
    // PIPE, a name-start character, HYPHEN, or a digit.
    if (c == PIPE) {
        return true;
    }
    if (isNameStart(c)) {
        return true;
    }
    if (c == HYPHEN) {
        return true;
    }
    return isDigit(c);
}
470
471
// Returns true iff `c` can begin a `key` nonterminal
472
161
bool Parser::isKeyStart(UChar32 c) const {
    // ASTERISK, or anything that can start a literal.
    if (c == ASTERISK) {
        return true;
    }
    return isLiteralStart(c);
}
475
476
0
bool Parser::isDeclarationStart() {
477
0
    return (peek() == ID_LOCAL[0]
478
0
            && inBounds(1)
479
0
            && peek(1) == ID_LOCAL[1])
480
0
        || (peek() == ID_INPUT[0]
481
0
            && inBounds(1)
482
0
            && peek(1) == ID_INPUT[1]);
483
0
}
484
485
// -------------------------------------
486
// Parsing functions
487
488
489
/*
490
  TODO: Since handling the whitespace ambiguities needs to be repeated
491
  in several different places and is hard to factor out,
492
  it probably would be better to replace the parser with a lexer + parser
493
  to separate tokenizing from parsing, which would simplify the code significantly.
494
  This has the disadvantage that there is no token grammar for MessageFormat,
495
  so one would have to be invented that isn't a component of the spec.
496
 */
497
498
/*
499
    This is a recursive-descent scannerless parser that,
500
    with a few exceptions, uses 1 character of lookahead.
501
502
    This may not be an exhaustive list, as the additions of attributes and reserved
503
    statements introduced several new ambiguities.
504
505
All but three of the exceptions involve ambiguities about the meaning of whitespace.
506
One ambiguity not involving whitespace is:
507
identifier -> namespace ":" name
508
vs.
509
identifier -> name
510
511
`namespace` and `name` can't be distinguished without arbitrary lookahead.
512
(For how this is handled, see parseIdentifier())
513
514
The second ambiguity not involving whitespace is:
515
complex-message -> *(declaration[s]) complex-body
516
                -> declaration *(declaration[s]) complex-body
517
                -> declaration complex-body
518
                -> reserved-statement complex-body
519
                -> .foo {$x} .match // ...
520
When processing the '.', arbitrary lookahead is required to distinguish the
521
arbitrary-length unsupported keyword from `.match`.
522
(For how this is handled, see parseDeclarations()).
523
524
The third ambiguity not involving whitespace is:
525
complex-message -> *(declaration [s]) complex-body
526
                -> reserved-statement *(declaration [s]) complex-body
527
                -> reserved-statement complex-body
528
                -> reserved-statement quotedPattern
529
                -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
530
                -> reserved-keyword expression quoted-pattern
531
 Example: .foo {1} {{1}}
532
533
 Without lookahead, the opening '{' of the quoted pattern can't be distinguished
534
 from the opening '{' of another expression in the unsupported statement.
535
 (Though this only requires 1 character of lookahead.)
536
537
 Otherwise:
538
539
There are at least seven ambiguities in the grammar that can't be resolved with finite
540
lookahead (since whitespace sequences can be arbitrarily long). They are resolved
541
with a form of backtracking (early exit). No state needs to be saved/restored
542
since whitespace doesn't affect the shape of the resulting parse tree, so it's
543
not true backtracking.
544
545
In addition, the grammar has been refactored
546
in a semantics-preserving way in some cases to make the code easier to structure.
547
548
First: variant = when 1*(s key) [s] pattern
549
   Example: when k     {a}
550
   When reading the first space after 'k', it's ambiguous whether it's the
551
   required space before another key, or the optional space before `pattern`.
552
 (See comments in parseNonEmptyKeys())
553
554
Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
555
        annotation = (function *(s option)) / reserved
556
   Example: {:f    }
557
   When reading the first space after 'f', it's ambiguous whether it's the
558
   required space before an option, or the optional trailing space after an options list
559
   (in this case, the options list is empty).
560
 (See comments in parseOptions() -- handling this case also meant it was easier to base
561
  the code on a slightly refactored grammar, which should be semantically equivalent.)
562
563
Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
564
        annotation = (function *(s option)) / reserved
565
   Example: {@a }
566
   Similar to the previous case; see comments in parseReserved()
567
568
Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
569
   Example: {|foo|   }
570
   When reading the first space after the '|', it's ambiguous whether it's the required
571
   space before an annotation, or the optional trailing space before the '}'.
572
  (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
573
  the same grammar refactoring as the second exception.)
574
575
    Most functions match a non-terminal in the grammar, except as explained
576
    in comments.
577
578
Fifth: matcher = match-statement 1*([s] variant)
579
               -> match 1 *([s] selector) 1*([s] variant)
580
    Example: match {42} * {{_}}
581
 When reading the space after the first '}', it's unclear whether
582
 it's the optional space before another selector, or the optional space
583
 before a variant.
584
585
Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
586
       -> "{" [s] function *(s attribute) [s] "}"
587
       -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
588
       -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
589
590
     Example: {:func @foo}
591
(Note: the same ambiguity is present with variable-expression and literal-expression)
592
593
Seventh:
594
595
596
When parsing the space, it's unclear whether it's the optional space before an
597
option, or the optional space before an attribute.
598
599
 Unless otherwise noted in a comment, all helper functions that take
600
    a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
601
    have the precondition:
602
      `index` < `len()`
603
    and the postcondition:
604
      `U_FAILURE(errorCode)` || `index` < `len()`
605
*/
606
607
/*
608
  No pre, no post.
609
  A message may end with whitespace, so `index` may equal `len()` on exit.
610
*/
611
4.28M
void Parser::parseRequiredWS(UErrorCode& errorCode) {
    bool sawWhitespace = false;

    // Consume whitespace until either the input is exhausted or a
    // non-whitespace character is seen.
    while (inBounds() && isWhitespace(peek())) {
        sawWhitespace = true;
        // Increment line number in parse error if we consume a newline
        maybeAdvanceLine();
        next();
    }

    // Whitespace is required, so stopping without having consumed any --
    // at the end of the input or not -- is an error. Running out of input
    // after some whitespace is fine; the caller is responsible for checking
    // that case.
    if (!sawWhitespace) {
        ERROR(errorCode);
    }
}
648
649
4.42M
void Parser::parseOptionalBidi() {
650
4.43M
    while (true) {
651
4.43M
        if (!inBounds()) {
652
632
            return;
653
632
        }
654
4.42M
        if (isBidiControl(peek())) {
655
2.78k
            next();
656
4.42M
        } else {
657
4.42M
            break;
658
4.42M
        }
659
4.42M
    }
660
4.42M
}
661
662
/*
663
  No pre, no post, because a message may end with whitespace
664
  Matches `s` in the MF2 grammar
665
*/
666
4.28M
void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
    // Leading bidi controls are allowed before the mandatory whitespace.
    parseOptionalBidi();
    // At least one whitespace character; signals a syntax error otherwise.
    parseRequiredWS(errorCode);
    // Any further whitespace or bidi controls.
    parseOptionalWhitespace();
    // However much was consumed, a single SPACE is appended to the
    // normalized input.
    normalizedInput += SPACE;
}
672
673
/*
674
  No pre, no post, for the same reason as `parseRequiredWhitespace()`.
675
*/
676
4.63M
void Parser::parseOptionalWhitespace() {
677
4.64M
    while (true) {
678
4.64M
        if (!inBounds()) {
679
2.82k
            return;
680
2.82k
        }
681
4.64M
        auto cp = peek();
682
4.64M
        if (isWhitespace(cp) || isBidiControl(cp)) {
683
15.7k
            maybeAdvanceLine();
684
15.7k
            next();
685
4.62M
        } else {
686
4.62M
            break;
687
4.62M
        }
688
4.64M
    }
689
4.63M
}
690
691
// Consumes a single character, signaling an error if `peek()` != `c`
692
// No postcondition -- a message can end with a '}' token
693
470k
void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
    // Signals an error and returns if there is no input left.
    CHECK_BOUNDS(errorCode);

    if (peek() != c) {
        // Next character didn't match -- error out without consuming it
        ERROR(errorCode);
        return;
    }
    next();
    normalizedInput += c;
}
704
705
/*
706
   Consumes a fixed-length token, signaling an error if the token isn't a prefix of
707
   the string beginning at `peek()`
708
   No postcondition -- a message can end with a '}' token
709
*/
710
56.5k
void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) {
711
56.5k
    U_ASSERT(inBounds());
712
713
56.5k
    int32_t tokenPos = 0;
714
169k
    while (tokenPos < static_cast<int32_t>(token.length())) {
715
169k
        if (peek() != token[tokenPos]) {
716
56.5k
            ERROR(errorCode);
717
56.5k
            return;
718
56.5k
        }
719
112k
        normalizedInput += token[tokenPos];
720
112k
        next();
721
112k
        tokenPos++;
722
112k
    }
723
56.5k
}
724
725
/*
726
   Consumes optional whitespace, possibly advancing `index` to `index'`,
727
   then consumes a fixed-length token (signaling an error if the token isn't a prefix of
728
   the string beginning at `source[index']`),
729
   then consumes optional whitespace again
730
*/
731
0
void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
732
    // No need for error check or bounds check before parseOptionalWhitespace
733
0
    parseOptionalWhitespace();
734
    // Establish precondition
735
0
    CHECK_BOUNDS(errorCode);
736
0
    parseToken(token, errorCode);
737
0
    parseOptionalWhitespace();
738
    // Guarantee postcondition
739
0
    CHECK_BOUNDS(errorCode);
740
0
}
741
742
/*
743
   Consumes optional whitespace, possibly advancing `index` to `index'`,
744
   then consumes a single character (signaling an error if it doesn't match
745
   `source[index']`),
746
   then consumes optional whitespace again
747
*/
748
28.8k
void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
749
    // No need for error check or bounds check before parseOptionalWhitespace()
750
28.8k
    parseOptionalWhitespace();
751
    // Establish precondition
752
28.8k
    CHECK_BOUNDS(errorCode);
753
28.5k
    parseToken(c, errorCode);
754
28.5k
    parseOptionalWhitespace();
755
    // Guarantee postcondition
756
28.5k
    CHECK_BOUNDS(errorCode);
757
28.5k
}
758
759
/*
760
  Consumes a possibly-empty sequence of name-chars. Appends to `str`
761
  and returns `str`.
762
*/
763
222k
UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) {
764
222k
    if (U_FAILURE(errorCode)) {
765
0
        return {};
766
0
    }
767
768
19.7M
    while (isNameChar(peek())) {
769
19.5M
        UChar32 c = peek();
770
19.5M
        str += c;
771
19.5M
        normalizedInput += c;
772
19.5M
        next();
773
19.5M
        if (!inBounds()) {
774
824
            ERROR(errorCode);
775
824
            break;
776
824
        }
777
19.5M
    }
778
779
222k
    return str;
780
222k
}
781
782
/*
783
  Consumes a non-empty sequence of `name-char`s, the first of which is
784
  also a `name-start`.
785
  that begins with a character `start` such that `isNameStart(start)`.
786
787
  Returns this sequence.
788
789
  (Matches the `name` nonterminal in the grammar.)
790
*/
791
359k
UnicodeString Parser::parseName(UErrorCode& errorCode) {
792
359k
    UnicodeString name;
793
794
359k
    U_ASSERT(inBounds());
795
796
359k
    if (!(isNameStart(peek()) || isBidiControl(peek()))) {
797
289k
        ERROR(errorCode);
798
289k
        return name;
799
289k
    }
800
801
    // name       = [bidi] name-start *name-char [bidi]
802
803
    // [bidi]
804
70.8k
    parseOptionalBidi();
805
806
    // name-start *name-char
807
70.8k
    parseNameChars(name, errorCode);
808
809
    // [bidi]
810
70.8k
    parseOptionalBidi();
811
812
70.8k
    return name;
813
359k
}
814
815
/*
816
  Consumes a '$' followed by a `name`, returning a VariableName
817
  with `name` as its name
818
819
  (Matches the `variable` nonterminal in the grammar.)
820
*/
821
180k
VariableName Parser::parseVariableName(UErrorCode& errorCode) {
822
180k
    VariableName result;
823
824
180k
    U_ASSERT(inBounds());
825
826
180k
    parseToken(DOLLAR, errorCode);
827
180k
    if (!inBounds()) {
828
58
        ERROR(errorCode);
829
58
        return result;
830
58
    }
831
180k
    return VariableName(parseName(errorCode));
832
180k
}
833
834
/*
835
  Corresponds to the `identifier` nonterminal in the grammar
836
*/
837
98.7k
UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
838
98.7k
    U_ASSERT(inBounds());
839
840
98.7k
    UnicodeString result;
841
    // The following is a hack to get around ambiguity in the grammar:
842
    // identifier -> namespace ":" name
843
    // vs.
844
    // identifier -> name
845
    // can't be distinguished without arbitrary lookahead.
846
    // Instead, we treat the production as:
847
    // identifier -> namespace *(":"name)
848
    // and then check for multiple colons.
849
850
    // Parse namespace
851
98.7k
    result += parseName(errorCode);
852
98.7k
    int32_t firstColon = -1;
853
179k
    while (inBounds() && peek() == COLON) {
854
        // Parse ':' separator
855
80.4k
        if (firstColon == -1) {
856
46.7k
            firstColon = index;
857
46.7k
        }
858
80.4k
        parseToken(COLON, errorCode);
859
80.4k
        result += COLON;
860
        // Check for message ending with something like "foo:"
861
80.4k
        if (!inBounds()) {
862
58
            ERROR(errorCode);
863
80.4k
        } else {
864
            // Parse name part
865
80.4k
            result += parseName(errorCode);
866
80.4k
        }
867
80.4k
    }
868
869
    // If there's at least one ':', scan from the first ':'
870
    // to the end of the name to check for multiple ':'s
871
98.7k
    if (firstColon != -1) {
872
7.62M
        for (int32_t i = firstColon + 1; i < result.length(); i++) {
873
7.57M
            if (result[i] == COLON) {
874
130
                ERROR_AT(errorCode, i);
875
130
                return {};
876
130
            }
877
7.57M
        }
878
46.7k
    }
879
880
98.6k
    return result;
881
98.7k
}
882
883
/*
884
  Consumes a reference to a function, matching the ": identifier"
885
  in the `function` nonterminal in the grammar.
886
887
  Returns the function name.
888
*/
889
53.8k
FunctionName Parser::parseFunction(UErrorCode& errorCode) {
890
53.8k
    U_ASSERT(inBounds());
891
53.8k
    if (!isFunctionStart(peek())) {
892
0
        ERROR(errorCode);
893
0
        return FunctionName();
894
0
    }
895
896
53.8k
    normalizedInput += peek();
897
53.8k
    next(); // Consume the function start character
898
53.8k
    if (!inBounds()) {
899
34
        ERROR(errorCode);
900
34
        return FunctionName();
901
34
    }
902
53.7k
    return parseIdentifier(errorCode);
903
53.8k
}
904
905
906
/*
907
  Precondition: peek() == BACKSLASH
908
909
  Consume an escaped character.
910
  Corresponds to `escaped-char` in the grammar.
911
912
  No postcondition (a message can end with an escaped char)
913
*/
914
8.52k
UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) {
915
8.52k
    U_ASSERT(inBounds());
916
8.52k
    U_ASSERT(peek() == BACKSLASH);
917
8.52k
    normalizedInput += BACKSLASH;
918
8.52k
    next(); // Skip the initial backslash
919
8.52k
    UnicodeString str;
920
8.52k
    if (inBounds()) {
921
        // Expect a '{', '|' or '}'
922
8.50k
        switch (peek()) {
923
1.07k
        case LEFT_CURLY_BRACE:
924
1.75k
        case RIGHT_CURLY_BRACE:
925
6.83k
        case PIPE:
926
7.42k
        case BACKSLASH: {
927
            /* Append to the output string */
928
7.42k
            str += peek();
929
            /* Update normalizedInput */
930
7.42k
            normalizedInput += peek();
931
            /* Consume the character */
932
7.42k
            next();
933
7.42k
            return str;
934
6.83k
        }
935
1.07k
        default: {
936
            // No other characters are allowed here
937
1.07k
            break;
938
6.83k
        }
939
8.50k
        }
940
8.50k
    }
941
   // If control reaches here, there was an error
942
1.09k
   ERROR(errorCode);
943
1.09k
   return str;
944
8.52k
}
945
946
947
/*
948
  Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
949
*/
950
4.76k
Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
951
4.76k
    bool error = false;
952
953
4.76k
    UnicodeString contents;
954
4.76k
    if (U_SUCCESS(errorCode)) {
955
        // Parse the opening '|'
956
4.76k
        parseToken(PIPE, errorCode);
957
4.76k
        if (!inBounds()) {
958
16
            ERROR(errorCode);
959
16
            error = true;
960
4.75k
        } else {
961
            // Parse the contents
962
4.75k
            bool done = false;
963
131k
            while (!done) {
964
126k
                if (peek() == BACKSLASH) {
965
7.13k
                    contents += parseEscapeSequence(errorCode);
966
119k
                } else if (isQuotedChar(peek())) {
967
114k
                    contents += peek();
968
                    // Handle cases like:
969
                    // |}{| -- we want to escape everywhere that
970
                    // can be escaped, to make round-trip checking
971
                    // easier -- so this case normalizes to
972
                    // |\}\{|
973
114k
                    if (isEscapableChar(peek())) {
974
5.71k
                        normalizedInput += BACKSLASH;
975
5.71k
                    }
976
114k
                    normalizedInput += peek();
977
114k
                    next(); // Consume this character
978
114k
                    maybeAdvanceLine();
979
114k
                } else {
980
                    // Assume the sequence of literal characters ends here
981
4.67k
                    done = true;
982
4.67k
                }
983
126k
                if (!inBounds()) {
984
82
                    ERROR(errorCode);
985
82
                    error = true;
986
82
                    break;
987
82
                }
988
126k
            }
989
4.75k
        }
990
4.76k
    }
991
992
4.76k
    if (error) {
993
98
        return {};
994
98
    }
995
996
    // Parse the closing '|'
997
4.67k
    parseToken(PIPE, errorCode);
998
999
4.67k
    return Literal(true, contents);
1000
4.76k
}
1001
1002
// Parse (1*DIGIT)
1003
0
UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
1004
0
    if (U_FAILURE(errorCode)) {
1005
0
        return {};
1006
0
    }
1007
1008
0
    U_ASSERT(isDigit(peek()));
1009
1010
0
    UnicodeString contents;
1011
0
    do {
1012
0
        contents += peek();
1013
0
        normalizedInput += peek();
1014
0
        next();
1015
0
        if (!inBounds()) {
1016
0
            ERROR(errorCode);
1017
0
            return {};
1018
0
        }
1019
0
    } while (isDigit(peek()));
1020
1021
0
    return contents;
1022
0
}
1023
/*
1024
  Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
1025
*/
1026
4.05M
Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
1027
4.05M
    if (U_FAILURE(errorCode)) {
1028
0
        return {};
1029
0
    }
1030
    // unquoted-literal = 1*name-char
1031
1032
4.05M
    if (!(isNameChar(peek()))) {
1033
3.90M
        ERROR(errorCode);
1034
3.90M
        return {};
1035
3.90M
    }
1036
1037
151k
    UnicodeString contents;
1038
151k
    parseNameChars(contents, errorCode);
1039
151k
    return Literal(false, contents);
1040
4.05M
}
1041
1042
/*
1043
  Consume and return a literal, matching the `literal` nonterminal in the grammar.
1044
*/
1045
4.05M
Literal Parser::parseLiteral(UErrorCode& errorCode) {
1046
4.05M
    Literal result;
1047
4.05M
    if (!inBounds()) {
1048
276
        ERROR(errorCode);
1049
4.05M
    } else {
1050
4.05M
        if (peek() == PIPE) {
1051
4.76k
            result = parseQuotedLiteral(errorCode);
1052
4.05M
        } else {
1053
4.05M
            result = parseUnquotedLiteral(errorCode);
1054
4.05M
        }
1055
        // Guarantee postcondition
1056
4.05M
        if (!inBounds()) {
1057
492
            ERROR(errorCode);
1058
492
        }
1059
4.05M
    }
1060
1061
4.05M
    return result;
1062
4.05M
}
1063
1064
/*
1065
  Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
1066
1067
  Adds the option to `options`
1068
*/
1069
template<class T>
1070
23.6k
void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1071
23.6k
    U_ASSERT(inBounds());
1072
1073
23.6k
    U_ASSERT(peek() == AT);
1074
    // Consume the '@'
1075
23.6k
    parseToken(AT, errorCode);
1076
1077
    // Parse LHS
1078
23.6k
    UnicodeString lhs = parseIdentifier(errorCode);
1079
1080
    // Prepare to "backtrack" to resolve ambiguity
1081
    // about whether whitespace precedes another
1082
    // attribute, or the '=' sign
1083
23.6k
    int32_t savedIndex = index;
1084
23.6k
    parseOptionalWhitespace();
1085
1086
23.6k
    Operand rand;
1087
23.6k
    if (peek() == EQUALS) {
1088
        // Parse '='
1089
8.47k
        parseTokenWithWhitespace(EQUALS, errorCode);
1090
1091
8.47k
        UnicodeString rhsStr;
1092
        // Parse RHS, which must be a literal
1093
        // attribute = "@" identifier [o "=" o literal]
1094
8.47k
        rand = Operand(parseLiteral(errorCode));
1095
15.1k
    } else {
1096
        // attribute -> "@" identifier [[s] "=" [s]]
1097
        // Use null operand, which `rand` is already set to
1098
        // "Backtrack" by restoring the whitespace (if there was any)
1099
15.1k
        index = savedIndex;
1100
15.1k
    }
1101
1102
23.6k
    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
1103
23.6k
}
void icu_79::message2::Parser::parseAttribute<icu_79::message2::data_model::Expression::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Expression::Builder>&, UErrorCode&)
Line
Count
Source
1070
14.2k
void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1071
14.2k
    U_ASSERT(inBounds());
1072
1073
14.2k
    U_ASSERT(peek() == AT);
1074
    // Consume the '@'
1075
14.2k
    parseToken(AT, errorCode);
1076
1077
    // Parse LHS
1078
14.2k
    UnicodeString lhs = parseIdentifier(errorCode);
1079
1080
    // Prepare to "backtrack" to resolve ambiguity
1081
    // about whether whitespace precedes another
1082
    // attribute, or the '=' sign
1083
14.2k
    int32_t savedIndex = index;
1084
14.2k
    parseOptionalWhitespace();
1085
1086
14.2k
    Operand rand;
1087
14.2k
    if (peek() == EQUALS) {
1088
        // Parse '='
1089
1.55k
        parseTokenWithWhitespace(EQUALS, errorCode);
1090
1091
1.55k
        UnicodeString rhsStr;
1092
        // Parse RHS, which must be a literal
1093
        // attribute = "@" identifier [o "=" o literal]
1094
1.55k
        rand = Operand(parseLiteral(errorCode));
1095
12.7k
    } else {
1096
        // attribute -> "@" identifier [[s] "=" [s]]
1097
        // Use null operand, which `rand` is already set to
1098
        // "Backtrack" by restoring the whitespace (if there was any)
1099
12.7k
        index = savedIndex;
1100
12.7k
    }
1101
1102
14.2k
    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
1103
14.2k
}
void icu_79::message2::Parser::parseAttribute<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1070
9.37k
void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1071
9.37k
    U_ASSERT(inBounds());
1072
1073
9.37k
    U_ASSERT(peek() == AT);
1074
    // Consume the '@'
1075
9.37k
    parseToken(AT, errorCode);
1076
1077
    // Parse LHS
1078
9.37k
    UnicodeString lhs = parseIdentifier(errorCode);
1079
1080
    // Prepare to "backtrack" to resolve ambiguity
1081
    // about whether whitespace precedes another
1082
    // attribute, or the '=' sign
1083
9.37k
    int32_t savedIndex = index;
1084
9.37k
    parseOptionalWhitespace();
1085
1086
9.37k
    Operand rand;
1087
9.37k
    if (peek() == EQUALS) {
1088
        // Parse '='
1089
6.92k
        parseTokenWithWhitespace(EQUALS, errorCode);
1090
1091
6.92k
        UnicodeString rhsStr;
1092
        // Parse RHS, which must be a literal
1093
        // attribute = "@" identifier [o "=" o literal]
1094
6.92k
        rand = Operand(parseLiteral(errorCode));
1095
6.92k
    } else {
1096
        // attribute -> "@" identifier [[s] "=" [s]]
1097
        // Use null operand, which `rand` is already set to
1098
        // "Backtrack" by restoring the whitespace (if there was any)
1099
2.45k
        index = savedIndex;
1100
2.45k
    }
1101
1102
9.37k
    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
1103
9.37k
}
1104
1105
/*
1106
  Consume a name-value pair, matching the `option` nonterminal in the grammar.
1107
1108
  Adds the option to `optionList`
1109
*/
1110
template<class T>
1111
6.11k
void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1112
6.11k
    U_ASSERT(inBounds());
1113
1114
    // Parse LHS
1115
6.11k
    UnicodeString lhs = parseIdentifier(errorCode);
1116
1117
    // Parse '='
1118
6.11k
    parseTokenWithWhitespace(EQUALS, errorCode);
1119
1120
6.11k
    UnicodeString rhsStr;
1121
6.11k
    Operand rand;
1122
    // Parse RHS, which is either a literal or variable
1123
6.11k
    switch (peek()) {
1124
1.62k
    case DOLLAR: {
1125
1.62k
        rand = Operand(parseVariableName(errorCode));
1126
1.62k
        break;
1127
0
    }
1128
4.49k
    default: {
1129
        // Must be a literal
1130
4.49k
        rand = Operand(parseLiteral(errorCode));
1131
4.49k
        break;
1132
0
    }
1133
6.11k
    }
1134
6.11k
    U_ASSERT(!rand.isNull());
1135
1136
    // Finally, add the key=value mapping
1137
    // Use a local error code, check for duplicate option error and
1138
    // record it as with other errors
1139
6.11k
    UErrorCode status = U_ZERO_ERROR;
1140
6.11k
    addOption.addOption(lhs, std::move(rand), status);
1141
6.11k
    if (U_FAILURE(status)) {
1142
1.86k
      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
1143
1.86k
      errors.setDuplicateOptionName(errorCode);
1144
1.86k
    }
1145
6.11k
}
void icu_79::message2::Parser::parseOption<icu_79::message2::data_model::Operator::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Operator::Builder>&, UErrorCode&)
Line
Count
Source
1111
1.68k
void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1112
1.68k
    U_ASSERT(inBounds());
1113
1114
    // Parse LHS
1115
1.68k
    UnicodeString lhs = parseIdentifier(errorCode);
1116
1117
    // Parse '='
1118
1.68k
    parseTokenWithWhitespace(EQUALS, errorCode);
1119
1120
1.68k
    UnicodeString rhsStr;
1121
1.68k
    Operand rand;
1122
    // Parse RHS, which is either a literal or variable
1123
1.68k
    switch (peek()) {
1124
891
    case DOLLAR: {
1125
891
        rand = Operand(parseVariableName(errorCode));
1126
891
        break;
1127
0
    }
1128
795
    default: {
1129
        // Must be a literal
1130
795
        rand = Operand(parseLiteral(errorCode));
1131
795
        break;
1132
0
    }
1133
1.68k
    }
1134
1.68k
    U_ASSERT(!rand.isNull());
1135
1136
    // Finally, add the key=value mapping
1137
    // Use a local error code, check for duplicate option error and
1138
    // record it as with other errors
1139
1.68k
    UErrorCode status = U_ZERO_ERROR;
1140
1.68k
    addOption.addOption(lhs, std::move(rand), status);
1141
1.68k
    if (U_FAILURE(status)) {
1142
666
      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
1143
666
      errors.setDuplicateOptionName(errorCode);
1144
666
    }
1145
1.68k
}
void icu_79::message2::Parser::parseOption<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1111
4.43k
void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1112
4.43k
    U_ASSERT(inBounds());
1113
1114
    // Parse LHS
1115
4.43k
    UnicodeString lhs = parseIdentifier(errorCode);
1116
1117
    // Parse '='
1118
4.43k
    parseTokenWithWhitespace(EQUALS, errorCode);
1119
1120
4.43k
    UnicodeString rhsStr;
1121
4.43k
    Operand rand;
1122
    // Parse RHS, which is either a literal or variable
1123
4.43k
    switch (peek()) {
1124
730
    case DOLLAR: {
1125
730
        rand = Operand(parseVariableName(errorCode));
1126
730
        break;
1127
0
    }
1128
3.70k
    default: {
1129
        // Must be a literal
1130
3.70k
        rand = Operand(parseLiteral(errorCode));
1131
3.70k
        break;
1132
0
    }
1133
4.43k
    }
1134
4.43k
    U_ASSERT(!rand.isNull());
1135
1136
    // Finally, add the key=value mapping
1137
    // Use a local error code, check for duplicate option error and
1138
    // record it as with other errors
1139
4.43k
    UErrorCode status = U_ZERO_ERROR;
1140
4.43k
    addOption.addOption(lhs, std::move(rand), status);
1141
4.43k
    if (U_FAILURE(status)) {
1142
1.20k
      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
1143
1.20k
      errors.setDuplicateOptionName(errorCode);
1144
1.20k
    }
1145
4.43k
}
1146
1147
/*
1148
  Note: there are multiple overloads of parseOptions() for parsing
1149
  options within markup, vs. within an expression, vs. parsing
1150
  attributes. This should be refactored. TODO
1151
 */
1152
1153
/*
1154
  Consume optional whitespace followed by a sequence of options
1155
  (possibly empty), separated by whitespace
1156
*/
1157
template <class T>
1158
63.7k
void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1159
    // Early exit if out of bounds -- no more work is possible
1160
63.7k
    CHECK_BOUNDS(errorCode);
1161
1162
/*
1163
Arbitrary lookahead is required to parse option lists. To see why, consider
1164
these rules from the grammar:
1165
1166
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1167
annotation = (function *(s option)) / reserved
1168
1169
And this example:
1170
{:foo  }
1171
1172
Derivation:
1173
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1174
           -> "{" [s] annotation [s] "}"
1175
           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1176
           -> "{" [s] function *(s option) [s] "}"
1177
1178
In this example, knowing whether to expect a '}' or the start of another option
1179
after the whitespace would require arbitrary lookahead -- in other words, which
1180
rule should we apply?
1181
    *(s option) -> s option *(s option)
1182
  or
1183
    *(s option) ->
1184
1185
The same would apply to the example {:foo k=v } (note the trailing space after "v").
1186
1187
This is addressed using a form of backtracking and (to make the backtracking easier
1188
to apply) a slight refactoring to the grammar.
1189
1190
This code is written as if the grammar is:
1191
  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1192
  annotation = (function *(s option) [s]) / (reserved [s])
1193
1194
Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1195
that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1196
1197
Note that when "backtracking" really just means early exit, since only whitespace
1198
is involved and there's no state to save.
1199
1200
There is a separate but similar ambiguity as to whether the space precedes
1201
an option or an attribute.
1202
*/
1203
1204
69.7k
    while(true) {
1205
        // If the next character is not whitespace, that means we've already
1206
        // parsed the entire options list (which may have been empty) and there's
1207
        // no trailing whitespace. In that case, exit.
1208
69.7k
        if (!isWhitespace(peek())) {
1209
39.5k
            break;
1210
39.5k
        }
1211
30.1k
        int32_t firstWhitespace = index;
1212
1213
        // In any case other than an empty options list, there must be at least
1214
        // one whitespace character.
1215
30.1k
        parseRequiredWhitespace(errorCode);
1216
        // Restore precondition
1217
30.1k
        CHECK_BOUNDS(errorCode);
1218
1219
        // If a name character follows, then at least one more option remains
1220
        // in the list.
1221
        // Otherwise, we've consumed all the options and any trailing whitespace,
1222
        // and can exit.
1223
        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1224
        // so we back out to [s].
1225
30.1k
        if (!isNameStart(peek())) {
1226
            // We've consumed all the options (meaning that either we consumed non-empty
1227
            // whitespace, or consumed at least one option.)
1228
            // Done.
1229
            // Remove the required whitespace from normalizedInput
1230
24.0k
            normalizedInput.truncate(normalizedInput.length() - 1);
1231
            // "Backtrack" so as to leave the optional whitespace there
1232
            // when parsing attributes
1233
24.0k
            index = firstWhitespace;
1234
24.0k
            break;
1235
24.0k
        }
1236
6.11k
        parseOption(addOption, errorCode);
1237
6.11k
    }
1238
63.6k
}
void icu_79::message2::Parser::parseOptions<icu_79::message2::data_model::Operator::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Operator::Builder>&, UErrorCode&)
Line
Count
Source
1158
53.8k
void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1159
    // Early exit if out of bounds -- no more work is possible
1160
53.8k
    CHECK_BOUNDS(errorCode);
1161
1162
/*
1163
Arbitrary lookahead is required to parse option lists. To see why, consider
1164
these rules from the grammar:
1165
1166
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1167
annotation = (function *(s option)) / reserved
1168
1169
And this example:
1170
{:foo  }
1171
1172
Derivation:
1173
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1174
           -> "{" [s] annotation [s] "}"
1175
           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1176
           -> "{" [s] function *(s option) [s] "}"
1177
1178
In this example, knowing whether to expect a '}' or the start of another option
1179
after the whitespace would require arbitrary lookahead -- in other words, which
1180
rule should we apply?
1181
    *(s option) -> s option *(s option)
1182
  or
1183
    *(s option) ->
1184
1185
The same would apply to the example {:foo k=v } (note the trailing space after "v").
1186
1187
This is addressed using a form of backtracking and (to make the backtracking easier
1188
to apply) a slight refactoring to the grammar.
1189
1190
This code is written as if the grammar is:
1191
  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1192
  annotation = (function *(s option) [s]) / (reserved [s])
1193
1194
Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1195
that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1196
1197
Note that when "backtracking" really just means early exit, since only whitespace
1198
is involved and there's no state to save.
1199
1200
There is a separate but similar ambiguity as to whether the space precedes
1201
an option or an attribute.
1202
*/
1203
1204
55.3k
    while(true) {
1205
        // If the next character is not whitespace, that means we've already
1206
        // parsed the entire options list (which may have been empty) and there's
1207
        // no trailing whitespace. In that case, exit.
1208
55.3k
        if (!isWhitespace(peek())) {
1209
38.9k
            break;
1210
38.9k
        }
1211
16.3k
        int32_t firstWhitespace = index;
1212
1213
        // In any case other than an empty options list, there must be at least
1214
        // one whitespace character.
1215
16.3k
        parseRequiredWhitespace(errorCode);
1216
        // Restore precondition
1217
16.3k
        CHECK_BOUNDS(errorCode);
1218
1219
        // If a name character follows, then at least one more option remains
1220
        // in the list.
1221
        // Otherwise, we've consumed all the options and any trailing whitespace,
1222
        // and can exit.
1223
        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1224
        // so we back out to [s].
1225
16.3k
        if (!isNameStart(peek())) {
1226
            // We've consumed all the options (meaning that either we consumed non-empty
1227
            // whitespace, or consumed at least one option.)
1228
            // Done.
1229
            // Remove the required whitespace from normalizedInput
1230
14.6k
            normalizedInput.truncate(normalizedInput.length() - 1);
1231
            // "Backtrack" so as to leave the optional whitespace there
1232
            // when parsing attributes
1233
14.6k
            index = firstWhitespace;
1234
14.6k
            break;
1235
14.6k
        }
1236
1.68k
        parseOption(addOption, errorCode);
1237
1.68k
    }
1238
53.6k
}
void icu_79::message2::Parser::parseOptions<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1158
9.95k
void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1159
    // Early exit if out of bounds -- no more work is possible
1160
9.95k
    CHECK_BOUNDS(errorCode);
1161
1162
/*
1163
Arbitrary lookahead is required to parse option lists. To see why, consider
1164
these rules from the grammar:
1165
1166
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1167
annotation = (function *(s option)) / reserved
1168
1169
And this example:
1170
{:foo  }
1171
1172
Derivation:
1173
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1174
           -> "{" [s] annotation [s] "}"
1175
           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1176
           -> "{" [s] function *(s option) [s] "}"
1177
1178
In this example, knowing whether to expect a '}' or the start of another option
1179
after the whitespace would require arbitrary lookahead -- in other words, which
1180
rule should we apply?
1181
    *(s option) -> s option *(s option)
1182
  or
1183
    *(s option) ->
1184
1185
The same would apply to the example {:foo k=v } (note the trailing space after "v").
1186
1187
This is addressed using a form of backtracking and (to make the backtracking easier
1188
to apply) a slight refactoring to the grammar.
1189
1190
This code is written as if the grammar is:
1191
  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1192
  annotation = (function *(s option) [s]) / (reserved [s])
1193
1194
Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1195
that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1196
1197
Note that when "backtracking" really just means early exit, since only whitespace
1198
is involved and there's no state to save.
1199
1200
There is a separate but similar ambiguity as to whether the space precedes
1201
an option or an attribute.
1202
*/
1203
1204
14.3k
    while(true) {
1205
        // If the next character is not whitespace, that means we've already
1206
        // parsed the entire options list (which may have been empty) and there's
1207
        // no trailing whitespace. In that case, exit.
1208
14.3k
        if (!isWhitespace(peek())) {
1209
610
            break;
1210
610
        }
1211
13.7k
        int32_t firstWhitespace = index;
1212
1213
        // In any case other than an empty options list, there must be at least
1214
        // one whitespace character.
1215
13.7k
        parseRequiredWhitespace(errorCode);
1216
        // Restore precondition
1217
13.7k
        CHECK_BOUNDS(errorCode);
1218
1219
        // If a name character follows, then at least one more option remains
1220
        // in the list.
1221
        // Otherwise, we've consumed all the options and any trailing whitespace,
1222
        // and can exit.
1223
        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1224
        // so we back out to [s].
1225
13.7k
        if (!isNameStart(peek())) {
1226
            // We've consumed all the options (meaning that either we consumed non-empty
1227
            // whitespace, or consumed at least one option.)
1228
            // Done.
1229
            // Remove the required whitespace from normalizedInput
1230
9.33k
            normalizedInput.truncate(normalizedInput.length() - 1);
1231
            // "Backtrack" so as to leave the optional whitespace there
1232
            // when parsing attributes
1233
9.33k
            index = firstWhitespace;
1234
9.33k
            break;
1235
9.33k
        }
1236
4.43k
        parseOption(addOption, errorCode);
1237
4.43k
    }
1238
9.95k
}
1239
1240
/*
1241
  Consume optional whitespace followed by a sequence of attributes
1242
  (possibly empty), separated by whitespace
1243
*/
1244
template<class T>
1245
75.5k
void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1246
1247
    // Early exit if out of bounds -- no more work is possible
1248
75.5k
    if (!inBounds()) {
1249
747
        ERROR(errorCode);
1250
747
        return;
1251
747
    }
1252
1253
/*
1254
Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1255
(See comment in parseOptions()).
1256
*/
1257
1258
98.4k
    while(true) {
1259
        // If the next character is not whitespace, that means we've already
1260
        // parsed the entire attributes list (which may have been empty) and there's
1261
        // no trailing whitespace. In that case, exit.
1262
98.4k
        if (!isWhitespace(peek())) {
1263
58.8k
            break;
1264
58.8k
        }
1265
1266
        // In any case other than an empty attributes list, there must be at least
1267
        // one whitespace character.
1268
39.5k
        parseRequiredWhitespace(errorCode);
1269
        // Restore precondition
1270
39.5k
        if (!inBounds()) {
1271
29
            ERROR(errorCode);
1272
29
            break;
1273
29
        }
1274
1275
        // If an '@' follows, then at least one more attribute remains
1276
        // in the list.
1277
        // Otherwise, we've consumed all the attributes and any trailing whitespace,
1278
        // and can exit.
1279
        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1280
        // so we back out to [s].
1281
39.5k
        if (peek() != AT) {
1282
            // We've consumed all the attributes (meaning that either we consumed non-empty
1283
            // whitespace, or consumed at least one attribute.)
1284
            // Done.
1285
            // Remove the whitespace from normalizedInput
1286
15.8k
            normalizedInput.truncate(normalizedInput.length() - 1);
1287
15.8k
            break;
1288
15.8k
        }
1289
23.6k
        parseAttribute(attrAdder, errorCode);
1290
23.6k
    }
1291
74.7k
}
void icu_79::message2::Parser::parseAttributes<icu_79::message2::data_model::Expression::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Expression::Builder>&, UErrorCode&)
Line
Count
Source
1245
66.1k
void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1246
1247
    // Early exit if out of bounds -- no more work is possible
1248
66.1k
    if (!inBounds()) {
1249
747
        ERROR(errorCode);
1250
747
        return;
1251
747
    }
1252
1253
/*
1254
Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1255
(See comment in parseOptions()).
1256
*/
1257
1258
79.6k
    while(true) {
1259
        // If the next character is not whitespace, that means we've already
1260
        // parsed the entire attributes list (which may have been empty) and there's
1261
        // no trailing whitespace. In that case, exit.
1262
79.6k
        if (!isWhitespace(peek())) {
1263
54.0k
            break;
1264
54.0k
        }
1265
1266
        // In any case other than an empty attributes list, there must be at least
1267
        // one whitespace character.
1268
25.6k
        parseRequiredWhitespace(errorCode);
1269
        // Restore precondition
1270
25.6k
        if (!inBounds()) {
1271
18
            ERROR(errorCode);
1272
18
            break;
1273
18
        }
1274
1275
        // If an '@' follows, then at least one more attribute remains
1276
        // in the list.
1277
        // Otherwise, we've consumed all the attributes and any trailing whitespace,
1278
        // and can exit.
1279
        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1280
        // so we back out to [s].
1281
25.6k
        if (peek() != AT) {
1282
            // We've consumed all the attributes (meaning that either we consumed non-empty
1283
            // whitespace, or consumed at least one attribute.)
1284
            // Done.
1285
            // Remove the whitespace from normalizedInput
1286
11.3k
            normalizedInput.truncate(normalizedInput.length() - 1);
1287
11.3k
            break;
1288
11.3k
        }
1289
14.2k
        parseAttribute(attrAdder, errorCode);
1290
14.2k
    }
1291
65.3k
}
void icu_79::message2::Parser::parseAttributes<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1245
9.39k
void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1246
1247
    // Early exit if out of bounds -- no more work is possible
1248
9.39k
    if (!inBounds()) {
1249
0
        ERROR(errorCode);
1250
0
        return;
1251
0
    }
1252
1253
/*
1254
Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1255
(See comment in parseOptions()).
1256
*/
1257
1258
18.7k
    while(true) {
1259
        // If the next character is not whitespace, that means we've already
1260
        // parsed the entire attributes list (which may have been empty) and there's
1261
        // no trailing whitespace. In that case, exit.
1262
18.7k
        if (!isWhitespace(peek())) {
1263
4.86k
            break;
1264
4.86k
        }
1265
1266
        // In any case other than an empty attributes list, there must be at least
1267
        // one whitespace character.
1268
13.9k
        parseRequiredWhitespace(errorCode);
1269
        // Restore precondition
1270
13.9k
        if (!inBounds()) {
1271
11
            ERROR(errorCode);
1272
11
            break;
1273
11
        }
1274
1275
        // If an '@' follows, then at least one more attribute remains
1276
        // in the list.
1277
        // Otherwise, we've consumed all the attributes and any trailing whitespace,
1278
        // and can exit.
1279
        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1280
        // so we back out to [s].
1281
13.9k
        if (peek() != AT) {
1282
            // We've consumed all the attributes (meaning that either we consumed non-empty
1283
            // whitespace, or consumed at least one attribute.)
1284
            // Done.
1285
            // Remove the whitespace from normalizedInput
1286
4.52k
            normalizedInput.truncate(normalizedInput.length() - 1);
1287
4.52k
            break;
1288
4.52k
        }
1289
9.37k
        parseAttribute(attrAdder, errorCode);
1290
9.37k
    }
1291
9.39k
}
1292
1293
/*
1294
  Consume a function call, matching the `annotation`
1295
  nonterminal in the grammar
1296
1297
  Returns an `Operator` representing this (a reserved is a parse error)
1298
*/
1299
53.8k
Operator Parser::parseAnnotation(UErrorCode& status) {
1300
53.8k
    U_ASSERT(inBounds());
1301
53.8k
    Operator::Builder ratorBuilder(status);
1302
53.8k
    if (U_FAILURE(status)) {
1303
0
        return {};
1304
0
    }
1305
53.8k
    if (isFunctionStart(peek())) {
1306
        // Consume the function name
1307
53.8k
        FunctionName func = parseFunction(status);
1308
53.8k
        ratorBuilder.setFunctionName(std::move(func));
1309
1310
53.8k
        OptionAdder<Operator::Builder> addOptions(ratorBuilder);
1311
        // Consume the options (which may be empty)
1312
53.8k
        parseOptions(addOptions, status);
1313
53.8k
    } else {
1314
0
        ERROR(status);
1315
0
    }
1316
53.8k
    return ratorBuilder.build(status);
1317
53.8k
}
1318
1319
/*
1320
  Consume a literal or variable (depending on `isVariable`),
1321
  followed by either required whitespace followed by an annotation,
1322
  or optional whitespace.
1323
*/
1324
void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
1325
                                                  Expression::Builder& builder,
1326
11.6k
                                                  UErrorCode& status) {
1327
11.6k
    CHECK_ERROR(status);
1328
1329
11.6k
    U_ASSERT(inBounds());
1330
1331
11.6k
    Operand rand;
1332
11.6k
    if (isVariable) {
1333
466
        rand = Operand(parseVariableName(status));
1334
11.1k
    } else {
1335
11.1k
        rand = Operand(parseLiteral(status));
1336
11.1k
    }
1337
1338
11.6k
    builder.setOperand(std::move(rand));
1339
1340
/*
1341
Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
1342
To see why, consider this rule from the grammar:
1343
1344
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1345
1346
And this example:
1347
1348
{|foo|   }
1349
1350
Derivation:
1351
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1352
           -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
1353
           -> "{" [s] (literal [s annotation]) [s] "}"
1354
1355
When reading the ' ' after the second '|', it's ambiguous whether that's the required
1356
space before an annotation, or the optional space before the '}'.
1357
1358
To make this ambiguity easier to handle, this code is based on the same grammar
1359
refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
1360
the comment in `parseOptions()` for details.
1361
*/
1362
1363
11.6k
    if (isWhitespace(peek())) {
1364
3.24k
      int32_t firstWhitespace = index;
1365
1366
      // If the next character is whitespace, either [s annotation] or [s] applies
1367
      // (the character is either the required space before an annotation, or optional
1368
      // trailing space after the literal or variable). It's still ambiguous which
1369
      // one does apply.
1370
3.24k
      parseOptionalWhitespace();
1371
      // Restore precondition
1372
3.24k
      CHECK_BOUNDS(status);
1373
1374
      // This next check resolves the ambiguity between [s annotation] and [s]
1375
3.23k
      bool isSAnnotation = isAnnotationStart(peek());
1376
1377
3.23k
      if (isSAnnotation) {
1378
709
        normalizedInput += SPACE;
1379
709
      }
1380
1381
3.23k
      if (isSAnnotation) {
1382
        // The previously consumed whitespace precedes an annotation
1383
709
        builder.setOperator(parseAnnotation(status));
1384
2.52k
      } else {
1385
          // Either there's a right curly brace (will be consumed by the caller),
1386
          // or there's an error and the trailing whitespace should be
1387
          // handled by the caller. However, this is not an error
1388
          // here because we're just parsing `literal [s annotation]`.
1389
2.52k
          index = firstWhitespace;
1390
2.52k
      }
1391
8.36k
    } else {
1392
      // Either there was never whitespace, or
1393
      // the previously consumed whitespace is the optional trailing whitespace;
1394
      // either the next character is '}' or the error will be handled by parseExpression.
1395
      // Do nothing, since the operand was already set
1396
8.36k
    }
1397
1398
    // At the end of this code, the next character should either be '}',
1399
    // whitespace followed by a '}',
1400
    // or end-of-input
1401
11.6k
}
1402
1403
/*
1404
  Consume an expression, matching the `expression` nonterminal in the grammar
1405
*/
1406
1407
1.42k
// Sets the operand of `exprBuilder` to the fallback value: an unquoted-style
// literal (first constructor argument `false`) whose text is just `REPLACEMENT`,
// the U+FFFD REPLACEMENT CHARACTER, per
// https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
static void exprFallback(Expression::Builder& exprBuilder) {
    Literal replacementLiteral(false, UnicodeString(REPLACEMENT));
    exprBuilder.setOperand(Operand(std::move(replacementLiteral)));
}
1412
1413
0
// Builds and returns a fallback expression whose operand is a literal
// consisting just of `REPLACEMENT`, the U+FFFD REPLACEMENT CHARACTER, per
// https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
//
// Returns a default-constructed `Expression` if `status` is already a
// failure on entry, or if constructing the builder fails.
static Expression exprFallback(UErrorCode& status) {
    Expression result;
    if (U_FAILURE(status)) {
        return result;
    }

    Expression::Builder exprBuilder(status);
    if (U_FAILURE(status)) {
        return result;
    }

    exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));

    // build() reports into a separate local code so that the caller's
    // `status` is left alone. (This local was previously also named
    // `status`, shadowing the parameter.)
    UErrorCode localStatus = U_ZERO_ERROR;
    result = exprBuilder.build(localStatus);
    // An operand was set, so there can't be an error
    U_ASSERT(U_SUCCESS(localStatus));

    return result;
}
1429
1430
66.1k
/*
  Consume an expression: '{', optional whitespace, then a literal,
  variable and/or annotation, then attributes, optional whitespace and
  the closing '}'.

  If nothing usable follows the opening brace, the operand is set to the
  fallback value so that an `Expression` can still be built and returned.
*/
Expression Parser::parseExpression(UErrorCode& status) {
    if (U_FAILURE(status)) {
        return {};
    }

    // Callers are expected to have checked for end-of-input
    U_ASSERT(inBounds());

    // Opening brace, then optional whitespace
    parseToken(LEFT_CURLY_BRACE, status);
    parseOptionalWhitespace();

    Expression::Builder exprBuilder(status);

    // Restore precondition, then dispatch on the first character:
    // literal '|', variable '$' or annotation
    if (!inBounds()) {
        exprFallback(exprBuilder);
    } else {
        const auto first = peek();
        if (first == PIPE) {
            // Quoted literal
            parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
        } else if (first == DOLLAR) {
            // Variable
            parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
        } else if (isAnnotationStart(first)) {
            exprBuilder.setOperator(parseAnnotation(status));
        } else if (isUnquotedStart(first)) {
            // Unquoted literal
            parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
        } else {
            // Not a literal, variable or annotation -- error out
            ERROR(status);
            exprFallback(exprBuilder);
        }
    }

    // Attributes
    AttributeAdder<Expression::Builder> attrAdder(exprBuilder);
    parseAttributes(attrAdder, status);

    // The last [s] in e.g.
    // "{" [s] literal [s annotation] *(s attribute) [s] "}"
    parseOptionalWhitespace();

    // Either an operand or operator (or both) must have been set already,
    // so there can't be an error
    UErrorCode localStatus = U_ZERO_ERROR;
    Expression result = exprBuilder.build(localStatus);
    U_ASSERT(U_SUCCESS(localStatus));

    if (inBounds()) {
        // It's safe to check for the '}'
        parseToken(RIGHT_CURLY_BRACE, status);
    } else {
        // End-of-input: the '}' is missing
        ERROR(status);
    }
    return result;
}
1501
1502
/*
1503
  Parse a .local declaration, matching the `local-declaration`
1504
  production in the grammar
1505
*/
1506
14.2k
void Parser::parseLocalDeclaration(UErrorCode& status) {
1507
    // End-of-input here would be an error; even empty
1508
    // declarations must be followed by a body
1509
14.2k
    CHECK_BOUNDS(status);
1510
1511
14.2k
    parseToken(ID_LOCAL, status);
1512
14.2k
    parseRequiredWhitespace(status);
1513
1514
    // Restore precondition
1515
14.2k
    CHECK_BOUNDS(status);
1516
14.2k
    VariableName lhs = parseVariableName(status);
1517
14.2k
    parseTokenWithWhitespace(EQUALS, status);
1518
    // Restore precondition before calling parseExpression()
1519
14.2k
    CHECK_BOUNDS(status);
1520
1521
14.2k
    Expression rhs = parseExpression(status);
1522
1523
    // Add binding from lhs to rhs, unless there was an error
1524
    // (This ensures that if there was a correct lhs but a
1525
    // parse error in rhs, the fallback for uses of the
1526
    // lhs will be its own name rather than the rhs)
1527
    /* This affects the behavior of this test case, which the spec
1528
       is ambiguous about:
1529
1530
       .local $bar {|foo|} {{{$bar}}}
1531
1532
       Should `$bar` still be bound to a value although
1533
       its declaration is syntactically incorrect (missing the '=')?
1534
       This code says no, but it needs to change if
1535
       https://github.com/unicode-org/message-format-wg/issues/703
1536
       is resolved differently.
1537
    */
1538
14.2k
    CHECK_ERROR(status);
1539
14.2k
    if (!errors.hasSyntaxError()) {
1540
0
        dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
1541
        // Check if status is U_DUPLICATE_DECLARATION_ERROR
1542
        // and add that as an internal error if so
1543
0
        if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1544
0
            status = U_ZERO_ERROR;
1545
0
            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1546
0
        }
1547
0
    }
1548
14.2k
}
1549
1550
/*
1551
  Parse an .input declaration, matching the `local-declaration`
1552
  production in the grammar
1553
*/
1554
41.5k
void Parser::parseInputDeclaration(UErrorCode& status) {
1555
    // End-of-input here would be an error; even empty
1556
    // declarations must be followed by a body
1557
41.5k
    CHECK_BOUNDS(status);
1558
1559
41.5k
    parseToken(ID_INPUT, status);
1560
41.5k
    parseOptionalWhitespace();
1561
1562
    // Restore precondition before calling parseExpression()
1563
41.5k
    CHECK_BOUNDS(status);
1564
1565
    // Save the index for error diagnostics
1566
41.5k
    int32_t exprIndex = index;
1567
41.5k
    Expression rhs = parseExpression(status);
1568
1569
    // Here we have to check that the rhs is a variable-expression
1570
41.5k
    if (!rhs.getOperand().isVariable()) {
1571
        // This case is a syntax error; report it at the beginning
1572
        // of the expression
1573
41.1k
        ERROR_AT(status, exprIndex);
1574
41.1k
        return;
1575
41.1k
    }
1576
1577
423
    VariableName lhs = rhs.getOperand().asVariable();
1578
1579
    // Add binding from lhs to rhs
1580
    // This just adds a new local variable that shadows the message
1581
    // argument referred to, which is harmless.
1582
    // When evaluating the RHS, the new local is not in scope
1583
    // and the message argument will be correctly referred to.
1584
423
    CHECK_ERROR(status);
1585
423
    if (!errors.hasSyntaxError()) {
1586
0
        dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
1587
        // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
1588
        // and add that as an internal error if so
1589
0
        if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1590
0
            status = U_ZERO_ERROR;
1591
0
            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1592
0
        }
1593
0
    }
1594
423
}
1595
1596
/*
1597
  Consume a possibly-empty sequence of declarations separated by whitespace;
1598
  each declaration matches the `declaration` nonterminal in the grammar
1599
1600
  Builds up an environment representing those declarations
1601
*/
1602
1.06k
void Parser::parseDeclarations(UErrorCode& status) {
1603
    // End-of-input here would be an error; even empty
1604
    // declarations must be followed by a body
1605
1.06k
    CHECK_BOUNDS(status);
1606
1607
56.7k
    while (peek() == PERIOD) {
1608
56.5k
        CHECK_BOUNDS_1(status);
1609
56.5k
        if (peek(1) == ID_LOCAL[1]) {
1610
14.2k
            parseLocalDeclaration(status);
1611
42.2k
        } else if (peek(1) == ID_INPUT[1]) {
1612
41.5k
            parseInputDeclaration(status);
1613
41.5k
        } else {
1614
            // Done parsing declarations
1615
709
            break;
1616
709
        }
1617
1618
        // Avoid looping infinitely
1619
55.8k
        CHECK_ERROR(status);
1620
1621
55.8k
        parseOptionalWhitespace();
1622
        // Restore precondition
1623
55.8k
        CHECK_BOUNDS(status);
1624
55.6k
    }
1625
1.06k
}
1626
1627
/*
1628
  Consume a text character
1629
  matching the `text-char` nonterminal in the grammar
1630
1631
  No postcondition (a message can end with a text-char)
1632
*/
1633
11.5M
UnicodeString Parser::parseTextChar(UErrorCode& status) {
1634
11.5M
    UnicodeString str;
1635
11.5M
    if (!inBounds() || !(isTextChar(peek()))) {
1636
        // Error -- text-char is expected here
1637
235
        ERROR(status);
1638
11.5M
    } else {
1639
        // See comment in parseQuotedLiteral()
1640
11.5M
        if (isEscapableChar(peek())) {
1641
76.2k
            normalizedInput += BACKSLASH;
1642
76.2k
        }
1643
11.5M
        normalizedInput += peek();
1644
11.5M
        str += peek();
1645
11.5M
        next();
1646
11.5M
        maybeAdvanceLine();
1647
11.5M
    }
1648
11.5M
    return str;
1649
11.5M
}
1650
1651
/*
1652
  Consume an `nmtoken`, `literal`, or the string "*", matching
1653
  the `key` nonterminal in the grammar
1654
*/
1655
4.03M
Key Parser::parseKey(UErrorCode& status) {
1656
4.03M
    U_ASSERT(inBounds());
1657
1658
4.03M
    Key k; // wildcard by default
1659
    // Literal | '*'
1660
4.03M
    switch (peek()) {
1661
1.07k
    case ASTERISK: {
1662
1.07k
        next();
1663
1.07k
        normalizedInput += ASTERISK;
1664
        // Guarantee postcondition
1665
1.07k
        if (!inBounds()) {
1666
9
            ERROR(status);
1667
9
            return k;
1668
9
        }
1669
1.06k
        break;
1670
1.07k
    }
1671
4.03M
    default: {
1672
        // Literal
1673
4.03M
        k = Key(parseLiteral(status));
1674
4.03M
        break;
1675
1.07k
    }
1676
4.03M
    }
1677
4.03M
    return k;
1678
4.03M
}
1679
1680
/*
1681
  Consume a non-empty sequence of `key`s separated by whitespace
1682
1683
  Takes ownership of `keys`
1684
*/
1685
553
SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
1686
553
    SelectorKeys result;
1687
1688
553
    if (U_FAILURE(status)) {
1689
0
        return result;
1690
0
    }
1691
1692
553
    U_ASSERT(inBounds());
1693
1694
/*
1695
Arbitrary lookahead is required to parse key lists. To see why, consider
1696
this rule from the grammar:
1697
1698
variant = key *(s key) [s] quoted-pattern
1699
1700
And this example:
1701
when k1 k2   {a}
1702
1703
Derivation:
1704
   variant -> key *(s key) [s] quoted-pattern
1705
           -> key s key *(s key) quoted-pattern
1706
1707
After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
1708
to know whether to expect the start of a pattern or the start of another key.
1709
In other words: is the second whitespace sequence the required space in *(s key),
1710
or the optional space in [s] quoted-pattern?
1711
1712
This is addressed using "backtracking" (similarly to `parseOptions()`).
1713
*/
1714
1715
553
    SelectorKeys::Builder keysBuilder(status);
1716
553
    if (U_FAILURE(status)) {
1717
0
        return result;
1718
0
    }
1719
1720
    // Since the first key is required, it's simplest to parse it separately.
1721
553
    keysBuilder.add(parseKey(status), status);
1722
1723
    // Restore precondition
1724
553
    if (!inBounds()) {
1725
47
        ERROR(status);
1726
47
        return result;
1727
47
    }
1728
1729
    // We've seen at least one whitespace-key pair, so now we can parse
1730
    // *(s key) [s]
1731
4.03M
    while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) {
1732
4.03M
        bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek());
1733
4.03M
        parseRequiredWhitespace(status);
1734
4.03M
        if (!wasWhitespace) {
1735
            // Avoid infinite loop when parsing something like:
1736
            // when * @{!...
1737
4.02M
            next();
1738
4.02M
        }
1739
1740
        // Restore precondition
1741
4.03M
        if (!inBounds()) {
1742
224
            ERROR(status);
1743
224
            return result;
1744
224
        }
1745
1746
        // At this point, it's ambiguous whether we are inside (s key) or [s].
1747
        // This check resolves that ambiguity.
1748
4.03M
        if (peek() == LEFT_CURLY_BRACE) {
1749
            // A pattern follows, so what we just parsed was the optional
1750
            // trailing whitespace. All the keys have been parsed.
1751
1752
            // Unpush the whitespace from `normalizedInput`
1753
107
            normalizedInput.truncate(normalizedInput.length() - 1);
1754
107
            break;
1755
107
        }
1756
4.03M
        keysBuilder.add(parseKey(status), status);
1757
4.03M
    }
1758
1759
282
    return keysBuilder.build(status);
1760
506
}
1761
1762
296
/*
  Consume a pattern enclosed in double curly braces:
  "{{", then a simple message, then "}}".
*/
Pattern Parser::parseQuotedPattern(UErrorCode& status) {
    U_ASSERT(inBounds());

    // Opening "{{"
    parseToken(LEFT_CURLY_BRACE, status);
    parseToken(LEFT_CURLY_BRACE, status);

    Pattern body = parseSimpleMessage(status);

    // Closing "}}"
    parseToken(RIGHT_CURLY_BRACE, status);
    parseToken(RIGHT_CURLY_BRACE, status);

    return body;
}
1772
1773
/*
1774
  Consume a `placeholder`, matching the nonterminal in the grammar
1775
  No postcondition (a markup can end a message)
1776
*/
1777
15.2k
Markup Parser::parseMarkup(UErrorCode& status) {
1778
15.2k
    U_ASSERT(inBounds(1));
1779
1780
15.2k
    U_ASSERT(peek() == LEFT_CURLY_BRACE);
1781
1782
15.2k
    Markup::Builder builder(status);
1783
15.2k
    if (U_FAILURE(status)) {
1784
0
        return {};
1785
0
    }
1786
1787
    // Consume the '{'
1788
15.2k
    next();
1789
15.2k
    normalizedInput += LEFT_CURLY_BRACE;
1790
15.2k
    parseOptionalWhitespace();
1791
15.2k
    bool closing = false;
1792
15.2k
    switch (peek()) {
1793
13.2k
    case NUMBER_SIGN: {
1794
        // Open or standalone; consume the '#'
1795
13.2k
        normalizedInput += peek();
1796
13.2k
        next();
1797
13.2k
        break;
1798
0
    }
1799
1.94k
    case SLASH: {
1800
        // Closing
1801
1.94k
        normalizedInput += peek();
1802
1.94k
        closing = true;
1803
1.94k
        next();
1804
1.94k
        break;
1805
0
    }
1806
0
    default: {
1807
0
        ERROR(status);
1808
0
        return {};
1809
0
    }
1810
15.2k
    }
1811
1812
    // Parse the markup identifier
1813
15.2k
    builder.setName(parseIdentifier(status));
1814
1815
    // Parse the options, which must begin with a ' '
1816
    // if present
1817
15.2k
    if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
1818
9.95k
        OptionAdder<Markup::Builder> optionAdder(builder);
1819
9.95k
        parseOptions(optionAdder, status);
1820
9.95k
    }
1821
1822
    // Parse the attributes, which also must begin
1823
    // with a ' '
1824
15.2k
    if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
1825
9.39k
        AttributeAdder<Markup::Builder> attrAdder(builder);
1826
9.39k
        parseAttributes(attrAdder, status);
1827
9.39k
    }
1828
1829
15.2k
    parseOptionalWhitespace();
1830
1831
15.2k
    bool standalone = false;
1832
    // Check if this is a standalone or not
1833
15.2k
    if (!closing) {
1834
13.2k
        if (inBounds() && peek() == SLASH) {
1835
311
            standalone = true;
1836
311
            normalizedInput += SLASH;
1837
311
            next();
1838
311
        }
1839
13.2k
    }
1840
1841
15.2k
    parseToken(RIGHT_CURLY_BRACE, status);
1842
1843
15.2k
    if (standalone) {
1844
311
        builder.setStandalone();
1845
14.9k
    } else if (closing) {
1846
1.94k
        builder.setClose();
1847
12.9k
    } else {
1848
12.9k
        builder.setOpen();
1849
12.9k
    }
1850
1851
15.2k
    return builder.build(status);
1852
15.2k
}
1853
1854
/*
1855
  Consume a `placeholder`, matching the nonterminal in the grammar
1856
  No postcondition (a placeholder can end a message)
1857
*/
1858
25.5k
std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
1859
25.5k
    U_ASSERT(peek() == LEFT_CURLY_BRACE);
1860
1861
25.5k
    if (!inBounds()) {
1862
0
        ERROR(status);
1863
0
        return exprFallback(status);
1864
0
    }
1865
1866
    // Need to look ahead arbitrarily since whitespace
1867
    // can appear before the '{' and '#'
1868
    // in markup
1869
25.5k
    int32_t tempIndex = 1;
1870
25.5k
    bool isMarkup = false;
1871
26.3k
    while (inBounds(1)) {
1872
26.3k
        UChar32 c = peek(tempIndex);
1873
26.3k
        if (c == NUMBER_SIGN || c == SLASH) {
1874
15.2k
            isMarkup = true;
1875
15.2k
            break;
1876
15.2k
        }
1877
11.1k
        if (!(isWhitespace(c) || isBidiControl(c))) {
1878
10.3k
            break;
1879
10.3k
        }
1880
809
        tempIndex++;
1881
809
    }
1882
1883
25.5k
    if (isMarkup) {
1884
15.2k
        return parseMarkup(status);
1885
15.2k
    }
1886
10.3k
    return parseExpression(status);
1887
25.5k
}
1888
1889
/*
1890
  Consume a `simple-message`, matching the nonterminal in the grammar
1891
  Postcondition: `index == len()` or U_FAILURE(status);
1892
  for a syntactically correct message, this will consume the entire input
1893
*/
1894
6.35k
Pattern Parser::parseSimpleMessage(UErrorCode& status) {
1895
6.35k
    Pattern::Builder result(status);
1896
1897
6.35k
    if (U_SUCCESS(status)) {
1898
6.35k
        Expression expression;
1899
11.5M
        while (inBounds()) {
1900
11.5M
            switch (peek()) {
1901
25.5k
            case LEFT_CURLY_BRACE: {
1902
                // Must be placeholder
1903
25.5k
                std::variant<Expression, Markup> piece = parsePlaceholder(status);
1904
25.5k
                if (std::holds_alternative<Expression>(piece)) {
1905
10.3k
                    Expression expr = *std::get_if<Expression>(&piece);
1906
10.3k
                    result.add(std::move(expr), status);
1907
15.2k
                } else {
1908
15.2k
                    Markup markup = *std::get_if<Markup>(&piece);
1909
15.2k
                    result.add(std::move(markup), status);
1910
15.2k
                }
1911
25.5k
                break;
1912
0
            }
1913
1.39k
            case BACKSLASH: {
1914
                // Must be escaped-char
1915
1.39k
                result.add(parseEscapeSequence(status), status);
1916
1.39k
                break;
1917
0
            }
1918
10
            case RIGHT_CURLY_BRACE: {
1919
                // Distinguish unescaped '}' from end of quoted pattern
1920
10
                break;
1921
0
            }
1922
11.5M
            default: {
1923
                // Must be text-char
1924
11.5M
                result.add(parseTextChar(status), status);
1925
11.5M
                break;
1926
0
            }
1927
11.5M
            }
1928
11.5M
            if (peek() == RIGHT_CURLY_BRACE) {
1929
                // End of quoted pattern
1930
157
                break;
1931
157
            }
1932
            // Don't loop infinitely
1933
11.5M
            if (errors.hasSyntaxError() || U_FAILURE(status)) {
1934
5.64k
                break;
1935
5.64k
            }
1936
11.5M
        }
1937
6.35k
    }
1938
6.35k
    return result.build(status);
1939
6.35k
}
1940
1941
555
/*
  Consume one variant: a non-empty key list followed by a quoted
  pattern. The variant is then added to the data model.
*/
void Parser::parseVariant(UErrorCode& status) {
    CHECK_ERROR(status);

    // At least one key is required. parseNonEmptyKeys() also consumes
    // any trailing whitespace, so the pattern can be consumed next.
    SelectorKeys keys = parseNonEmptyKeys(status);

    // Restore precondition before calling parseQuotedPattern()
    CHECK_BOUNDS(status);
    Pattern pattern = parseQuotedPattern(status);

    dataModel.addVariant(std::move(keys), std::move(pattern), status);
}
1957
1958
/*
1959
  Consume a `selectors` (matching the nonterminal in the grammar),
1960
  followed by a non-empty sequence of `variant`s (matching the nonterminal
1961
  in the grammar) preceded by whitespace
1962
  No postcondition (on return, `index` might equal `len()` with no syntax error
1963
  because a message can end with a variant)
1964
*/
1965
730
/*
  Consume the `.match` keyword, then one or more selector variables, then
  one or more variants:

      matcher = match-statement s variant *(o variant)

  Each selector is added to `dataModel` with addSelector(); each variant is
  handled by parseVariant().

  Precondition: the parser is in bounds (asserted).
  No postcondition: on return `index` may equal the input length without a
  syntax error, because a message may end with a variant.
*/
void Parser::parseSelectors(UErrorCode& status) {
    CHECK_ERROR(status);

    U_ASSERT(inBounds());

    parseToken(ID_MATCH, status);

    bool empty = true;
    // Parse selectors
    // "Backtracking" is required here. It's not clear if whitespace is
    // (`[s]` selector) or (`[s]` variant)
    //
    // The explicit inBounds() test keeps the loop head from calling peek()
    // at the end of the input (e.g. when the message ends right after
    // `.match` or right after a selector variable).
    while (inBounds() && (isWhitespace(peek()) || peek() == DOLLAR)) {
        int32_t whitespaceStart = index;
        parseRequiredWhitespace(status);
        // Restore precondition
        CHECK_BOUNDS(status);
        if (peek() != DOLLAR) {
            // This is not necessarily an error, but rather,
            // means the whitespace we parsed was the optional
            // whitespace preceding the first variant, not the
            // required whitespace preceding a subsequent variable.
            // In that case, "push back" the whitespace: drop the last
            // character appended to the normalized input and rewind `index`.
            normalizedInput.truncate(normalizedInput.length() - 1);
            index = whitespaceStart;
            break;
        }
        VariableName var = parseVariableName(status);
        empty = false;

        dataModel.addSelector(std::move(var), status);
        CHECK_ERROR(status);
    }

    // At least one selector is required
    if (empty) {
        ERROR(status);
        return;
    }

    // Parse variants
    // matcher = match-statement s variant *(o variant)

    // Parse first variant; running out of input before it is a syntax error
    parseRequiredWhitespace(status);
    if (!inBounds()) {
        ERROR(status);
        return;
    }
    parseVariant(status);
    if (!inBounds()) {
        // Not an error; there might be only one variant
        return;
    }

    while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) {
        parseOptionalWhitespace();
        // Restore the precondition.
        // Trailing whitespace is allowed.
        if (!inBounds()) {
            return;
        }

        parseVariant(status);

        // Restore the precondition, *without* erroring out if we've
        // reached the end of input. That's because it's valid for the
        // message to end with a variant that has no trailing whitespace.
        // Why do we need to check this condition twice inside the loop?
        // Because if we don't check it here, the `isWhitespace()` call in
        // the loop head will read off the end of the input string.
        if (!inBounds()) {
            break;
        }

        // Don't keep looping once an error has been recorded
        if (errors.hasSyntaxError() || U_FAILURE(status)) {
            break;
        }
    }
}
2047
2048
/*
2049
  parseBody(), defined after errorPattern() below: consume a `body` (matching the nonterminal in the grammar).
2050
  No postcondition (on return, `index` might equal `len()` with no syntax error,
2051
  because a message can end with a body; trailing whitespace is optional)
2052
*/
2053
2054
318
/*
  Record a syntax error and install a fallback pattern in the data model.

  The fallback pattern has a single text part: a '{', followed by every
  character of the input from the current position to the end, followed by
  a '}'. All remaining input is consumed as a side effect.
*/
void Parser::errorPattern(UErrorCode& status) {
    errors.addSyntaxError(status);

    // Start from an empty pattern
    Pattern::Builder fallback(status);
    CHECK_ERROR(status);

    // If still in bounds, then add the remaining input as a single text part
    // to the pattern
    /*
      TODO: this behavior isn't documented in the spec, but it comes from
      https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
      and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
      whether this is the intent behind the spec
     */
    UnicodeString wrapped(LEFT_CURLY_BRACE);
    for (; inBounds(); next()) {
        wrapped += peek();
    }
    // Close the curly braces around the entire output (see the TODO above)
    wrapped += RIGHT_CURLY_BRACE;

    fallback.add(std::move(wrapped), status);
    dataModel.setPattern(fallback.build(status));
}
2078
2079
1.06k
/*
  Consume a `body` (matching the nonterminal in the grammar): either a
  quoted pattern or a `.match` construct.

  If the input is already exhausted, or the next character starts neither
  alternative, errorPattern() is called to install a fallback pattern.

  No postcondition: on return `index` might equal the input length with no
  syntax error, because a message can end with a body.
*/
void Parser::parseBody(UErrorCode& status) {
    CHECK_ERROR(status);

    // Out-of-input is a syntax warning
    if (!inBounds()) {
        errorPattern(status);
        return;
    }

    // Body must be either a pattern or selectors; the first character decides
    const auto lookahead = peek();

    if (lookahead == LEFT_CURLY_BRACE) {
        // Pattern
        dataModel.setPattern(parseQuotedPattern(status));
    } else if (lookahead == ID_MATCH[0]) {
        // Selectors
        parseSelectors(status);
    } else {
        // Neither alternative: flag the error at the current position,
        // then fall back to the error pattern
        ERROR(status);
        errorPattern(status);
    }
}
2107
2108
// -------------------------------------
2109
// Parses the source pattern.
2110
2111
7.12k
/*
  Parse the whole source message.

  The message is first classified as "complex" or "simple" by looking at
  the first character after any leading whitespace / bidi controls:
  a '.' or a "{{" means complex. The index is then reset to 0 and the
  matching parse routine runs from the beginning.

  `parseErrorResult` receives the translated contents of the internal
  `parseError`, but only when the final CHECK_ERROR does not return early.
*/
void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
    CHECK_ERROR(status);

    // First, "look ahead" to determine if this is a simple or complex
    // message. To do that, skip to the first character that is neither
    // whitespace nor a bidi control.
    for (;;) {
        if (!inBounds(index)) {
            break;
        }
        if (!isWhitespace(peek()) && !isBidiControl(peek())) {
            break;
        }
        next();
    }

    // Message can be empty (or all whitespace), so only inspect the
    // lookahead character(s) if something is left.
    // NOTE(review): `inBounds(index)` above and `inBounds(1)` below suggest
    // different meanings for the argument (absolute position vs. offset from
    // `index`) -- confirm against the declaration of inBounds(i) that each
    // call really guards the peek()/peek(1) that follows it.
    const bool isComplex =
        inBounds()
        && (peek() == PERIOD
            || (inBounds(1)
                && peek() == LEFT_CURLY_BRACE
                && peek(1) == LEFT_CURLY_BRACE));

    // The lookahead only moved `index`; rewind so parsing starts at the beginning
    index = 0;

    if (!isComplex) {
        // Simple message
        // For normalization, quote the pattern
        normalizedInput += LEFT_CURLY_BRACE;
        normalizedInput += LEFT_CURLY_BRACE;
        dataModel.setPattern(parseSimpleMessage(status));
        normalizedInput += RIGHT_CURLY_BRACE;
        normalizedInput += RIGHT_CURLY_BRACE;
    } else {
        // Complex message: optional whitespace, declarations, body,
        // optional whitespace
        parseOptionalWhitespace();
        parseDeclarations(status);
        parseBody(status);
        parseOptionalWhitespace();
    }

    CHECK_ERROR(status);

    // `status` is not a failure; finally, check that the entire input was
    // consumed. Leftover input is a syntax error.
    if (!allConsumed()) {
        ERROR(status);
    }

    // Finally, copy the relevant fields of the internal `MessageParseError`
    // into the `UParseError` argument
    translateParseError(parseError, parseErrorResult);
}
2162
2163
7.12k
// Out-of-line destructor; there is no cleanup beyond destroying the members.
Parser::~Parser() = default;
2164
2165
} // namespace message2
2166
U_NAMESPACE_END
2167
2168
#endif /* #if !UCONFIG_NO_MF2 */
2169
2170
#endif /* #if !UCONFIG_NO_FORMATTING */
2171
2172
#endif /* #if !UCONFIG_NO_NORMALIZATION */