Coverage Report

Created: 2026-05-06 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/messageformat2_parser.cpp
Line
Count
Source
1
// © 2024 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_NORMALIZATION
7
8
#if !UCONFIG_NO_FORMATTING
9
10
#if !UCONFIG_NO_MF2
11
12
#include "unicode/uniset.h"
13
#include "messageformat2_errors.h"
14
#include "messageformat2_macros.h"
15
#include "messageformat2_parser.h"
16
#include "ucln_in.h"
17
#include "umutex.h"
18
#include "uvector.h" // U_ASSERT
19
20
U_NAMESPACE_BEGIN
21
22
namespace message2 {
23
24
using namespace pluralimpl;
25
26
using namespace data_model;
27
28
/*
29
    The `ERROR()` macro sets a syntax error in the context
30
    and sets the offset in `parseError` to `index`. It does not alter control flow.
31
*/
32
#define ERROR(errorCode)                                                                                \
33
17.3M
    if (!errors.hasSyntaxError()) {                                                                     \
34
6.53k
        setParseError(parseError, index);                                                               \
35
6.53k
        errors.addSyntaxError(errorCode);                                                               \
36
6.53k
    }
37
38
#define ERROR_AT(errorCode, i)                                                                          \
39
25.4k
    if (!errors.hasSyntaxError()) {                                                                     \
40
7
        setParseError(parseError, i);                                                                   \
41
7
        errors.addSyntaxError(errorCode);                                                               \
42
7
    }
43
44
// Increments the line number and updates the "characters seen before
45
// current line" count in `parseError`, iff `peek()` is a newline
46
9.73M
void Parser::maybeAdvanceLine() {
47
9.73M
    if (peek() == LF) {
48
159k
        parseError.line++;
49
        // add 1 to index to get the number of characters seen so far
50
        // (including the newline)
51
159k
        parseError.lengthBeforeCurrentLine = index + 1;
52
159k
    }
53
9.73M
}
54
55
/*
56
    Signals an error and returns either if `parseError` already denotes an
57
    error, or `index` is out of bounds for the string `source`
58
*/
59
#define CHECK_BOUNDS(errorCode)                                                            \
60
2.23M
    if (!inBounds()) {                                                                     \
61
1.50k
        ERROR(errorCode);                                                                  \
62
1.50k
        return;                                                                            \
63
1.50k
    }
64
#define CHECK_BOUNDS_1(errorCode)                                                          \
65
65.0k
    if (!inBounds(1)) {                                                                    \
66
23
        ERROR_AT(errorCode, index + 1);                                                    \
67
23
        return;                                                                            \
68
23
    }
69
70
// -------------------------------------
71
// Helper functions
72
73
13.5k
static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
74
13.5k
    for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) {
75
13.5k
        out[i] = in[i];
76
13.5k
        if (in[i] == '\0') {
77
13.5k
            break;
78
13.5k
        }
79
13.5k
    }
80
13.5k
}
81
82
6.76k
/* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
83
6.76k
    parseError.line = messageParseError.line;
84
6.76k
    parseError.offset = messageParseError.offset;
85
6.76k
    copyContext(messageParseError.preContext, parseError.preContext);
86
6.76k
    copyContext(messageParseError.postContext, parseError.postContext);
87
6.76k
}
88
89
6.53k
/* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
90
    // Translate absolute to relative offset
91
6.53k
    parseError.offset = index                               // Start with total number of characters seen
92
6.53k
                      - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line
93
    // TODO: Fill this in with actual pre and post-context
94
6.53k
    parseError.preContext[0] = 0;
95
6.53k
    parseError.postContext[0] = 0;
96
6.53k
}
97
98
// -------------------------------------
99
// Initialization of UnicodeSets
100
101
namespace unisets {
102
103
UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {};
104
105
70.2k
inline UnicodeSet* getImpl(Key key) {
106
70.2k
    return gUnicodeSets[key];
107
70.2k
}
108
109
icu::UInitOnce gMF2ParseUniSetsInitOnce {};
110
}
111
112
1
UnicodeSet* initContentChars(UErrorCode& status) {
113
1
    if (U_FAILURE(status)) {
114
0
        return nullptr;
115
0
    }
116
117
1
    UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF
118
1
    if (result == nullptr) {
119
0
        status = U_MEMORY_ALLOCATION_ERROR;
120
0
        return nullptr;
121
0
    }
122
1
    result->add(0x000B, 0x000C); // Omit CR
123
1
    result->add(0x000E, 0x001F); // Omit SP
124
1
    result->add(0x0021, 0x002D); // Omit '.'
125
1
    result->add(0x002F, 0x003F); // Omit '@'
126
1
    result->add(0x0041, 0x005B); // Omit '\'
127
1
    result->add(0x005D, 0x007A); // Omit { | }
128
1
    result->add(0x007E, 0x2FFF); // Omit IDEOGRAPHIC_SPACE
129
1
    result->add(0x3001, 0x10FFFF); // Allowing surrogates is intentional
130
1
    result->freeze();
131
1
    return result;
132
1
}
133
134
1
UnicodeSet* initWhitespace(UErrorCode& status) {
135
1
    if (U_FAILURE(status)) {
136
0
        return nullptr;
137
0
    }
138
139
1
    UnicodeSet* result = new UnicodeSet();
140
1
    if (result == nullptr) {
141
0
        status = U_MEMORY_ALLOCATION_ERROR;
142
0
        return nullptr;
143
0
    }
144
1
    result->add(SPACE);
145
1
    result->add(HTAB);
146
1
    result->add(CR);
147
1
    result->add(LF);
148
1
    result->add(IDEOGRAPHIC_SPACE);
149
1
    result->freeze();
150
1
    return result;
151
1
}
152
153
1
UnicodeSet* initBidiControls(UErrorCode& status) {
154
1
    UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status);
155
1
    if (U_FAILURE(status)) {
156
0
        return nullptr;
157
0
    }
158
1
    result->add(0x200E, 0x200F);
159
1
    result->add(0x2066, 0x2069);
160
1
    result->freeze();
161
1
    return result;
162
1
}
163
164
1
UnicodeSet* initAlpha(UErrorCode& status) {
165
1
    UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status);
166
1
    if (U_FAILURE(status)) {
167
0
        return nullptr;
168
0
    }
169
1
    result->freeze();
170
1
    return result;
171
1
}
172
173
1
UnicodeSet* initDigits(UErrorCode& status) {
174
1
    UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status);
175
1
    if (U_FAILURE(status)) {
176
0
        return nullptr;
177
0
    }
178
1
    result->freeze();
179
1
    return result;
180
1
}
181
182
1
UnicodeSet* initNameStartChars(UErrorCode& status) {
183
1
    if (U_FAILURE(status)) {
184
0
        return nullptr;
185
0
    }
186
187
1
    UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status);
188
1
    if (U_FAILURE(status)) {
189
0
        return nullptr;
190
0
    }
191
1
    UnicodeSet* result = new UnicodeSet();
192
1
    if (result == nullptr) {
193
0
        status = U_MEMORY_ALLOCATION_ERROR;
194
0
        return nullptr;
195
1
    };
196
197
1
    result->addAll(*isAlpha);
198
1
    result->add(0x002B);
199
1
    result->add(0x005F);
200
1
    result->add(0x00A1, 0x061B);
201
1
    result->add(0x061D, 0x167F);
202
1
    result->add(0x1681, 0x1FFF);
203
1
    result->add(0x200B, 0x200D);
204
1
    result->add(0x2010, 0x2027);
205
1
    result->add(0x2030, 0x205E);
206
1
    result->add(0x2060, 0x2065);
207
1
    result->add(0x206A, 0x2FFF);
208
1
    result->add(0x3001, 0xD7FF);
209
1
    result->add(0xE000, 0xFDCF);
210
1
    result->add(0xFDF0, 0xFFFD);
211
1
    result->add(0x10000, 0x1FFFD);
212
1
    result->add(0x20000, 0x2FFFD);
213
1
    result->add(0x30000, 0x3FFFD);
214
1
    result->add(0x40000, 0x4FFFD);
215
1
    result->add(0x50000, 0x5FFFD);
216
1
    result->add(0x60000, 0x6FFFD);
217
1
    result->add(0x70000, 0x7FFFD);
218
1
    result->add(0x80000, 0x8FFFD);
219
1
    result->add(0x90000, 0x9FFFD);
220
1
    result->add(0xA0000, 0xAFFFD);
221
1
    result->add(0xB0000, 0xBFFFD);
222
1
    result->add(0xC0000, 0xCFFFD);
223
1
    result->add(0xD0000, 0xDFFFD);
224
1
    result->add(0xE0000, 0xEFFFD);
225
1
    result->add(0xF0000, 0xFFFFD);
226
1
    result->add(0x100000, 0x10FFFD);
227
1
    result->freeze();
228
1
    return result;
229
1
}
230
231
1
UnicodeSet* initNameChars(UErrorCode& status) {
232
1
    if (U_FAILURE(status)) {
233
0
        return nullptr;
234
0
    }
235
236
1
    UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status);
237
1
    UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status);
238
1
    if (U_FAILURE(status)) {
239
0
        return nullptr;
240
0
    }
241
1
    UnicodeSet* result = new UnicodeSet();
242
1
    if (result == nullptr) {
243
0
        status = U_MEMORY_ALLOCATION_ERROR;
244
0
        return nullptr;
245
1
    };
246
1
    result->addAll(*nameStart);
247
1
    result->addAll(*digit);
248
1
    result->add(HYPHEN);
249
1
    result->add(PERIOD);
250
1
    result->freeze();
251
1
    return result;
252
1
}
253
254
1
UnicodeSet* initTextChars(UErrorCode& status) {
255
1
    if (U_FAILURE(status)) {
256
0
        return nullptr;
257
0
    }
258
259
1
    UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status);
260
1
    UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status);
261
1
    if (U_FAILURE(status)) {
262
0
        return nullptr;
263
0
    }
264
1
    UnicodeSet* result = new UnicodeSet();
265
1
    if (result == nullptr) {
266
0
        status = U_MEMORY_ALLOCATION_ERROR;
267
0
        return nullptr;
268
1
    };
269
1
    result->addAll(*content);
270
1
    result->addAll(*whitespace);
271
1
    result->add(PERIOD);
272
1
    result->add(AT);
273
1
    result->add(PIPE);
274
1
    result->freeze();
275
1
    return result;
276
1
}
277
278
1
UnicodeSet* initQuotedChars(UErrorCode& status) {
279
1
    if (U_FAILURE(status)) {
280
0
        return nullptr;
281
0
    }
282
283
1
    unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status);
284
1
    if (U_FAILURE(status)) {
285
0
        return nullptr;
286
0
    }
287
1
    UnicodeSet* result = new UnicodeSet();
288
1
    if (result == nullptr) {
289
0
        status = U_MEMORY_ALLOCATION_ERROR;
290
0
        return nullptr;
291
1
    };
292
    // content and whitespace were initialized by `initTextChars()`
293
1
    UnicodeSet* content = unisets::getImpl(unisets::CONTENT);
294
1
    if (content == nullptr) {
295
0
        status = U_MEMORY_ALLOCATION_ERROR;
296
0
        return nullptr;
297
0
    }
298
1
    result->addAll(*content);
299
1
    UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE);
300
1
    if (whitespace == nullptr) {
301
0
        status = U_MEMORY_ALLOCATION_ERROR;
302
0
        return nullptr;
303
0
    }
304
1
    result->addAll(*whitespace);
305
1
    result->add(PERIOD);
306
1
    result->add(AT);
307
1
    result->add(LEFT_CURLY_BRACE);
308
1
    result->add(RIGHT_CURLY_BRACE);
309
1
    result->freeze();
310
1
    return result;
311
1
}
312
313
1
UnicodeSet* initEscapableChars(UErrorCode& status) {
314
1
    if (U_FAILURE(status)) {
315
0
        return nullptr;
316
0
    }
317
318
1
    UnicodeSet* result = new UnicodeSet();
319
1
    if (result == nullptr) {
320
0
        status = U_MEMORY_ALLOCATION_ERROR;
321
0
        return nullptr;
322
0
    }
323
1
    result->add(PIPE);
324
1
    result->add(BACKSLASH);
325
1
    result->add(LEFT_CURLY_BRACE);
326
1
    result->add(RIGHT_CURLY_BRACE);
327
1
    result->freeze();
328
1
    return result;
329
1
}
330
331
namespace unisets {
332
333
0
UBool U_CALLCONV cleanupMF2ParseUniSets() {
334
0
    for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
335
0
        delete gUnicodeSets[i];
336
0
        gUnicodeSets[i] = nullptr;
337
0
    }
338
0
    gMF2ParseUniSetsInitOnce.reset();
339
0
    return true;
340
0
}
341
342
1
void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) {
343
1
    ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets);
344
    /*
345
      Each of the init functions initializes the UnicodeSets
346
      that it depends on.
347
348
      initBidiControls (no dependencies)
349
350
      initEscapableChars (no dependencies)
351
352
      initNameChars depends on
353
         initDigits
354
         initNameStartChars depends on
355
           initAlpha
356
357
      initQuotedChars depends on
358
         initTextChars depends on
359
            initContentChars
360
            initWhitespace
361
     */
362
1
    gUnicodeSets[unisets::BIDI] = initBidiControls(status);
363
1
    gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status);
364
1
    gUnicodeSets[unisets::QUOTED] = initQuotedChars(status);
365
1
    gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status);
366
367
1
    if (U_FAILURE(status)) {
368
0
        cleanupMF2ParseUniSets();
369
0
    }
370
1
}
371
372
70.2k
const UnicodeSet* get(Key key, UErrorCode& status) {
373
70.2k
    umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status);
374
70.2k
    if (U_FAILURE(status)) {
375
0
        return nullptr;
376
0
    }
377
70.2k
    UnicodeSet* result = getImpl(key);
378
70.2k
    if (result == nullptr) {
379
0
        status = U_MEMORY_ALLOCATION_ERROR;
380
0
    }
381
70.2k
    return result;
382
70.2k
}
383
384
}
385
386
// -------------------------------------
387
// Predicates
388
389
/*
390
  The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
391
392
  `isContentChar()`   : `content-char`
393
  `isTextChar()`      : `text-char`
394
  `isAlpha()`         : `ALPHA`
395
  `isDigit()`         : `DIGIT`
396
  `isNameStart()`     : `name-start`
397
  `isNameChar()`      : `name-char`
398
  `isUnquotedStart()` : `unquoted-start`
399
  `isQuotedChar()`    : `quoted-char`
400
  `isWhitespace()`    : `s`
401
*/
402
403
0
bool Parser::isContentChar(UChar32 c) const {
404
0
    return contentChars->contains(c);
405
0
}
406
407
// See `bidi` in the MF2 grammar
408
26.6M
bool Parser::isBidiControl(UChar32 c) const {
409
26.6M
    return bidiControlChars->contains(c);
410
26.6M
}
411
412
// See `ws` in the MessageFormat 2 grammar
413
26.7M
bool Parser::isWhitespace(UChar32 c) const {
414
26.7M
    return whitespaceChars->contains(c);
415
26.7M
}
416
417
9.49M
bool Parser::isTextChar(UChar32 c) const {
418
9.49M
    return textChars->contains(c);
419
9.49M
}
420
421
0
bool Parser::isAlpha(UChar32 c) const {
422
0
    return alphaChars->contains(c);
423
0
}
424
425
86
bool Parser::isDigit(UChar32 c) const {
426
86
    return digitChars->contains(c);
427
86
}
428
429
991k
bool Parser::isNameStart(UChar32 c) const {
430
991k
    return nameStartChars->contains(c);
431
991k
}
432
433
33.0M
bool Parser::isNameChar(UChar32 c) const {
434
33.0M
    return nameChars->contains(c);
435
33.0M
}
436
437
8.61k
bool Parser::isUnquotedStart(UChar32 c) const {
438
8.61k
    return isNameChar(c);
439
8.61k
}
440
441
53.4k
bool Parser::isQuotedChar(UChar32 c) const {
442
53.4k
    return quotedChars->contains(c);
443
53.4k
}
444
445
9.54M
bool Parser::isEscapableChar(UChar32 c) const {
446
9.54M
    return escapableChars->contains(c);
447
9.54M
}
448
449
// Returns true iff `c` can begin a `function` nonterminal
450
198k
static bool isFunctionStart(UChar32 c) {
451
198k
    switch (c) {
452
188k
    case COLON: {
453
188k
        return true;
454
0
    }
455
10.6k
    default: {
456
10.6k
        return false;
457
0
    }
458
198k
    }
459
198k
}
460
461
// Returns true iff `c` can begin an `annotation` nonterminal
462
73.3k
static bool isAnnotationStart(UChar32 c) {
463
73.3k
    return isFunctionStart(c);
464
73.3k
}
465
466
// Returns true iff `c` can begin a `literal` nonterminal
467
168
bool Parser::isLiteralStart(UChar32 c) const {
468
168
    return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c));
469
168
}
470
471
// Returns true iff `c` can begin a `key` nonterminal
472
173
bool Parser::isKeyStart(UChar32 c) const {
473
173
    return (c == ASTERISK || isLiteralStart(c));
474
173
}
475
476
0
bool Parser::isDeclarationStart() {
477
0
    return (peek() == ID_LOCAL[0]
478
0
            && inBounds(1)
479
0
            && peek(1) == ID_LOCAL[1])
480
0
        || (peek() == ID_INPUT[0]
481
0
            && inBounds(1)
482
0
            && peek(1) == ID_INPUT[1]);
483
0
}
484
485
// -------------------------------------
486
// Parsing functions
487
488
489
/*
490
  TODO: Since handling the whitespace ambiguities needs to be repeated
491
  in several different places and is hard to factor out,
492
  it probably would be better to replace the parser with a lexer + parser
493
  to separate tokenizing from parsing, which would simplify the code significantly.
494
  This has the disadvantage that there is no token grammar for MessageFormat,
495
  so one would have to be invented that isn't a component of the spec.
496
 */
497
498
/*
499
    This is a recursive-descent scannerless parser that,
500
    with a few exceptions, uses 1 character of lookahead.
501
502
    This may not be an exhaustive list, as the additions of attributes and reserved
503
    statements introduced several new ambiguities.
504
505
All but three of the exceptions involve ambiguities about the meaning of whitespace.
506
One ambiguity not involving whitespace is:
507
identifier -> namespace ":" name
508
vs.
509
identifier -> name
510
511
`namespace` and `name` can't be distinguished without arbitrary lookahead.
512
(For how this is handled, see parseIdentifier())
513
514
The second ambiguity not involving whitespace is:
515
complex-message -> *(declaration[s]) complex-body
516
                -> declaration *(declaration[s]) complex-body
517
                -> declaration complex-body
518
                -> reserved-statement complex-body
519
                -> .foo {$x} .match // ...
520
When processing the '.', arbitrary lookahead is required to distinguish the
521
arbitrary-length unsupported keyword from `.match`.
522
(For how this is handled, see parseDeclarations()).
523
524
The third ambiguity not involving whitespace is:
525
complex-message -> *(declaration [s]) complex-body
526
                -> reserved-statement *(declaration [s]) complex-body
527
                -> reserved-statement complex-body
528
                -> reserved-statement quotedPattern
529
                -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
530
                -> reserved-keyword expression quoted-pattern
531
 Example: .foo {1} {{1}}
532
533
 Without lookahead, the opening '{' of the quoted pattern can't be distinguished
534
 from the opening '{' of another expression in the unsupported statement.
535
 (Though this only requires 1 character of lookahead.)
536
537
 Otherwise:
538
539
There are at least seven ambiguities in the grammar that can't be resolved with finite
540
lookahead (since whitespace sequences can be arbitrarily long). They are resolved
541
with a form of backtracking (early exit). No state needs to be saved/restored
542
since whitespace doesn't affect the shape of the resulting parse tree, so it's
543
not true backtracking.
544
545
In addition, the grammar has been refactored
546
in a semantics-preserving way in some cases to make the code easier to structure.
547
548
First: variant = when 1*(s key) [s] pattern
549
   Example: when k     {a}
550
   When reading the first space after 'k', it's ambiguous whether it's the
551
   required space before another key, or the optional space before `pattern`.
552
 (See comments in parseNonEmptyKeys())
553
554
Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
555
        annotation = (function *(s option)) / reserved
556
   Example: {:f    }
557
   When reading the first space after 'f', it's ambiguous whether it's the
558
   required space before an option, or the optional trailing space after an options list
559
   (in this case, the options list is empty).
560
 (See comments in parseOptions() -- handling this case also meant it was easier to base
561
  the code on a slightly refactored grammar, which should be semantically equivalent.)
562
563
Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
564
        annotation = (function *(s option)) / reserved
565
   Example: {@a }
566
   Similar to the previous case; see comments in parseReserved()
567
568
Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
569
   Example: {|foo|   }
570
   When reading the first space after the '|', it's ambiguous whether it's the required
571
   space before an annotation, or the optional trailing space before the '}'.
572
  (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
573
  the same grammar refactoring as the second exception.)
574
575
    Most functions match a non-terminal in the grammar, except as explained
576
    in comments.
577
578
Fifth: matcher = match-statement 1*([s] variant)
579
               -> match 1 *([s] selector) 1*([s] variant)
580
    Example: match {42} * {{_}}
581
 When reading the space after the first '}', it's unclear whether
582
 it's the optional space before another selector, or the optional space
583
 before a variant.
584
585
Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
586
       -> "{" [s] function *(s attribute) [s] "}"
587
       -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
588
       -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
589
590
     Example: {:func @foo}
591
(Note: the same ambiguity is present with variable-expression and literal-expression)
592
593
Seventh:
594
595
596
When parsing the space, it's unclear whether it's the optional space before an
597
option, or the optional space before an attribute.
598
599
 Unless otherwise noted in a comment, all helper functions that take
600
    a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
601
    have the precondition:
602
      `index` < `len()`
603
    and the postcondition:
604
      `U_FAILURE(errorCode)` || `index < `len()`
605
*/
606
607
/*
608
  No pre, no post.
609
  A message may end with whitespace, so `index` may equal `len()` on exit.
610
*/
611
8.68M
void Parser::parseRequiredWS(UErrorCode& errorCode) {
612
8.68M
    bool sawWhitespace = false;
613
614
    // The loop exits either when we consume all the input,
615
    // or when we see a non-whitespace character.
616
8.83M
    while (true) {
617
        // Check if all input has been consumed
618
8.83M
        if (!inBounds()) {
619
            // If whitespace isn't required -- or if we saw it already --
620
            // then the caller is responsible for checking this case and
621
            // setting an error if necessary.
622
220
            if (sawWhitespace) {
623
                // Not an error.
624
86
                return;
625
86
            }
626
            // Otherwise, whitespace is required; the end of the input has
627
            // been reached without whitespace. This is an error.
628
134
            ERROR(errorCode);
629
134
            return;
630
220
        }
631
632
        // Input remains; process the next character if it's whitespace,
633
        // exit the loop otherwise
634
8.83M
        if (isWhitespace(peek())) {
635
147k
            sawWhitespace = true;
636
            // Increment line number in parse error if we consume a newline
637
147k
            maybeAdvanceLine();
638
147k
            next();
639
8.68M
        } else {
640
8.68M
            break;
641
8.68M
        }
642
8.83M
    }
643
644
8.68M
    if (!sawWhitespace) {
645
8.54M
        ERROR(errorCode);
646
8.54M
    }
647
8.68M
}
648
649
8.84M
void Parser::parseOptionalBidi() {
650
8.85M
    while (true) {
651
8.85M
        if (!inBounds()) {
652
605
            return;
653
605
        }
654
8.85M
        if (isBidiControl(peek())) {
655
2.25k
            next();
656
8.84M
        } else {
657
8.84M
            break;
658
8.84M
        }
659
8.85M
    }
660
8.84M
}
661
662
/*
663
  No pre, no post, because a message may end with whitespace
664
  Matches `s` in the MF2 grammar
665
*/
666
8.68M
void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
667
8.68M
    parseOptionalBidi();
668
8.68M
    parseRequiredWS(errorCode);
669
8.68M
    parseOptionalWhitespace();
670
8.68M
    normalizedInput += SPACE;
671
8.68M
}
672
673
/*
674
  No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
675
*/
676
9.09M
void Parser::parseOptionalWhitespace() {
677
9.13M
    while (true) {
678
9.13M
        if (!inBounds()) {
679
2.83k
            return;
680
2.83k
        }
681
9.12M
        auto cp = peek();
682
9.12M
        if (isWhitespace(cp) || isBidiControl(cp)) {
683
37.9k
            maybeAdvanceLine();
684
37.9k
            next();
685
9.09M
        } else {
686
9.09M
            break;
687
9.09M
        }
688
9.12M
    }
689
9.09M
}
690
691
// Consumes a single character, signaling an error if `peek()` != `c`
692
// No postcondition -- a message can end with a '}' token
693
1.07M
void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
694
1.07M
    CHECK_BOUNDS(errorCode);
695
696
1.06M
    if (peek() == c) {
697
862k
        next();
698
862k
        normalizedInput += c;
699
862k
        return;
700
862k
    }
701
    // Next character didn't match -- error out
702
206k
    ERROR(errorCode);
703
206k
}
704
705
/*
706
   Consumes a fixed-length token, signaling an error if the token isn't a prefix of
707
   the string beginning at `peek()`
708
   No postcondition -- a message can end with a '}' token
709
*/
710
65.0k
void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) {
711
65.0k
    U_ASSERT(inBounds());
712
713
65.0k
    int32_t tokenPos = 0;
714
194k
    while (tokenPos < static_cast<int32_t>(token.length())) {
715
194k
        if (peek() != token[tokenPos]) {
716
65.0k
            ERROR(errorCode);
717
65.0k
            return;
718
65.0k
        }
719
129k
        normalizedInput += token[tokenPos];
720
129k
        next();
721
129k
        tokenPos++;
722
129k
    }
723
65.0k
}
724
725
/*
726
   Consumes optional whitespace, possibly advancing `index` to `index'`,
727
   then consumes a fixed-length token (signaling an error if the token isn't a prefix of
728
   the string beginning at `source[index']`),
729
   then consumes optional whitespace again
730
*/
731
0
void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
732
    // No need for error check or bounds check before parseOptionalWhitespace
733
0
    parseOptionalWhitespace();
734
    // Establish precondition
735
0
    CHECK_BOUNDS(errorCode);
736
0
    parseToken(token, errorCode);
737
0
    parseOptionalWhitespace();
738
    // Guarantee postcondition
739
0
    CHECK_BOUNDS(errorCode);
740
0
}
741
742
/*
743
   Consumes optional whitespace, possibly advancing `index` to `index'`,
744
   then consumes a single character (signaling an error if it doesn't match
745
   `source[index']`),
746
   then consumes optional whitespace again
747
*/
748
52.0k
void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
749
    // No need for error check or bounds check before parseOptionalWhitespace()
750
52.0k
    parseOptionalWhitespace();
751
    // Establish precondition
752
52.0k
    CHECK_BOUNDS(errorCode);
753
51.8k
    parseToken(c, errorCode);
754
51.8k
    parseOptionalWhitespace();
755
    // Guarantee postcondition
756
51.8k
    CHECK_BOUNDS(errorCode);
757
51.7k
}
758
759
/*
760
  Consumes a possibly-empty sequence of name-chars. Appends to `str`
761
  and returns `str`.
762
*/
763
258k
UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) {
764
258k
    if (U_FAILURE(errorCode)) {
765
0
        return {};
766
0
    }
767
768
25.1M
    while (isNameChar(peek())) {
769
24.9M
        UChar32 c = peek();
770
24.9M
        str += c;
771
24.9M
        normalizedInput += c;
772
24.9M
        next();
773
24.9M
        if (!inBounds()) {
774
792
            ERROR(errorCode);
775
792
            break;
776
792
        }
777
24.9M
    }
778
779
258k
    return str;
780
258k
}
781
782
/*
783
  Consumes a non-empty sequence of `name-char`s, the first of which is
784
  also a `name-start`.
785
  that begins with a character `start` such that `isNameStart(start)`.
786
787
  Returns this sequence.
788
789
  (Matches the `name` nonterminal in the grammar.)
790
*/
791
937k
UnicodeString Parser::parseName(UErrorCode& errorCode) {
792
937k
    UnicodeString name;
793
794
937k
    U_ASSERT(inBounds());
795
796
937k
    if (!(isNameStart(peek()) || isBidiControl(peek()))) {
797
856k
        ERROR(errorCode);
798
856k
        return name;
799
856k
    }
800
801
    // name       = [bidi] name-start *name-char [bidi]
802
803
    // [bidi]
804
81.8k
    parseOptionalBidi();
805
806
    // name-start *name-char
807
81.8k
    parseNameChars(name, errorCode);
808
809
    // [bidi]
810
81.8k
    parseOptionalBidi();
811
812
81.8k
    return name;
813
937k
}
814
815
/*
816
  Consumes a '$' followed by a `name`, returning a VariableName
817
  with `name` as its name
818
819
  (Matches the `variable` nonterminal in the grammar.)
820
*/
821
739k
VariableName Parser::parseVariableName(UErrorCode& errorCode) {
822
739k
    VariableName result;
823
824
739k
    U_ASSERT(inBounds());
825
826
739k
    parseToken(DOLLAR, errorCode);
827
739k
    if (!inBounds()) {
828
58
        ERROR(errorCode);
829
58
        return result;
830
58
    }
831
739k
    return VariableName(parseName(errorCode));
832
739k
}
833
834
/*
835
  Corresponds to the `identifier` nonterminal in the grammar
836
*/
837
124k
UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
838
124k
    U_ASSERT(inBounds());
839
840
124k
    UnicodeString result;
841
    // The following is a hack to get around ambiguity in the grammar:
842
    // identifier -> namespace ":" name
843
    // vs.
844
    // identifier -> name
845
    // can't be distinguished without arbitrary lookahead.
846
    // Instead, we treat the production as:
847
    // identifier -> namespace *(":"name)
848
    // and then check for multiple colons.
849
850
    // Parse namespace
851
124k
    result += parseName(errorCode);
852
124k
    int32_t firstColon = -1;
853
198k
    while (inBounds() && peek() == COLON) {
854
        // Parse ':' separator
855
74.0k
        if (firstColon == -1) {
856
49.2k
            firstColon = index;
857
49.2k
        }
858
74.0k
        parseToken(COLON, errorCode);
859
74.0k
        result += COLON;
860
        // Check for message ending with something like "foo:"
861
74.0k
        if (!inBounds()) {
862
51
            ERROR(errorCode);
863
74.0k
        } else {
864
            // Parse name part
865
74.0k
            result += parseName(errorCode);
866
74.0k
        }
867
74.0k
    }
868
869
    // If there's at least one ':', scan from the first ':'
870
    // to the end of the name to check for multiple ':'s
871
124k
    if (firstColon != -1) {
872
8.05M
        for (int32_t i = firstColon + 1; i < result.length(); i++) {
873
8.00M
            if (result[i] == COLON) {
874
140
                ERROR_AT(errorCode, i);
875
140
                return {};
876
140
            }
877
8.00M
        }
878
49.2k
    }
879
880
124k
    return result;
881
124k
}
882
883
/*
884
  Consumes a reference to a function, matching the ": identifier"
885
  in the `function` nonterminal in the grammar.
886
887
  Returns the function name.
888
*/
889
62.7k
FunctionName Parser::parseFunction(UErrorCode& errorCode) {
890
62.7k
    U_ASSERT(inBounds());
891
62.7k
    if (!isFunctionStart(peek())) {
892
0
        ERROR(errorCode);
893
0
        return FunctionName();
894
0
    }
895
896
62.7k
    normalizedInput += peek();
897
62.7k
    next(); // Consume the function start character
898
62.7k
    if (!inBounds()) {
899
33
        ERROR(errorCode);
900
33
        return FunctionName();
901
33
    }
902
62.7k
    return parseIdentifier(errorCode);
903
62.7k
}
904
905
906
/*
907
  Precondition: peek() == BACKSLASH
908
909
  Consume an escaped character.
910
  Corresponds to `escaped-char` in the grammar.
911
912
  No postcondition (a message can end with an escaped char)
913
*/
914
4.59k
UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) {
915
4.59k
    U_ASSERT(inBounds());
916
4.59k
    U_ASSERT(peek() == BACKSLASH);
917
4.59k
    normalizedInput += BACKSLASH;
918
4.59k
    next(); // Skip the initial backslash
919
4.59k
    UnicodeString str;
920
4.59k
    if (inBounds()) {
921
        // Expect a '{', '|' or '}'
922
4.57k
        switch (peek()) {
923
599
        case LEFT_CURLY_BRACE:
924
1.11k
        case RIGHT_CURLY_BRACE:
925
3.40k
        case PIPE:
926
3.99k
        case BACKSLASH: {
927
            /* Append to the output string */
928
3.99k
            str += peek();
929
            /* Update normalizedInput */
930
3.99k
            normalizedInput += peek();
931
            /* Consume the character */
932
3.99k
            next();
933
3.99k
            return str;
934
3.40k
        }
935
574
        default: {
936
            // No other characters are allowed here
937
574
            break;
938
3.40k
        }
939
4.57k
        }
940
4.57k
    }
941
   // If control reaches here, there was an error
942
597
   ERROR(errorCode);
943
597
   return str;
944
4.59k
}
945
946
947
/*
948
  Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
949
*/
950
3.01k
Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
951
3.01k
    bool error = false;
952
953
3.01k
    UnicodeString contents;
954
3.01k
    if (U_SUCCESS(errorCode)) {
955
        // Parse the opening '|'
956
3.01k
        parseToken(PIPE, errorCode);
957
3.01k
        if (!inBounds()) {
958
17
            ERROR(errorCode);
959
17
            error = true;
960
3.00k
        } else {
961
            // Parse the contents
962
3.00k
            bool done = false;
963
59.6k
            while (!done) {
964
56.7k
                if (peek() == BACKSLASH) {
965
3.29k
                    contents += parseEscapeSequence(errorCode);
966
53.4k
                } else if (isQuotedChar(peek())) {
967
50.5k
                    contents += peek();
968
                    // Handle cases like:
969
                    // |}{| -- we want to escape everywhere that
970
                    // can be escaped, to make round-trip checking
971
                    // easier -- so this case normalizes to
972
                    // |\}\{|
973
50.5k
                    if (isEscapableChar(peek())) {
974
3.45k
                        normalizedInput += BACKSLASH;
975
3.45k
                    }
976
50.5k
                    normalizedInput += peek();
977
50.5k
                    next(); // Consume this character
978
50.5k
                    maybeAdvanceLine();
979
50.5k
                } else {
980
                    // Assume the sequence of literal characters ends here
981
2.91k
                    done = true;
982
2.91k
                }
983
56.7k
                if (!inBounds()) {
984
87
                    ERROR(errorCode);
985
87
                    error = true;
986
87
                    break;
987
87
                }
988
56.7k
            }
989
3.00k
        }
990
3.01k
    }
991
992
3.01k
    if (error) {
993
104
        return {};
994
104
    }
995
996
    // Parse the closing '|'
997
2.91k
    parseToken(PIPE, errorCode);
998
999
2.91k
    return Literal(true, contents);
1000
3.01k
}
1001
1002
// Parse (1*DIGIT)
1003
0
UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
1004
0
    if (U_FAILURE(errorCode)) {
1005
0
        return {};
1006
0
    }
1007
1008
0
    U_ASSERT(isDigit(peek()));
1009
1010
0
    UnicodeString contents;
1011
0
    do {
1012
0
        contents += peek();
1013
0
        normalizedInput += peek();
1014
0
        next();
1015
0
        if (!inBounds()) {
1016
0
            ERROR(errorCode);
1017
0
            return {};
1018
0
        }
1019
0
    } while (isDigit(peek()));
1020
1021
0
    return contents;
1022
0
}
1023
/*
1024
  Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
1025
*/
1026
7.83M
Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
1027
7.83M
    if (U_FAILURE(errorCode)) {
1028
0
        return {};
1029
0
    }
1030
    // unquoted-literal = 1*name-char
1031
1032
7.83M
    if (!(isNameChar(peek()))) {
1033
7.65M
        ERROR(errorCode);
1034
7.65M
        return {};
1035
7.65M
    }
1036
1037
177k
    UnicodeString contents;
1038
177k
    parseNameChars(contents, errorCode);
1039
177k
    return Literal(false, contents);
1040
7.83M
}
1041
1042
/*
1043
  Consume and return a literal, matching the `literal` nonterminal in the grammar.
1044
*/
1045
7.84M
Literal Parser::parseLiteral(UErrorCode& errorCode) {
1046
7.84M
    Literal result;
1047
7.84M
    if (!inBounds()) {
1048
250
        ERROR(errorCode);
1049
7.84M
    } else {
1050
7.84M
        if (peek() == PIPE) {
1051
3.01k
            result = parseQuotedLiteral(errorCode);
1052
7.83M
        } else {
1053
7.83M
            result = parseUnquotedLiteral(errorCode);
1054
7.83M
        }
1055
        // Guarantee postcondition
1056
7.84M
        if (!inBounds()) {
1057
477
            ERROR(errorCode);
1058
477
        }
1059
7.84M
    }
1060
1061
7.84M
    return result;
1062
7.84M
}
1063
1064
/*
1065
  Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
1066
1067
  Adds the option to `options`
1068
*/
1069
template<class T>
1070
42.9k
void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1071
42.9k
    U_ASSERT(inBounds());
1072
1073
42.9k
    U_ASSERT(peek() == AT);
1074
    // Consume the '@'
1075
42.9k
    parseToken(AT, errorCode);
1076
1077
    // Parse LHS
1078
42.9k
    UnicodeString lhs = parseIdentifier(errorCode);
1079
1080
    // Prepare to "backtrack" to resolve ambiguity
1081
    // about whether whitespace precedes another
1082
    // attribute, or the '=' sign
1083
42.9k
    int32_t savedIndex = index;
1084
42.9k
    parseOptionalWhitespace();
1085
1086
42.9k
    Operand rand;
1087
42.9k
    if (peek() == EQUALS) {
1088
        // Parse '='
1089
6.50k
        parseTokenWithWhitespace(EQUALS, errorCode);
1090
1091
6.50k
        UnicodeString rhsStr;
1092
        // Parse RHS, which must be a literal
1093
        // attribute = "@" identifier [o "=" o literal]
1094
6.50k
        rand = Operand(parseLiteral(errorCode));
1095
36.4k
    } else {
1096
        // attribute -> "@" identifier [[s] "=" [s]]
1097
        // Use null operand, which `rand` is already set to
1098
        // "Backtrack" by restoring the whitespace (if there was any)
1099
36.4k
        index = savedIndex;
1100
36.4k
    }
1101
1102
42.9k
    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
1103
42.9k
}
void icu_79::message2::Parser::parseAttribute<icu_79::message2::data_model::Expression::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Expression::Builder>&, UErrorCode&)
Line
Count
Source
1070
35.3k
void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1071
35.3k
    U_ASSERT(inBounds());
1072
1073
35.3k
    U_ASSERT(peek() == AT);
1074
    // Consume the '@'
1075
35.3k
    parseToken(AT, errorCode);
1076
1077
    // Parse LHS
1078
35.3k
    UnicodeString lhs = parseIdentifier(errorCode);
1079
1080
    // Prepare to "backtrack" to resolve ambiguity
1081
    // about whether whitespace precedes another
1082
    // attribute, or the '=' sign
1083
35.3k
    int32_t savedIndex = index;
1084
35.3k
    parseOptionalWhitespace();
1085
1086
35.3k
    Operand rand;
1087
35.3k
    if (peek() == EQUALS) {
1088
        // Parse '='
1089
1.20k
        parseTokenWithWhitespace(EQUALS, errorCode);
1090
1091
1.20k
        UnicodeString rhsStr;
1092
        // Parse RHS, which must be a literal
1093
        // attribute = "@" identifier [o "=" o literal]
1094
1.20k
        rand = Operand(parseLiteral(errorCode));
1095
34.1k
    } else {
1096
        // attribute -> "@" identifier [[s] "=" [s]]
1097
        // Use null operand, which `rand` is already set to
1098
        // "Backtrack" by restoring the whitespace (if there was any)
1099
34.1k
        index = savedIndex;
1100
34.1k
    }
1101
1102
35.3k
    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
1103
35.3k
}
void icu_79::message2::Parser::parseAttribute<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1070
7.54k
void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1071
7.54k
    U_ASSERT(inBounds());
1072
1073
7.54k
    U_ASSERT(peek() == AT);
1074
    // Consume the '@'
1075
7.54k
    parseToken(AT, errorCode);
1076
1077
    // Parse LHS
1078
7.54k
    UnicodeString lhs = parseIdentifier(errorCode);
1079
1080
    // Prepare to "backtrack" to resolve ambiguity
1081
    // about whether whitespace precedes another
1082
    // attribute, or the '=' sign
1083
7.54k
    int32_t savedIndex = index;
1084
7.54k
    parseOptionalWhitespace();
1085
1086
7.54k
    Operand rand;
1087
7.54k
    if (peek() == EQUALS) {
1088
        // Parse '='
1089
5.30k
        parseTokenWithWhitespace(EQUALS, errorCode);
1090
1091
5.30k
        UnicodeString rhsStr;
1092
        // Parse RHS, which must be a literal
1093
        // attribute = "@" identifier [o "=" o literal]
1094
5.30k
        rand = Operand(parseLiteral(errorCode));
1095
5.30k
    } else {
1096
        // attribute -> "@" identifier [[s] "=" [s]]
1097
        // Use null operand, which `rand` is already set to
1098
        // "Backtrack" by restoring the whitespace (if there was any)
1099
2.24k
        index = savedIndex;
1100
2.24k
    }
1101
1102
7.54k
    attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode);
1103
7.54k
}
1104
1105
/*
1106
  Consume a name-value pair, matching the `option` nonterminal in the grammar.
1107
1108
  Adds the option to `optionList`
1109
*/
1110
template<class T>
1111
6.87k
void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1112
6.87k
    U_ASSERT(inBounds());
1113
1114
    // Parse LHS
1115
6.87k
    UnicodeString lhs = parseIdentifier(errorCode);
1116
1117
    // Parse '='
1118
6.87k
    parseTokenWithWhitespace(EQUALS, errorCode);
1119
1120
6.87k
    UnicodeString rhsStr;
1121
6.87k
    Operand rand;
1122
    // Parse RHS, which is either a literal or variable
1123
6.87k
    switch (peek()) {
1124
2.61k
    case DOLLAR: {
1125
2.61k
        rand = Operand(parseVariableName(errorCode));
1126
2.61k
        break;
1127
0
    }
1128
4.25k
    default: {
1129
        // Must be a literal
1130
4.25k
        rand = Operand(parseLiteral(errorCode));
1131
4.25k
        break;
1132
0
    }
1133
6.87k
    }
1134
6.87k
    U_ASSERT(!rand.isNull());
1135
1136
    // Finally, add the key=value mapping
1137
    // Use a local error code, check for duplicate option error and
1138
    // record it as with other errors
1139
6.87k
    UErrorCode status = U_ZERO_ERROR;
1140
6.87k
    addOption.addOption(lhs, std::move(rand), status);
1141
6.87k
    if (U_FAILURE(status)) {
1142
3.31k
      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
1143
3.31k
      errors.setDuplicateOptionName(errorCode);
1144
3.31k
    }
1145
6.87k
}
void icu_79::message2::Parser::parseOption<icu_79::message2::data_model::Operator::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Operator::Builder>&, UErrorCode&)
Line
Count
Source
1111
3.11k
void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1112
3.11k
    U_ASSERT(inBounds());
1113
1114
    // Parse LHS
1115
3.11k
    UnicodeString lhs = parseIdentifier(errorCode);
1116
1117
    // Parse '='
1118
3.11k
    parseTokenWithWhitespace(EQUALS, errorCode);
1119
1120
3.11k
    UnicodeString rhsStr;
1121
3.11k
    Operand rand;
1122
    // Parse RHS, which is either a literal or variable
1123
3.11k
    switch (peek()) {
1124
1.83k
    case DOLLAR: {
1125
1.83k
        rand = Operand(parseVariableName(errorCode));
1126
1.83k
        break;
1127
0
    }
1128
1.27k
    default: {
1129
        // Must be a literal
1130
1.27k
        rand = Operand(parseLiteral(errorCode));
1131
1.27k
        break;
1132
0
    }
1133
3.11k
    }
1134
3.11k
    U_ASSERT(!rand.isNull());
1135
1136
    // Finally, add the key=value mapping
1137
    // Use a local error code, check for duplicate option error and
1138
    // record it as with other errors
1139
3.11k
    UErrorCode status = U_ZERO_ERROR;
1140
3.11k
    addOption.addOption(lhs, std::move(rand), status);
1141
3.11k
    if (U_FAILURE(status)) {
1142
2.19k
      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
1143
2.19k
      errors.setDuplicateOptionName(errorCode);
1144
2.19k
    }
1145
3.11k
}
void icu_79::message2::Parser::parseOption<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1111
3.75k
void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1112
3.75k
    U_ASSERT(inBounds());
1113
1114
    // Parse LHS
1115
3.75k
    UnicodeString lhs = parseIdentifier(errorCode);
1116
1117
    // Parse '='
1118
3.75k
    parseTokenWithWhitespace(EQUALS, errorCode);
1119
1120
3.75k
    UnicodeString rhsStr;
1121
3.75k
    Operand rand;
1122
    // Parse RHS, which is either a literal or variable
1123
3.75k
    switch (peek()) {
1124
777
    case DOLLAR: {
1125
777
        rand = Operand(parseVariableName(errorCode));
1126
777
        break;
1127
0
    }
1128
2.97k
    default: {
1129
        // Must be a literal
1130
2.97k
        rand = Operand(parseLiteral(errorCode));
1131
2.97k
        break;
1132
0
    }
1133
3.75k
    }
1134
3.75k
    U_ASSERT(!rand.isNull());
1135
1136
    // Finally, add the key=value mapping
1137
    // Use a local error code, check for duplicate option error and
1138
    // record it as with other errors
1139
3.75k
    UErrorCode status = U_ZERO_ERROR;
1140
3.75k
    addOption.addOption(lhs, std::move(rand), status);
1141
3.75k
    if (U_FAILURE(status)) {
1142
1.12k
      U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
1143
1.12k
      errors.setDuplicateOptionName(errorCode);
1144
1.12k
    }
1145
3.75k
}
1146
1147
/*
1148
  Note: there are multiple overloads of parseOptions() for parsing
1149
  options within markup, vs. within an expression, vs. parsing
1150
  attributes. This should be refactored. TODO
1151
 */
1152
1153
/*
1154
  Consume optional whitespace followed by a sequence of options
1155
  (possibly empty), separated by whitespace
1156
*/
1157
template <class T>
1158
70.6k
void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1159
    // Early exit if out of bounds -- no more work is possible
1160
70.6k
    CHECK_BOUNDS(errorCode);
1161
1162
/*
1163
Arbitrary lookahead is required to parse option lists. To see why, consider
1164
these rules from the grammar:
1165
1166
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1167
annotation = (function *(s option)) / reserved
1168
1169
And this example:
1170
{:foo  }
1171
1172
Derivation:
1173
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1174
           -> "{" [s] annotation [s] "}"
1175
           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1176
           -> "{" [s] function *(s option) [s] "}"
1177
1178
In this example, knowing whether to expect a '}' or the start of another option
1179
after the whitespace would require arbitrary lookahead -- in other words, which
1180
rule should we apply?
1181
    *(s option) -> s option *(s option)
1182
  or
1183
    *(s option) ->
1184
1185
The same would apply to the example {:foo k=v } (note the trailing space after "v").
1186
1187
This is addressed using a form of backtracking and (to make the backtracking easier
1188
to apply) a slight refactoring to the grammar.
1189
1190
This code is written as if the grammar is:
1191
  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1192
  annotation = (function *(s option) [s]) / (reserved [s])
1193
1194
Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1195
that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1196
1197
Note that when "backtracking" really just means early exit, since only whitespace
1198
is involved and there's no state to save.
1199
1200
There is a separate but similar ambiguity as to whether the space precedes
1201
an option or an attribute.
1202
*/
1203
1204
77.4k
    while(true) {
1205
        // If the next character is not whitespace, that means we've already
1206
        // parsed the entire options list (which may have been empty) and there's
1207
        // no trailing whitespace. In that case, exit.
1208
77.4k
        if (!isWhitespace(peek())) {
1209
23.9k
            break;
1210
23.9k
        }
1211
53.4k
        int32_t firstWhitespace = index;
1212
1213
        // In any case other than an empty options list, there must be at least
1214
        // one whitespace character.
1215
53.4k
        parseRequiredWhitespace(errorCode);
1216
        // Restore precondition
1217
53.4k
        CHECK_BOUNDS(errorCode);
1218
1219
        // If a name character follows, then at least one more option remains
1220
        // in the list.
1221
        // Otherwise, we've consumed all the options and any trailing whitespace,
1222
        // and can exit.
1223
        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1224
        // so we back out to [s].
1225
53.4k
        if (!isNameStart(peek())) {
1226
            // We've consumed all the options (meaning that either we consumed non-empty
1227
            // whitespace, or consumed at least one option.)
1228
            // Done.
1229
            // Remove the required whitespace from normalizedInput
1230
46.5k
            normalizedInput.truncate(normalizedInput.length() - 1);
1231
            // "Backtrack" so as to leave the optional whitespace there
1232
            // when parsing attributes
1233
46.5k
            index = firstWhitespace;
1234
46.5k
            break;
1235
46.5k
        }
1236
6.87k
        parseOption(addOption, errorCode);
1237
6.87k
    }
1238
70.5k
}
void icu_79::message2::Parser::parseOptions<icu_79::message2::data_model::Operator::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Operator::Builder>&, UErrorCode&)
Line
Count
Source
1158
62.7k
void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1159
    // Early exit if out of bounds -- no more work is possible
1160
62.7k
    CHECK_BOUNDS(errorCode);
1161
1162
/*
1163
Arbitrary lookahead is required to parse option lists. To see why, consider
1164
these rules from the grammar:
1165
1166
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1167
annotation = (function *(s option)) / reserved
1168
1169
And this example:
1170
{:foo  }
1171
1172
Derivation:
1173
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1174
           -> "{" [s] annotation [s] "}"
1175
           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1176
           -> "{" [s] function *(s option) [s] "}"
1177
1178
In this example, knowing whether to expect a '}' or the start of another option
1179
after the whitespace would require arbitrary lookahead -- in other words, which
1180
rule should we apply?
1181
    *(s option) -> s option *(s option)
1182
  or
1183
    *(s option) ->
1184
1185
The same would apply to the example {:foo k=v } (note the trailing space after "v").
1186
1187
This is addressed using a form of backtracking and (to make the backtracking easier
1188
to apply) a slight refactoring to the grammar.
1189
1190
This code is written as if the grammar is:
1191
  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1192
  annotation = (function *(s option) [s]) / (reserved [s])
1193
1194
Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1195
that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1196
1197
Note that when "backtracking" really just means early exit, since only whitespace
1198
is involved and there's no state to save.
1199
1200
There is a separate but similar ambiguity as to whether the space precedes
1201
an option or an attribute.
1202
*/
1203
1204
65.7k
    while(true) {
1205
        // If the next character is not whitespace, that means we've already
1206
        // parsed the entire options list (which may have been empty) and there's
1207
        // no trailing whitespace. In that case, exit.
1208
65.7k
        if (!isWhitespace(peek())) {
1209
23.4k
            break;
1210
23.4k
        }
1211
42.2k
        int32_t firstWhitespace = index;
1212
1213
        // In any case other than an empty options list, there must be at least
1214
        // one whitespace character.
1215
42.2k
        parseRequiredWhitespace(errorCode);
1216
        // Restore precondition
1217
42.2k
        CHECK_BOUNDS(errorCode);
1218
1219
        // If a name character follows, then at least one more option remains
1220
        // in the list.
1221
        // Otherwise, we've consumed all the options and any trailing whitespace,
1222
        // and can exit.
1223
        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1224
        // so we back out to [s].
1225
42.2k
        if (!isNameStart(peek())) {
1226
            // We've consumed all the options (meaning that either we consumed non-empty
1227
            // whitespace, or consumed at least one option.)
1228
            // Done.
1229
            // Remove the required whitespace from normalizedInput
1230
39.1k
            normalizedInput.truncate(normalizedInput.length() - 1);
1231
            // "Backtrack" so as to leave the optional whitespace there
1232
            // when parsing attributes
1233
39.1k
            index = firstWhitespace;
1234
39.1k
            break;
1235
39.1k
        }
1236
3.11k
        parseOption(addOption, errorCode);
1237
3.11k
    }
1238
62.5k
}
void icu_79::message2::Parser::parseOptions<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1158
7.95k
void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
1159
    // Early exit if out of bounds -- no more work is possible
1160
7.95k
    CHECK_BOUNDS(errorCode);
1161
1162
/*
1163
Arbitrary lookahead is required to parse option lists. To see why, consider
1164
these rules from the grammar:
1165
1166
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1167
annotation = (function *(s option)) / reserved
1168
1169
And this example:
1170
{:foo  }
1171
1172
Derivation:
1173
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1174
           -> "{" [s] annotation [s] "}"
1175
           -> "{" [s] ((function *(s option)) / reserved) [s] "}"
1176
           -> "{" [s] function *(s option) [s] "}"
1177
1178
In this example, knowing whether to expect a '}' or the start of another option
1179
after the whitespace would require arbitrary lookahead -- in other words, which
1180
rule should we apply?
1181
    *(s option) -> s option *(s option)
1182
  or
1183
    *(s option) ->
1184
1185
The same would apply to the example {:foo k=v } (note the trailing space after "v").
1186
1187
This is addressed using a form of backtracking and (to make the backtracking easier
1188
to apply) a slight refactoring to the grammar.
1189
1190
This code is written as if the grammar is:
1191
  expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
1192
  annotation = (function *(s option) [s]) / (reserved [s])
1193
1194
Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
1195
that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
1196
1197
Note that when "backtracking" really just means early exit, since only whitespace
1198
is involved and there's no state to save.
1199
1200
There is a separate but similar ambiguity as to whether the space precedes
1201
an option or an attribute.
1202
*/
1203
1204
11.7k
    while(true) {
1205
        // If the next character is not whitespace, that means we've already
1206
        // parsed the entire options list (which may have been empty) and there's
1207
        // no trailing whitespace. In that case, exit.
1208
11.7k
        if (!isWhitespace(peek())) {
1209
570
            break;
1210
570
        }
1211
11.1k
        int32_t firstWhitespace = index;
1212
1213
        // In any case other than an empty options list, there must be at least
1214
        // one whitespace character.
1215
11.1k
        parseRequiredWhitespace(errorCode);
1216
        // Restore precondition
1217
11.1k
        CHECK_BOUNDS(errorCode);
1218
1219
        // If a name character follows, then at least one more option remains
1220
        // in the list.
1221
        // Otherwise, we've consumed all the options and any trailing whitespace,
1222
        // and can exit.
1223
        // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1224
        // so we back out to [s].
1225
11.1k
        if (!isNameStart(peek())) {
1226
            // We've consumed all the options (meaning that either we consumed non-empty
1227
            // whitespace, or consumed at least one option.)
1228
            // Done.
1229
            // Remove the required whitespace from normalizedInput
1230
7.37k
            normalizedInput.truncate(normalizedInput.length() - 1);
1231
            // "Backtrack" so as to leave the optional whitespace there
1232
            // when parsing attributes
1233
7.37k
            index = firstWhitespace;
1234
7.37k
            break;
1235
7.37k
        }
1236
3.75k
        parseOption(addOption, errorCode);
1237
3.75k
    }
1238
7.95k
}
1239
1240
/*
1241
  Consume optional whitespace followed by a sequence of attributes
1242
  (possibly empty), separated by whitespace
1243
*/
1244
template<class T>
1245
79.3k
void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1246
1247
    // Early exit if out of bounds -- no more work is possible
1248
79.3k
    if (!inBounds()) {
1249
749
        ERROR(errorCode);
1250
749
        return;
1251
749
    }
1252
1253
/*
1254
Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1255
(See comment in parseOptions()).
1256
*/
1257
1258
121k
    while(true) {
1259
        // If the next character is not whitespace, that means we've already
1260
        // parsed the entire attributes list (which may have been empty) and there's
1261
        // no trailing whitespace. In that case, exit.
1262
121k
        if (!isWhitespace(peek())) {
1263
49.6k
            break;
1264
49.6k
        }
1265
1266
        // In any case other than an empty attributes list, there must be at least
1267
        // one whitespace character.
1268
71.9k
        parseRequiredWhitespace(errorCode);
1269
        // Restore precondition
1270
71.9k
        if (!inBounds()) {
1271
30
            ERROR(errorCode);
1272
30
            break;
1273
30
        }
1274
1275
        // If an '@' follows, then at least one more attribute remains
1276
        // in the list.
1277
        // Otherwise, we've consumed all the attributes and any trailing whitespace,
1278
        // and can exit.
1279
        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1280
        // so we back out to [s].
1281
71.8k
        if (peek() != AT) {
1282
            // We've consumed all the attributes (meaning that either we consumed non-empty
1283
            // whitespace, or consumed at least one attribute.)
1284
            // Done.
1285
            // Remove the whitespace from normalizedInput
1286
28.9k
            normalizedInput.truncate(normalizedInput.length() - 1);
1287
28.9k
            break;
1288
28.9k
        }
1289
42.9k
        parseAttribute(attrAdder, errorCode);
1290
42.9k
    }
1291
78.6k
}
void icu_79::message2::Parser::parseAttributes<icu_79::message2::data_model::Expression::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Expression::Builder>&, UErrorCode&)
Line
Count
Source
1245
71.7k
void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1246
1247
    // Early exit if out of bounds -- no more work is possible
1248
71.7k
    if (!inBounds()) {
1249
749
        ERROR(errorCode);
1250
749
        return;
1251
749
    }
1252
1253
/*
1254
Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1255
(See comment in parseOptions()).
1256
*/
1257
1258
106k
    while(true) {
1259
        // If the next character is not whitespace, that means we've already
1260
        // parsed the entire attributes list (which may have been empty) and there's
1261
        // no trailing whitespace. In that case, exit.
1262
106k
        if (!isWhitespace(peek())) {
1263
45.6k
            break;
1264
45.6k
        }
1265
1266
        // In any case other than an empty attributes list, there must be at least
1267
        // one whitespace character.
1268
60.7k
        parseRequiredWhitespace(errorCode);
1269
        // Restore precondition
1270
60.7k
        if (!inBounds()) {
1271
17
            ERROR(errorCode);
1272
17
            break;
1273
17
        }
1274
1275
        // If an '@' follows, then at least one more attribute remains
1276
        // in the list.
1277
        // Otherwise, we've consumed all the attributes and any trailing whitespace,
1278
        // and can exit.
1279
        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1280
        // so we back out to [s].
1281
60.7k
        if (peek() != AT) {
1282
            // We've consumed all the attributes (meaning that either we consumed non-empty
1283
            // whitespace, or consumed at least one attribute.)
1284
            // Done.
1285
            // Remove the whitespace from normalizedInput
1286
25.3k
            normalizedInput.truncate(normalizedInput.length() - 1);
1287
25.3k
            break;
1288
25.3k
        }
1289
35.3k
        parseAttribute(attrAdder, errorCode);
1290
35.3k
    }
1291
71.0k
}
void icu_79::message2::Parser::parseAttributes<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&)
Line
Count
Source
1245
7.60k
void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1246
1247
    // Early exit if out of bounds -- no more work is possible
1248
7.60k
    if (!inBounds()) {
1249
0
        ERROR(errorCode);
1250
0
        return;
1251
0
    }
1252
1253
/*
1254
Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1255
(See comment in parseOptions()).
1256
*/
1257
1258
15.1k
    while(true) {
1259
        // If the next character is not whitespace, that means we've already
1260
        // parsed the entire attributes list (which may have been empty) and there's
1261
        // no trailing whitespace. In that case, exit.
1262
15.1k
        if (!isWhitespace(peek())) {
1263
3.98k
            break;
1264
3.98k
        }
1265
1266
        // In any case other than an empty attributes list, there must be at least
1267
        // one whitespace character.
1268
11.1k
        parseRequiredWhitespace(errorCode);
1269
        // Restore precondition
1270
11.1k
        if (!inBounds()) {
1271
13
            ERROR(errorCode);
1272
13
            break;
1273
13
        }
1274
1275
        // If an '@' follows, then at least one more attribute remains
1276
        // in the list.
1277
        // Otherwise, we've consumed all the attributes and any trailing whitespace,
1278
        // and can exit.
1279
        // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1280
        // so we back out to [s].
1281
11.1k
        if (peek() != AT) {
1282
            // We've consumed all the attributes (meaning that either we consumed non-empty
1283
            // whitespace, or consumed at least one attribute.)
1284
            // Done.
1285
            // Remove the whitespace from normalizedInput
1286
3.60k
            normalizedInput.truncate(normalizedInput.length() - 1);
1287
3.60k
            break;
1288
3.60k
        }
1289
7.54k
        parseAttribute(attrAdder, errorCode);
1290
7.54k
    }
1291
7.60k
}
1292
1293
/*
1294
  Consume a function call, matching the `annotation`
1295
  nonterminal in the grammar
1296
1297
  Returns an `Operator` representing this (a reserved is a parse error)
1298
*/
1299
62.7k
Operator Parser::parseAnnotation(UErrorCode& status) {
1300
62.7k
    U_ASSERT(inBounds());
1301
62.7k
    Operator::Builder ratorBuilder(status);
1302
62.7k
    if (U_FAILURE(status)) {
1303
0
        return {};
1304
0
    }
1305
62.7k
    if (isFunctionStart(peek())) {
1306
        // Consume the function name
1307
62.7k
        FunctionName func = parseFunction(status);
1308
62.7k
        ratorBuilder.setFunctionName(std::move(func));
1309
1310
62.7k
        OptionAdder<Operator::Builder> addOptions(ratorBuilder);
1311
        // Consume the options (which may be empty)
1312
62.7k
        parseOptions(addOptions, status);
1313
62.7k
    } else {
1314
0
        ERROR(status);
1315
0
    }
1316
62.7k
    return ratorBuilder.build(status);
1317
62.7k
}
1318
1319
/*
1320
  Consume a literal or variable (depending on `isVariable`),
1321
  followed by either required whitespace followed by an annotation,
1322
  or optional whitespace.
1323
*/
1324
void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
1325
                                                  Expression::Builder& builder,
1326
8.34k
                                                  UErrorCode& status) {
1327
8.34k
    CHECK_ERROR(status);
1328
1329
8.34k
    U_ASSERT(inBounds());
1330
1331
8.34k
    Operand rand;
1332
8.34k
    if (isVariable) {
1333
378
        rand = Operand(parseVariableName(status));
1334
7.96k
    } else {
1335
7.96k
        rand = Operand(parseLiteral(status));
1336
7.96k
    }
1337
1338
8.34k
    builder.setOperand(std::move(rand));
1339
1340
/*
1341
Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
1342
To see why, consider this rule from the grammar:
1343
1344
expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1345
1346
And this example:
1347
1348
{|foo|   }
1349
1350
Derivation:
1351
expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1352
           -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
1353
           -> "{" [s] (literal [s annotation]) [s] "}"
1354
1355
When reading the ' ' after the second '|', it's ambiguous whether that's the required
1356
space before an annotation, or the optional space before the '}'.
1357
1358
To make this ambiguity easier to handle, this code is based on the same grammar
1359
refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
1360
the comment in `parseOptions()` for details.
1361
*/
1362
1363
8.34k
    if (isWhitespace(peek())) {
1364
2.39k
      int32_t firstWhitespace = index;
1365
1366
      // If the next character is whitespace, either [s annotation] or [s] applies
1367
      // (the character is either the required space before an annotation, or optional
1368
      // trailing space after the literal or variable). It's still ambiguous which
1369
      // one does apply.
1370
2.39k
      parseOptionalWhitespace();
1371
      // Restore precondition
1372
2.39k
      CHECK_BOUNDS(status);
1373
1374
      // This next check resolves the ambiguity between [s annotation] and [s]
1375
2.38k
      bool isSAnnotation = isAnnotationStart(peek());
1376
1377
2.38k
      if (isSAnnotation) {
1378
374
        normalizedInput += SPACE;
1379
374
      }
1380
1381
2.38k
      if (isSAnnotation) {
1382
        // The previously consumed whitespace precedes an annotation
1383
374
        builder.setOperator(parseAnnotation(status));
1384
2.01k
      } else {
1385
          // Either there's a right curly brace (will be consumed by the caller),
1386
          // or there's an error and the trailing whitespace should be
1387
          // handled by the caller. However, this is not an error
1388
          // here because we're just parsing `literal [s annotation]`.
1389
2.01k
          index = firstWhitespace;
1390
2.01k
      }
1391
5.95k
    } else {
1392
      // Either there was never whitespace, or
1393
      // the previously consumed whitespace is the optional trailing whitespace;
1394
      // either the next character is '}' or the error will be handled by parseExpression.
1395
      // Do nothing, since the operand was already set
1396
5.95k
    }
1397
1398
    // At the end of this code, the next character should either be '}',
1399
    // whitespace followed by a '}',
1400
    // or end-of-input
1401
8.34k
}
1402
1403
/*
1404
  Consume an expression, matching the `expression` nonterminal in the grammar
1405
*/
1406
1407
1.05k
static void exprFallback(Expression::Builder& exprBuilder) {
1408
    // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
1409
    // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1410
1.05k
    exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1411
1.05k
}
1412
1413
0
static Expression exprFallback(UErrorCode& status) {
1414
0
    Expression result;
1415
0
    if (U_SUCCESS(status)) {
1416
0
        Expression::Builder exprBuilder(status);
1417
0
        if (U_SUCCESS(status)) {
1418
            // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
1419
            // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1420
0
            exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1421
0
            UErrorCode status = U_ZERO_ERROR;
1422
0
            result = exprBuilder.build(status);
1423
            // An operand was set, so there can't be an error
1424
0
            U_ASSERT(U_SUCCESS(status));
1425
0
        }
1426
0
    }
1427
0
    return result;
1428
0
}
1429
1430
71.7k
Expression Parser::parseExpression(UErrorCode& status) {
1431
71.7k
    if (U_FAILURE(status)) {
1432
0
        return {};
1433
0
    }
1434
1435
    // Early return if out of input -- no more work is possible
1436
71.7k
    U_ASSERT(inBounds());
1437
1438
    // Parse opening brace
1439
71.7k
    parseToken(LEFT_CURLY_BRACE, status);
1440
    // Optional whitespace after opening brace
1441
71.7k
    parseOptionalWhitespace();
1442
1443
71.7k
    Expression::Builder exprBuilder(status);
1444
    // Restore precondition
1445
71.7k
    if (!inBounds()) {
1446
75
        exprFallback(exprBuilder);
1447
71.6k
    } else {
1448
        // literal '|', variable '$' or annotation
1449
71.6k
        switch (peek()) {
1450
329
        case PIPE: {
1451
            // Quoted literal
1452
329
            parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1453
329
            break;
1454
0
        }
1455
378
        case DOLLAR: {
1456
            // Variable
1457
378
            parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
1458
378
            break;
1459
0
        }
1460
70.9k
        default: {
1461
70.9k
            if (isAnnotationStart(peek())) {
1462
62.3k
                Operator rator = parseAnnotation(status);
1463
62.3k
                exprBuilder.setOperator(std::move(rator));
1464
62.3k
            } else if (isUnquotedStart(peek())) {
1465
                // Unquoted literal
1466
7.63k
                parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1467
7.63k
            } else {
1468
                // Not a literal, variable or annotation -- error out
1469
980
                ERROR(status);
1470
980
                exprFallback(exprBuilder);
1471
980
                break;
1472
980
            }
1473
69.9k
            break;
1474
70.9k
        }
1475
71.6k
        }
1476
71.6k
    }
1477
1478
    // Parse attributes
1479
71.7k
    AttributeAdder<Expression::Builder> attrAdder(exprBuilder);
1480
71.7k
    parseAttributes(attrAdder, status);
1481
1482
    // Parse optional space
1483
    // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
1484
71.7k
    parseOptionalWhitespace();
1485
1486
    // Either an operand or operator (or both) must have been set already,
1487
    // so there can't be an error
1488
71.7k
    UErrorCode localStatus = U_ZERO_ERROR;
1489
71.7k
    Expression result = exprBuilder.build(localStatus);
1490
71.7k
    U_ASSERT(U_SUCCESS(localStatus));
1491
1492
    // Check for end-of-input and missing '}'
1493
71.7k
    if (!inBounds()) {
1494
838
        ERROR(status);
1495
70.9k
    } else {
1496
        // Otherwise, it's safe to check for the '}'
1497
70.9k
        parseToken(RIGHT_CURLY_BRACE, status);
1498
70.9k
    }
1499
71.7k
    return result;
1500
71.7k
}
1501
1502
/*
1503
  Parse a .local declaration, matching the `local-declaration`
1504
  production in the grammar
1505
*/
1506
38.6k
void Parser::parseLocalDeclaration(UErrorCode& status) {
1507
    // End-of-input here would be an error; even empty
1508
    // declarations must be followed by a body
1509
38.6k
    CHECK_BOUNDS(status);
1510
1511
38.6k
    parseToken(ID_LOCAL, status);
1512
38.6k
    parseRequiredWhitespace(status);
1513
1514
    // Restore precondition
1515
38.6k
    CHECK_BOUNDS(status);
1516
38.6k
    VariableName lhs = parseVariableName(status);
1517
38.6k
    parseTokenWithWhitespace(EQUALS, status);
1518
    // Restore precondition before calling parseExpression()
1519
38.6k
    CHECK_BOUNDS(status);
1520
1521
38.6k
    Expression rhs = parseExpression(status);
1522
1523
    // Add binding from lhs to rhs, unless there was an error
1524
    // (This ensures that if there was a correct lhs but a
1525
    // parse error in rhs, the fallback for uses of the
1526
    // lhs will be its own name rather than the rhs)
1527
    /* This affects the behavior of this test case, which the spec
1528
       is ambiguous about:
1529
1530
       .local $bar {|foo|} {{{$bar}}}
1531
1532
       Should `$bar` still be bound to a value although
1533
       its declaration is syntactically incorrect (missing the '=')?
1534
       This code says no, but it needs to change if
1535
       https://github.com/unicode-org/message-format-wg/issues/703
1536
       is resolved differently.
1537
    */
1538
38.6k
    CHECK_ERROR(status);
1539
38.6k
    if (!errors.hasSyntaxError()) {
1540
0
        dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
1541
        // Check if status is U_DUPLICATE_DECLARATION_ERROR
1542
        // and add that as an internal error if so
1543
0
        if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1544
0
            status = U_ZERO_ERROR;
1545
0
            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1546
0
        }
1547
0
    }
1548
38.6k
}
1549
1550
/*
1551
  Parse an .input declaration, matching the `local-declaration`
1552
  production in the grammar
1553
*/
1554
25.6k
void Parser::parseInputDeclaration(UErrorCode& status) {
1555
    // End-of-input here would be an error; even empty
1556
    // declarations must be followed by a body
1557
25.6k
    CHECK_BOUNDS(status);
1558
1559
25.6k
    parseToken(ID_INPUT, status);
1560
25.6k
    parseOptionalWhitespace();
1561
1562
    // Restore precondition before calling parseExpression()
1563
25.6k
    CHECK_BOUNDS(status);
1564
1565
    // Save the index for error diagnostics
1566
25.6k
    int32_t exprIndex = index;
1567
25.6k
    Expression rhs = parseExpression(status);
1568
1569
    // Here we have to check that the rhs is a variable-expression
1570
25.6k
    if (!rhs.getOperand().isVariable()) {
1571
        // This case is a syntax error; report it at the beginning
1572
        // of the expression
1573
25.3k
        ERROR_AT(status, exprIndex);
1574
25.3k
        return;
1575
25.3k
    }
1576
1577
340
    VariableName lhs = rhs.getOperand().asVariable();
1578
1579
    // Add binding from lhs to rhs
1580
    // This just adds a new local variable that shadows the message
1581
    // argument referred to, which is harmless.
1582
    // When evaluating the RHS, the new local is not in scope
1583
    // and the message argument will be correctly referred to.
1584
340
    CHECK_ERROR(status);
1585
340
    if (!errors.hasSyntaxError()) {
1586
0
        dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
1587
        // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
1588
        // and add that as an internal error if so
1589
0
        if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1590
0
            status = U_ZERO_ERROR;
1591
0
            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1592
0
        }
1593
0
    }
1594
340
}
1595
1596
/*
1597
  Consume a possibly-empty sequence of declarations separated by whitespace;
1598
  each declaration matches the `declaration` nonterminal in the grammar
1599
1600
  Builds up an environment representing those declarations
1601
*/
1602
1.10k
void Parser::parseDeclarations(UErrorCode& status) {
1603
    // End-of-input here would be an error; even empty
1604
    // declarations must be followed by a body
1605
1.10k
    CHECK_BOUNDS(status);
1606
1607
65.2k
    while (peek() == PERIOD) {
1608
65.0k
        CHECK_BOUNDS_1(status);
1609
65.0k
        if (peek(1) == ID_LOCAL[1]) {
1610
38.6k
            parseLocalDeclaration(status);
1611
38.6k
        } else if (peek(1) == ID_INPUT[1]) {
1612
25.6k
            parseInputDeclaration(status);
1613
25.6k
        } else {
1614
            // Done parsing declarations
1615
722
            break;
1616
722
        }
1617
1618
        // Avoid looping infinitely
1619
64.3k
        CHECK_ERROR(status);
1620
1621
64.3k
        parseOptionalWhitespace();
1622
        // Restore precondition
1623
64.3k
        CHECK_BOUNDS(status);
1624
64.1k
    }
1625
1.10k
}
1626
1627
/*
1628
  Consume a text character
1629
  matching the `text-char` nonterminal in the grammar
1630
1631
  No postcondition (a message can end with a text-char)
1632
*/
1633
9.49M
UnicodeString Parser::parseTextChar(UErrorCode& status) {
1634
9.49M
    UnicodeString str;
1635
9.49M
    if (!inBounds() || !(isTextChar(peek()))) {
1636
        // Error -- text-char is expected here
1637
263
        ERROR(status);
1638
9.49M
    } else {
1639
        // See comment in parseQuotedLiteral()
1640
9.49M
        if (isEscapableChar(peek())) {
1641
27.2k
            normalizedInput += BACKSLASH;
1642
27.2k
        }
1643
9.49M
        normalizedInput += peek();
1644
9.49M
        str += peek();
1645
9.49M
        next();
1646
9.49M
        maybeAdvanceLine();
1647
9.49M
    }
1648
9.49M
    return str;
1649
9.49M
}
1650
1651
/*
1652
  Consume an `nmtoken`, `literal`, or the string "*", matching
1653
  the `key` nonterminal in the grammar
1654
*/
1655
7.82M
Key Parser::parseKey(UErrorCode& status) {
1656
7.82M
    U_ASSERT(inBounds());
1657
1658
7.82M
    Key k; // wildcard by default
1659
    // Literal | '*'
1660
7.82M
    switch (peek()) {
1661
1.09k
    case ASTERISK: {
1662
1.09k
        next();
1663
1.09k
        normalizedInput += ASTERISK;
1664
        // Guarantee postcondition
1665
1.09k
        if (!inBounds()) {
1666
6
            ERROR(status);
1667
6
            return k;
1668
6
        }
1669
1.09k
        break;
1670
1.09k
    }
1671
7.82M
    default: {
1672
        // Literal
1673
7.82M
        k = Key(parseLiteral(status));
1674
7.82M
        break;
1675
1.09k
    }
1676
7.82M
    }
1677
7.82M
    return k;
1678
7.82M
}
1679
1680
/*
1681
  Consume a non-empty sequence of `key`s separated by whitespace
1682
1683
  Takes ownership of `keys`
1684
*/
1685
568
SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
1686
568
    SelectorKeys result;
1687
1688
568
    if (U_FAILURE(status)) {
1689
0
        return result;
1690
0
    }
1691
1692
568
    U_ASSERT(inBounds());
1693
1694
/*
1695
Arbitrary lookahead is required to parse key lists. To see why, consider
1696
this rule from the grammar:
1697
1698
variant = key *(s key) [s] quoted-pattern
1699
1700
And this example:
1701
when k1 k2   {a}
1702
1703
Derivation:
1704
   variant -> key *(s key) [s] quoted-pattern
1705
           -> key s key *(s key) quoted-pattern
1706
1707
After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
1708
to know whether to expect the start of a pattern or the start of another key.
1709
In other words: is the second whitespace sequence the required space in *(s key),
1710
or the optional space in [s] quoted-pattern?
1711
1712
This is addressed using "backtracking" (similarly to `parseOptions()`).
1713
*/
1714
1715
568
    SelectorKeys::Builder keysBuilder(status);
1716
568
    if (U_FAILURE(status)) {
1717
0
        return result;
1718
0
    }
1719
1720
    // Since the first key is required, it's simplest to parse it separately.
1721
568
    keysBuilder.add(parseKey(status), status);
1722
1723
    // Restore precondition
1724
568
    if (!inBounds()) {
1725
47
        ERROR(status);
1726
47
        return result;
1727
47
    }
1728
1729
    // We've seen at least one whitespace-key pair, so now we can parse
1730
    // *(s key) [s]
1731
7.82M
    while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) {
1732
7.82M
        bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek());
1733
7.82M
        parseRequiredWhitespace(status);
1734
7.82M
        if (!wasWhitespace) {
1735
            // Avoid infinite loop when parsing something like:
1736
            // when * @{!...
1737
7.80M
            next();
1738
7.80M
        }
1739
1740
        // Restore precondition
1741
7.82M
        if (!inBounds()) {
1742
227
            ERROR(status);
1743
227
            return result;
1744
227
        }
1745
1746
        // At this point, it's ambiguous whether we are inside (s key) or [s].
1747
        // This check resolves that ambiguity.
1748
7.82M
        if (peek() == LEFT_CURLY_BRACE) {
1749
            // A pattern follows, so what we just parsed was the optional
1750
            // trailing whitespace. All the keys have been parsed.
1751
1752
            // Unpush the whitespace from `normalizedInput`
1753
129
            normalizedInput.truncate(normalizedInput.length() - 1);
1754
129
            break;
1755
129
        }
1756
7.82M
        keysBuilder.add(parseKey(status), status);
1757
7.82M
    }
1758
1759
294
    return keysBuilder.build(status);
1760
521
}
1761
1762
308
Pattern Parser::parseQuotedPattern(UErrorCode& status) {
1763
308
    U_ASSERT(inBounds());
1764
1765
308
    parseToken(LEFT_CURLY_BRACE, status);
1766
308
    parseToken(LEFT_CURLY_BRACE, status);
1767
308
    Pattern p = parseSimpleMessage(status);
1768
308
    parseToken(RIGHT_CURLY_BRACE, status);
1769
308
    parseToken(RIGHT_CURLY_BRACE, status);
1770
308
    return p;
1771
308
}
1772
1773
/*
1774
  Consume a `placeholder`, matching the nonterminal in the grammar
1775
  No postcondition (a markup can end a message)
1776
*/
1777
12.0k
Markup Parser::parseMarkup(UErrorCode& status) {
1778
12.0k
    U_ASSERT(inBounds(1));
1779
1780
12.0k
    U_ASSERT(peek() == LEFT_CURLY_BRACE);
1781
1782
12.0k
    Markup::Builder builder(status);
1783
12.0k
    if (U_FAILURE(status)) {
1784
0
        return {};
1785
0
    }
1786
1787
    // Consume the '{'
1788
12.0k
    next();
1789
12.0k
    normalizedInput += LEFT_CURLY_BRACE;
1790
12.0k
    parseOptionalWhitespace();
1791
12.0k
    bool closing = false;
1792
12.0k
    switch (peek()) {
1793
10.2k
    case NUMBER_SIGN: {
1794
        // Open or standalone; consume the '#'
1795
10.2k
        normalizedInput += peek();
1796
10.2k
        next();
1797
10.2k
        break;
1798
0
    }
1799
1.86k
    case SLASH: {
1800
        // Closing
1801
1.86k
        normalizedInput += peek();
1802
1.86k
        closing = true;
1803
1.86k
        next();
1804
1.86k
        break;
1805
0
    }
1806
0
    default: {
1807
0
        ERROR(status);
1808
0
        return {};
1809
0
    }
1810
12.0k
    }
1811
1812
    // Parse the markup identifier
1813
12.0k
    builder.setName(parseIdentifier(status));
1814
1815
    // Parse the options, which must begin with a ' '
1816
    // if present
1817
12.0k
    if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
1818
7.95k
        OptionAdder<Markup::Builder> optionAdder(builder);
1819
7.95k
        parseOptions(optionAdder, status);
1820
7.95k
    }
1821
1822
    // Parse the attributes, which also must begin
1823
    // with a ' '
1824
12.0k
    if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) {
1825
7.60k
        AttributeAdder<Markup::Builder> attrAdder(builder);
1826
7.60k
        parseAttributes(attrAdder, status);
1827
7.60k
    }
1828
1829
12.0k
    parseOptionalWhitespace();
1830
1831
12.0k
    bool standalone = false;
1832
    // Check if this is a standalone or not
1833
12.0k
    if (!closing) {
1834
10.2k
        if (inBounds() && peek() == SLASH) {
1835
208
            standalone = true;
1836
208
            normalizedInput += SLASH;
1837
208
            next();
1838
208
        }
1839
10.2k
    }
1840
1841
12.0k
    parseToken(RIGHT_CURLY_BRACE, status);
1842
1843
12.0k
    if (standalone) {
1844
208
        builder.setStandalone();
1845
11.8k
    } else if (closing) {
1846
1.86k
        builder.setClose();
1847
9.99k
    } else {
1848
9.99k
        builder.setOpen();
1849
9.99k
    }
1850
1851
12.0k
    return builder.build(status);
1852
12.0k
}
1853
1854
/*
1855
  Consume a `placeholder`, matching the nonterminal in the grammar
1856
  No postcondition (a placeholder can end a message)
1857
*/
1858
19.5k
std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
1859
19.5k
    U_ASSERT(peek() == LEFT_CURLY_BRACE);
1860
1861
19.5k
    if (!inBounds()) {
1862
0
        ERROR(status);
1863
0
        return exprFallback(status);
1864
0
    }
1865
1866
    // Need to look ahead arbitrarily since whitespace
1867
    // can appear before the '{' and '#'
1868
    // in markup
1869
19.5k
    int32_t tempIndex = 1;
1870
19.5k
    bool isMarkup = false;
1871
26.0k
    while (inBounds(1)) {
1872
26.0k
        UChar32 c = peek(tempIndex);
1873
26.0k
        if (c == NUMBER_SIGN || c == SLASH) {
1874
12.0k
            isMarkup = true;
1875
12.0k
            break;
1876
12.0k
        }
1877
13.9k
        if (!(isWhitespace(c) || isBidiControl(c))) {
1878
7.41k
            break;
1879
7.41k
        }
1880
6.53k
        tempIndex++;
1881
6.53k
    }
1882
1883
19.5k
    if (isMarkup) {
1884
12.0k
        return parseMarkup(status);
1885
12.0k
    }
1886
7.45k
    return parseExpression(status);
1887
19.5k
}
1888
1889
/*
1890
  Consume a `simple-message`, matching the nonterminal in the grammar
1891
  Postcondition: `index == len()` or U_FAILURE(status);
1892
  for a syntactically correct message, this will consume the entire input
1893
*/
1894
6.23k
Pattern Parser::parseSimpleMessage(UErrorCode& status) {
1895
6.23k
    Pattern::Builder result(status);
1896
1897
6.23k
    if (U_SUCCESS(status)) {
1898
6.23k
        Expression expression;
1899
9.51M
        while (inBounds()) {
1900
9.51M
            switch (peek()) {
1901
19.5k
            case LEFT_CURLY_BRACE: {
1902
                // Must be placeholder
1903
19.5k
                std::variant<Expression, Markup> piece = parsePlaceholder(status);
1904
19.5k
                if (std::holds_alternative<Expression>(piece)) {
1905
7.45k
                    Expression expr = *std::get_if<Expression>(&piece);
1906
7.45k
                    result.add(std::move(expr), status);
1907
12.0k
                } else {
1908
12.0k
                    Markup markup = *std::get_if<Markup>(&piece);
1909
12.0k
                    result.add(std::move(markup), status);
1910
12.0k
                }
1911
19.5k
                break;
1912
0
            }
1913
1.29k
            case BACKSLASH: {
1914
                // Must be escaped-char
1915
1.29k
                result.add(parseEscapeSequence(status), status);
1916
1.29k
                break;
1917
0
            }
1918
16
            case RIGHT_CURLY_BRACE: {
1919
                // Distinguish unescaped '}' from end of quoted pattern
1920
16
                break;
1921
0
            }
1922
9.49M
            default: {
1923
                // Must be text-char
1924
9.49M
                result.add(parseTextChar(status), status);
1925
9.49M
                break;
1926
0
            }
1927
9.51M
            }
1928
9.51M
            if (peek() == RIGHT_CURLY_BRACE) {
1929
                // End of quoted pattern
1930
203
                break;
1931
203
            }
1932
            // Don't loop infinitely
1933
9.51M
            if (errors.hasSyntaxError() || U_FAILURE(status)) {
1934
5.48k
                break;
1935
5.48k
            }
1936
9.51M
        }
1937
6.23k
    }
1938
6.23k
    return result.build(status);
1939
6.23k
}
1940
1941
571
void Parser::parseVariant(UErrorCode& status) {
1942
571
    CHECK_ERROR(status);
1943
1944
    // At least one key is required
1945
568
    SelectorKeys keyList(parseNonEmptyKeys(status));
1946
1947
    // parseNonEmptyKeys() consumes any trailing whitespace,
1948
    // so the pattern can be consumed next.
1949
1950
    // Restore precondition before calling parsePattern()
1951
    // (which must return a non-null value)
1952
568
    CHECK_BOUNDS(status);
1953
294
    Pattern rhs = parseQuotedPattern(status);
1954
1955
294
    dataModel.addVariant(std::move(keyList), std::move(rhs), status);
1956
294
}
1957
1958
/*
1959
  Consume a `selectors` (matching the nonterminal in the grammar),
1960
  followed by a non-empty sequence of `variant`s (matching the nonterminal
1961
  in the grammar) preceded by whitespace
1962
  No postcondition (on return, `index` might equal `len()` with no syntax error
1963
  because a message can end with a variant)
1964
*/
1965
745
void Parser::parseSelectors(UErrorCode& status) {
1966
745
    CHECK_ERROR(status);
1967
1968
745
    U_ASSERT(inBounds());
1969
1970
745
    parseToken(ID_MATCH, status);
1971
1972
745
    bool empty = true;
1973
    // Parse selectors
1974
    // "Backtracking" is required here. It's not clear if whitespace is
1975
    // (`[s]` selector) or (`[s]` variant)
1976
698k
    while (isWhitespace(peek()) || peek() == DOLLAR) {
1977
697k
        int32_t whitespaceStart = index;
1978
697k
        parseRequiredWhitespace(status);
1979
        // Restore precondition
1980
697k
        CHECK_BOUNDS(status);
1981
697k
        if (peek() != DOLLAR) {
1982
            // This is not necessarily an error, but rather,
1983
            // means the whitespace we parsed was the optional
1984
            // whitespace preceding the first variant, not the
1985
            // required whitespace preceding a subsequent variable.
1986
            // In that case, "push back" the whitespace.
1987
77
            normalizedInput.truncate(normalizedInput.length() - 1);
1988
77
            index = whitespaceStart;
1989
77
            break;
1990
77
        }
1991
697k
        VariableName var = parseVariableName(status);
1992
697k
        empty = false;
1993
1994
697k
        dataModel.addSelector(std::move(var), status);
1995
697k
        CHECK_ERROR(status);
1996
697k
    }
1997
1998
    // At least one selector is required
1999
730
    if (empty) {
2000
200
        ERROR(status);
2001
200
        return;
2002
200
    }
2003
2004
530
    #define CHECK_END_OF_INPUT                     \
2005
530
        if (!inBounds()) {                         \
2006
62
            break;                                 \
2007
62
        }                                          \
2008
530
2009
    // Parse variants
2010
    // matcher = match-statement s variant *(o variant)
2011
2012
    // Parse first variant
2013
530
    parseRequiredWhitespace(status);
2014
530
    if (!inBounds()) {
2015
57
        ERROR(status);
2016
57
        return;
2017
57
    }
2018
473
    parseVariant(status);
2019
473
    if (!inBounds()) {
2020
        // Not an error; there might be only one variant
2021
291
        return;
2022
291
    }
2023
2024
182
    while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) {
2025
100
        parseOptionalWhitespace();
2026
        // Restore the precondition.
2027
        // Trailing whitespace is allowed.
2028
100
        if (!inBounds()) {
2029
2
            return;
2030
2
        }
2031
2032
98
        parseVariant(status);
2033
2034
        // Restore the precondition, *without* erroring out if we've
2035
        // reached the end of input. That's because it's valid for the
2036
        // message to end with a variant that has no trailing whitespace.
2037
        // Why do we need to check this condition twice inside the loop?
2038
        // Because if we don't check it here, the `isWhitespace()` call in
2039
        // the loop head will read off the end of the input string.
2040
98
        CHECK_END_OF_INPUT
2041
2042
36
        if (errors.hasSyntaxError() || U_FAILURE(status)) {
2043
36
            break;
2044
36
        }
2045
36
    }
2046
182
}
2047
2048
/*
2049
  Consume a `body` (matching the nonterminal in the grammar),
2050
  No postcondition (on return, `index` might equal `len()` with no syntax error,
2051
  because a message can end with a body (trailing whitespace is optional)
2052
*/
2053
2054
343
void Parser::errorPattern(UErrorCode& status) {
2055
343
    errors.addSyntaxError(status);
2056
    // Set to empty pattern
2057
343
    Pattern::Builder result = Pattern::Builder(status);
2058
343
    CHECK_ERROR(status);
2059
2060
    // If still in bounds, then add the remaining input as a single text part
2061
    // to the pattern
2062
    /*
2063
      TODO: this behavior isn't documented in the spec, but it comes from
2064
      https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
2065
      and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
2066
      whether this is the intent behind the spec
2067
     */
2068
343
    UnicodeString partStr(LEFT_CURLY_BRACE);
2069
4.15M
    while (inBounds()) {
2070
4.15M
        partStr += peek();
2071
4.15M
        next();
2072
4.15M
    }
2073
    // Add curly braces around the entire output (same comment as above)
2074
343
    partStr += RIGHT_CURLY_BRACE;
2075
343
    result.add(std::move(partStr), status);
2076
343
    dataModel.setPattern(result.build(status));
2077
343
}
2078
2079
1.10k
void Parser::parseBody(UErrorCode& status) {
2080
1.10k
    CHECK_ERROR(status);
2081
2082
    // Out-of-input is a syntax warning
2083
1.10k
    if (!inBounds()) {
2084
177
        errorPattern(status);
2085
177
        return;
2086
177
    }
2087
2088
    // Body must be either a pattern or selectors
2089
925
    switch (peek()) {
2090
14
    case LEFT_CURLY_BRACE: {
2091
        // Pattern
2092
14
        dataModel.setPattern(parseQuotedPattern(status));
2093
14
        break;
2094
0
    }
2095
745
    case ID_MATCH[0]: {
2096
        // Selectors
2097
745
        parseSelectors(status);
2098
745
        return;
2099
0
    }
2100
166
    default: {
2101
166
        ERROR(status);
2102
166
        errorPattern(status);
2103
166
        return;
2104
0
    }
2105
925
    }
2106
925
}
2107
2108
// -------------------------------------
2109
// Parses the source pattern.
2110
2111
7.02k
void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
2112
7.02k
    CHECK_ERROR(status);
2113
2114
7.02k
    bool complex = false;
2115
    // First, "look ahead" to determine if this is a simple or complex
2116
    // message. To do that, check the first non-whitespace character.
2117
10.8k
    while (inBounds(index) && (isWhitespace(peek()) || isBidiControl(peek()))) {
2118
3.86k
        next();
2119
3.86k
    }
2120
2121
    // Message can be empty, so we need to only look ahead
2122
    // if we know it's non-empty
2123
7.02k
    if (inBounds()) {
2124
7.02k
        if (peek() == PERIOD
2125
5.92k
            || (inBounds(1)
2126
5.80k
                && peek() == LEFT_CURLY_BRACE
2127
4.69k
                && peek(1) == LEFT_CURLY_BRACE)) {
2128
1.10k
            complex = true;
2129
1.10k
        }
2130
7.02k
    }
2131
    // Reset index
2132
7.02k
    index = 0;
2133
2134
    // Message can be empty, so we need to only look ahead
2135
    // if we know it's non-empty
2136
7.02k
    if (complex) {
2137
1.10k
        parseOptionalWhitespace();
2138
1.10k
        parseDeclarations(status);
2139
1.10k
        parseBody(status);
2140
1.10k
        parseOptionalWhitespace();
2141
5.92k
    } else {
2142
        // Simple message
2143
        // For normalization, quote the pattern
2144
5.92k
        normalizedInput += LEFT_CURLY_BRACE;
2145
5.92k
        normalizedInput += LEFT_CURLY_BRACE;
2146
5.92k
        dataModel.setPattern(parseSimpleMessage(status));
2147
5.92k
        normalizedInput += RIGHT_CURLY_BRACE;
2148
5.92k
        normalizedInput += RIGHT_CURLY_BRACE;
2149
5.92k
    }
2150
2151
7.02k
    CHECK_ERROR(status);
2152
2153
    // There are no errors; finally, check that the entire input was consumed
2154
6.76k
    if (!allConsumed()) {
2155
4.64k
        ERROR(status);
2156
4.64k
    }
2157
2158
    // Finally, copy the relevant fields of the internal `MessageParseError`
2159
    // into the `UParseError` argument
2160
6.76k
    translateParseError(parseError, parseErrorResult);
2161
6.76k
}
2162
2163
7.02k
Parser::~Parser() {}
2164
2165
} // namespace message2
2166
U_NAMESPACE_END
2167
2168
#endif /* #if !UCONFIG_NO_MF2 */
2169
2170
#endif /* #if !UCONFIG_NO_FORMATTING */
2171
2172
#endif /* #if !UCONFIG_NO_NORMALIZATION */