/src/icu/icu4c/source/i18n/messageformat2_parser.cpp
Line | Count | Source |
1 | | // © 2024 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | |
4 | | #include "unicode/utypes.h" |
5 | | |
6 | | #if !UCONFIG_NO_NORMALIZATION |
7 | | |
8 | | #if !UCONFIG_NO_FORMATTING |
9 | | |
10 | | #if !UCONFIG_NO_MF2 |
11 | | |
12 | | #include "unicode/uniset.h" |
13 | | #include "messageformat2_errors.h" |
14 | | #include "messageformat2_macros.h" |
15 | | #include "messageformat2_parser.h" |
16 | | #include "ucln_in.h" |
17 | | #include "umutex.h" |
18 | | #include "uvector.h" // U_ASSERT |
19 | | |
20 | | U_NAMESPACE_BEGIN |
21 | | |
22 | | namespace message2 { |
23 | | |
24 | | using namespace pluralimpl; |
25 | | |
26 | | using namespace data_model; |
27 | | |
28 | | /* |
29 | | The `ERROR()` macro sets a syntax error in the context |
30 | | and sets the offset in `parseError` to `index`. It does not alter control flow. |
31 | | */ |
#define ERROR(errorCode) ERROR_AT(errorCode, index)

/*
  `ERROR_AT()` is the general form of `ERROR()`: if `errors` has no syntax
  error recorded yet, it stores the offset `i` in `parseError` (via
  `setParseError()`) and adds a syntax error to `errors`. If a syntax error
  was already recorded, it does nothing. Like `ERROR()`, it never returns
  from the enclosing function. `ERROR()` is simply `ERROR_AT()` applied to
  the current `index`.
*/
#define ERROR_AT(errorCode, i)                 \
    if (!errors.hasSyntaxError()) {            \
        setParseError(parseError, i);          \
        errors.addSyntaxError(errorCode);      \
    }
43 | | |
44 | | // Increments the line number and updates the "characters seen before |
45 | | // current line" count in `parseError`, iff `peek()` is a newline |
46 | 11.7M | void Parser::maybeAdvanceLine() { |
47 | 11.7M | if (peek() == LF) { |
48 | 79.5k | parseError.line++; |
49 | | // add 1 to index to get the number of characters seen so far |
50 | | // (including the newline) |
51 | 79.5k | parseError.lengthBeforeCurrentLine = index + 1; |
52 | 79.5k | } |
53 | 11.7M | } |
54 | | |
/*
  `CHECK_BOUNDS()`: if `inBounds()` is false, signals a syntax error through
  `ERROR()` (which is a no-op when a syntax error was already recorded) and
  returns from the enclosing function. Otherwise it does nothing.

  Because it expands to a bare `return;`, it can only be used inside
  functions returning `void`.
*/
#define CHECK_BOUNDS(errorCode) \
    if (!inBounds()) { \
        ERROR(errorCode); \
        return; \
    }
/*
  `CHECK_BOUNDS_1()`: same as `CHECK_BOUNDS()`, but tests `inBounds(1)` and
  reports the error at offset `index + 1` through `ERROR_AT()`.
  NOTE(review): `inBounds(1)` presumably tests the position one past the
  current one -- confirm against its definition.
*/
#define CHECK_BOUNDS_1(errorCode) \
    if (!inBounds(1)) { \
        ERROR_AT(errorCode, index + 1); \
        return; \
    }
69 | | |
70 | | // ------------------------------------- |
71 | | // Helper functions |
72 | | |
73 | 13.6k | static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) { |
74 | 13.6k | for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) { |
75 | 13.6k | out[i] = in[i]; |
76 | 13.6k | if (in[i] == '\0') { |
77 | 13.6k | break; |
78 | 13.6k | } |
79 | 13.6k | } |
80 | 13.6k | } |
81 | | |
82 | 6.80k | /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) { |
83 | 6.80k | parseError.line = messageParseError.line; |
84 | 6.80k | parseError.offset = messageParseError.offset; |
85 | 6.80k | copyContext(messageParseError.preContext, parseError.preContext); |
86 | 6.80k | copyContext(messageParseError.postContext, parseError.postContext); |
87 | 6.80k | } |
88 | | |
89 | 6.62k | /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) { |
90 | | // Translate absolute to relative offset |
91 | 6.62k | parseError.offset = index // Start with total number of characters seen |
92 | 6.62k | - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line |
93 | | // TODO: Fill this in with actual pre and post-context |
94 | 6.62k | parseError.preContext[0] = 0; |
95 | 6.62k | parseError.postContext[0] = 0; |
96 | 6.62k | } |
97 | | |
98 | | // ------------------------------------- |
99 | | // Initialization of UnicodeSets |
100 | | |
namespace unisets {

// Table of the UnicodeSets used by the parser, indexed by `Key`.
// Every entry starts out null; entries are filled in by the init functions
// in this file and deleted again by `cleanupMF2ParseUniSets()`.
UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {};

// Returns the table entry for `key` as-is: no lazy initialization and no
// null check. The result may be null if the table has not been initialized.
inline UnicodeSet* getImpl(Key key) {
    return gUnicodeSets[key];
}

// Init-once guard passed to `umtx_initOnce()` by `get()`; reset by the
// cleanup function.
icu::UInitOnce gMF2ParseUniSetsInitOnce {};
}
111 | | |
112 | 1 | UnicodeSet* initContentChars(UErrorCode& status) { |
113 | 1 | if (U_FAILURE(status)) { |
114 | 0 | return nullptr; |
115 | 0 | } |
116 | | |
117 | 1 | UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF |
118 | 1 | if (result == nullptr) { |
119 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
120 | 0 | return nullptr; |
121 | 0 | } |
122 | 1 | result->add(0x000B, 0x000C); // Omit CR |
123 | 1 | result->add(0x000E, 0x001F); // Omit SP |
124 | 1 | result->add(0x0021, 0x002D); // Omit '.' |
125 | 1 | result->add(0x002F, 0x003F); // Omit '@' |
126 | 1 | result->add(0x0041, 0x005B); // Omit '\' |
127 | 1 | result->add(0x005D, 0x007A); // Omit { | } |
128 | 1 | result->add(0x007E, 0x2FFF); // Omit IDEOGRAPHIC_SPACE |
129 | 1 | result->add(0x3001, 0x10FFFF); // Allowing surrogates is intentional |
130 | 1 | result->freeze(); |
131 | 1 | return result; |
132 | 1 | } |
133 | | |
134 | 1 | UnicodeSet* initWhitespace(UErrorCode& status) { |
135 | 1 | if (U_FAILURE(status)) { |
136 | 0 | return nullptr; |
137 | 0 | } |
138 | | |
139 | 1 | UnicodeSet* result = new UnicodeSet(); |
140 | 1 | if (result == nullptr) { |
141 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
142 | 0 | return nullptr; |
143 | 0 | } |
144 | 1 | result->add(SPACE); |
145 | 1 | result->add(HTAB); |
146 | 1 | result->add(CR); |
147 | 1 | result->add(LF); |
148 | 1 | result->add(IDEOGRAPHIC_SPACE); |
149 | 1 | result->freeze(); |
150 | 1 | return result; |
151 | 1 | } |
152 | | |
153 | 1 | UnicodeSet* initBidiControls(UErrorCode& status) { |
154 | 1 | UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status); |
155 | 1 | if (U_FAILURE(status)) { |
156 | 0 | return nullptr; |
157 | 0 | } |
158 | 1 | result->add(0x200E, 0x200F); |
159 | 1 | result->add(0x2066, 0x2069); |
160 | 1 | result->freeze(); |
161 | 1 | return result; |
162 | 1 | } |
163 | | |
164 | 1 | UnicodeSet* initAlpha(UErrorCode& status) { |
165 | 1 | UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status); |
166 | 1 | if (U_FAILURE(status)) { |
167 | 0 | return nullptr; |
168 | 0 | } |
169 | 1 | result->freeze(); |
170 | 1 | return result; |
171 | 1 | } |
172 | | |
173 | 1 | UnicodeSet* initDigits(UErrorCode& status) { |
174 | 1 | UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status); |
175 | 1 | if (U_FAILURE(status)) { |
176 | 0 | return nullptr; |
177 | 0 | } |
178 | 1 | result->freeze(); |
179 | 1 | return result; |
180 | 1 | } |
181 | | |
182 | 1 | UnicodeSet* initNameStartChars(UErrorCode& status) { |
183 | 1 | if (U_FAILURE(status)) { |
184 | 0 | return nullptr; |
185 | 0 | } |
186 | | |
187 | 1 | UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status); |
188 | 1 | if (U_FAILURE(status)) { |
189 | 0 | return nullptr; |
190 | 0 | } |
191 | 1 | UnicodeSet* result = new UnicodeSet(); |
192 | 1 | if (result == nullptr) { |
193 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
194 | 0 | return nullptr; |
195 | 1 | }; |
196 | | |
197 | 1 | result->addAll(*isAlpha); |
198 | 1 | result->add(0x002B); |
199 | 1 | result->add(0x005F); |
200 | 1 | result->add(0x00A1, 0x061B); |
201 | 1 | result->add(0x061D, 0x167F); |
202 | 1 | result->add(0x1681, 0x1FFF); |
203 | 1 | result->add(0x200B, 0x200D); |
204 | 1 | result->add(0x2010, 0x2027); |
205 | 1 | result->add(0x2030, 0x205E); |
206 | 1 | result->add(0x2060, 0x2065); |
207 | 1 | result->add(0x206A, 0x2FFF); |
208 | 1 | result->add(0x3001, 0xD7FF); |
209 | 1 | result->add(0xE000, 0xFDCF); |
210 | 1 | result->add(0xFDF0, 0xFFFD); |
211 | 1 | result->add(0x10000, 0x1FFFD); |
212 | 1 | result->add(0x20000, 0x2FFFD); |
213 | 1 | result->add(0x30000, 0x3FFFD); |
214 | 1 | result->add(0x40000, 0x4FFFD); |
215 | 1 | result->add(0x50000, 0x5FFFD); |
216 | 1 | result->add(0x60000, 0x6FFFD); |
217 | 1 | result->add(0x70000, 0x7FFFD); |
218 | 1 | result->add(0x80000, 0x8FFFD); |
219 | 1 | result->add(0x90000, 0x9FFFD); |
220 | 1 | result->add(0xA0000, 0xAFFFD); |
221 | 1 | result->add(0xB0000, 0xBFFFD); |
222 | 1 | result->add(0xC0000, 0xCFFFD); |
223 | 1 | result->add(0xD0000, 0xDFFFD); |
224 | 1 | result->add(0xE0000, 0xEFFFD); |
225 | 1 | result->add(0xF0000, 0xFFFFD); |
226 | 1 | result->add(0x100000, 0x10FFFD); |
227 | 1 | result->freeze(); |
228 | 1 | return result; |
229 | 1 | } |
230 | | |
231 | 1 | UnicodeSet* initNameChars(UErrorCode& status) { |
232 | 1 | if (U_FAILURE(status)) { |
233 | 0 | return nullptr; |
234 | 0 | } |
235 | | |
236 | 1 | UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status); |
237 | 1 | UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status); |
238 | 1 | if (U_FAILURE(status)) { |
239 | 0 | return nullptr; |
240 | 0 | } |
241 | 1 | UnicodeSet* result = new UnicodeSet(); |
242 | 1 | if (result == nullptr) { |
243 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
244 | 0 | return nullptr; |
245 | 1 | }; |
246 | 1 | result->addAll(*nameStart); |
247 | 1 | result->addAll(*digit); |
248 | 1 | result->add(HYPHEN); |
249 | 1 | result->add(PERIOD); |
250 | 1 | result->freeze(); |
251 | 1 | return result; |
252 | 1 | } |
253 | | |
254 | 1 | UnicodeSet* initTextChars(UErrorCode& status) { |
255 | 1 | if (U_FAILURE(status)) { |
256 | 0 | return nullptr; |
257 | 0 | } |
258 | | |
259 | 1 | UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status); |
260 | 1 | UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status); |
261 | 1 | if (U_FAILURE(status)) { |
262 | 0 | return nullptr; |
263 | 0 | } |
264 | 1 | UnicodeSet* result = new UnicodeSet(); |
265 | 1 | if (result == nullptr) { |
266 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
267 | 0 | return nullptr; |
268 | 1 | }; |
269 | 1 | result->addAll(*content); |
270 | 1 | result->addAll(*whitespace); |
271 | 1 | result->add(PERIOD); |
272 | 1 | result->add(AT); |
273 | 1 | result->add(PIPE); |
274 | 1 | result->freeze(); |
275 | 1 | return result; |
276 | 1 | } |
277 | | |
278 | 1 | UnicodeSet* initQuotedChars(UErrorCode& status) { |
279 | 1 | if (U_FAILURE(status)) { |
280 | 0 | return nullptr; |
281 | 0 | } |
282 | | |
283 | 1 | unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status); |
284 | 1 | if (U_FAILURE(status)) { |
285 | 0 | return nullptr; |
286 | 0 | } |
287 | 1 | UnicodeSet* result = new UnicodeSet(); |
288 | 1 | if (result == nullptr) { |
289 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
290 | 0 | return nullptr; |
291 | 1 | }; |
292 | | // content and whitespace were initialized by `initTextChars()` |
293 | 1 | UnicodeSet* content = unisets::getImpl(unisets::CONTENT); |
294 | 1 | if (content == nullptr) { |
295 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
296 | 0 | return nullptr; |
297 | 0 | } |
298 | 1 | result->addAll(*content); |
299 | 1 | UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE); |
300 | 1 | if (whitespace == nullptr) { |
301 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
302 | 0 | return nullptr; |
303 | 0 | } |
304 | 1 | result->addAll(*whitespace); |
305 | 1 | result->add(PERIOD); |
306 | 1 | result->add(AT); |
307 | 1 | result->add(LEFT_CURLY_BRACE); |
308 | 1 | result->add(RIGHT_CURLY_BRACE); |
309 | 1 | result->freeze(); |
310 | 1 | return result; |
311 | 1 | } |
312 | | |
313 | 1 | UnicodeSet* initEscapableChars(UErrorCode& status) { |
314 | 1 | if (U_FAILURE(status)) { |
315 | 0 | return nullptr; |
316 | 0 | } |
317 | | |
318 | 1 | UnicodeSet* result = new UnicodeSet(); |
319 | 1 | if (result == nullptr) { |
320 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
321 | 0 | return nullptr; |
322 | 0 | } |
323 | 1 | result->add(PIPE); |
324 | 1 | result->add(BACKSLASH); |
325 | 1 | result->add(LEFT_CURLY_BRACE); |
326 | 1 | result->add(RIGHT_CURLY_BRACE); |
327 | 1 | result->freeze(); |
328 | 1 | return result; |
329 | 1 | } |
330 | | |
331 | | namespace unisets { |
332 | | |
333 | 0 | UBool U_CALLCONV cleanupMF2ParseUniSets() { |
334 | 0 | for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { |
335 | 0 | delete gUnicodeSets[i]; |
336 | 0 | gUnicodeSets[i] = nullptr; |
337 | 0 | } |
338 | 0 | gMF2ParseUniSetsInitOnce.reset(); |
339 | 0 | return true; |
340 | 0 | } |
341 | | |
342 | 1 | void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) { |
343 | 1 | ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets); |
344 | | /* |
345 | | Each of the init functions initializes the UnicodeSets |
346 | | that it depends on. |
347 | | |
348 | | initBidiControls (no dependencies) |
349 | | |
350 | | initEscapableChars (no dependencies) |
351 | | |
352 | | initNameChars depends on |
353 | | initDigits |
354 | | initNameStartChars depends on |
355 | | initAlpha |
356 | | |
357 | | initQuotedChars depends on |
358 | | initTextChars depends on |
359 | | initContentChars |
360 | | initWhitespace |
361 | | */ |
362 | 1 | gUnicodeSets[unisets::BIDI] = initBidiControls(status); |
363 | 1 | gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status); |
364 | 1 | gUnicodeSets[unisets::QUOTED] = initQuotedChars(status); |
365 | 1 | gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status); |
366 | | |
367 | 1 | if (U_FAILURE(status)) { |
368 | 0 | cleanupMF2ParseUniSets(); |
369 | 0 | } |
370 | 1 | } |
371 | | |
372 | 71.2k | const UnicodeSet* get(Key key, UErrorCode& status) { |
373 | 71.2k | umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status); |
374 | 71.2k | if (U_FAILURE(status)) { |
375 | 0 | return nullptr; |
376 | 0 | } |
377 | 71.2k | UnicodeSet* result = getImpl(key); |
378 | 71.2k | if (result == nullptr) { |
379 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
380 | 0 | } |
381 | 71.2k | return result; |
382 | 71.2k | } |
383 | | |
384 | | } |
385 | | |
386 | | // ------------------------------------- |
387 | | // Predicates |
388 | | |
389 | | /* |
390 | | The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar: |
391 | | |
392 | | `isContentChar()` : `content-char` |
393 | | `isTextChar()` : `text-char` |
394 | | `isAlpha()` : `ALPHA` |
395 | | `isDigit()` : `DIGIT` |
396 | | `isNameStart()` : `name-start` |
397 | | `isNameChar()` : `name-char` |
398 | | `isUnquotedStart()` : `unquoted-start` |
399 | | `isQuotedChar()` : `quoted-char` |
  `isWhitespace()`    : `ws`
401 | | */ |
402 | | |
403 | 0 | bool Parser::isContentChar(UChar32 c) const { |
404 | 0 | return contentChars->contains(c); |
405 | 0 | } |
406 | | |
407 | | // See `bidi` in the MF2 grammar |
408 | 13.4M | bool Parser::isBidiControl(UChar32 c) const { |
409 | 13.4M | return bidiControlChars->contains(c); |
410 | 13.4M | } |
411 | | |
412 | | // See `ws` in the MessageFormat 2 grammar |
413 | 13.4M | bool Parser::isWhitespace(UChar32 c) const { |
414 | 13.4M | return whitespaceChars->contains(c); |
415 | 13.4M | } |
416 | | |
417 | 11.5M | bool Parser::isTextChar(UChar32 c) const { |
418 | 11.5M | return textChars->contains(c); |
419 | 11.5M | } |
420 | | |
421 | 0 | bool Parser::isAlpha(UChar32 c) const { |
422 | 0 | return alphaChars->contains(c); |
423 | 0 | } |
424 | | |
425 | 79 | bool Parser::isDigit(UChar32 c) const { |
426 | 79 | return digitChars->contains(c); |
427 | 79 | } |
428 | | |
429 | 390k | bool Parser::isNameStart(UChar32 c) const { |
430 | 390k | return nameStartChars->contains(c); |
431 | 390k | } |
432 | | |
433 | 23.8M | bool Parser::isNameChar(UChar32 c) const { |
434 | 23.8M | return nameChars->contains(c); |
435 | 23.8M | } |
436 | | |
437 | 12.3k | bool Parser::isUnquotedStart(UChar32 c) const { |
438 | 12.3k | return isNameChar(c); |
439 | 12.3k | } |
440 | | |
441 | 119k | bool Parser::isQuotedChar(UChar32 c) const { |
442 | 119k | return quotedChars->contains(c); |
443 | 119k | } |
444 | | |
445 | 11.6M | bool Parser::isEscapableChar(UChar32 c) const { |
446 | 11.6M | return escapableChars->contains(c); |
447 | 11.6M | } |
448 | | |
449 | | // Returns true iff `c` can begin a `function` nonterminal |
450 | 176k | static bool isFunctionStart(UChar32 c) { |
451 | 176k | switch (c) { |
452 | 161k | case COLON: { |
453 | 161k | return true; |
454 | 0 | } |
455 | 14.8k | default: { |
456 | 14.8k | return false; |
457 | 0 | } |
458 | 176k | } |
459 | 176k | } |
460 | | |
461 | | // Returns true iff `c` can begin an `annotation` nonterminal |
462 | 68.6k | static bool isAnnotationStart(UChar32 c) { |
463 | 68.6k | return isFunctionStart(c); |
464 | 68.6k | } |
465 | | |
466 | | // Returns true iff `c` can begin a `literal` nonterminal |
467 | 157 | bool Parser::isLiteralStart(UChar32 c) const { |
468 | 157 | return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c)); |
469 | 157 | } |
470 | | |
471 | | // Returns true iff `c` can begin a `key` nonterminal |
472 | 161 | bool Parser::isKeyStart(UChar32 c) const { |
473 | 161 | return (c == ASTERISK || isLiteralStart(c)); |
474 | 161 | } |
475 | | |
476 | 0 | bool Parser::isDeclarationStart() { |
477 | 0 | return (peek() == ID_LOCAL[0] |
478 | 0 | && inBounds(1) |
479 | 0 | && peek(1) == ID_LOCAL[1]) |
480 | 0 | || (peek() == ID_INPUT[0] |
481 | 0 | && inBounds(1) |
482 | 0 | && peek(1) == ID_INPUT[1]); |
483 | 0 | } |
484 | | |
485 | | // ------------------------------------- |
486 | | // Parsing functions |
487 | | |
488 | | |
489 | | /* |
490 | | TODO: Since handling the whitespace ambiguities needs to be repeated |
491 | | in several different places and is hard to factor out, |
492 | | it probably would be better to replace the parser with a lexer + parser |
493 | | to separate tokenizing from parsing, which would simplify the code significantly. |
494 | | This has the disadvantage that there is no token grammar for MessageFormat, |
495 | | so one would have to be invented that isn't a component of the spec. |
496 | | */ |
497 | | |
498 | | /* |
499 | | This is a recursive-descent scannerless parser that, |
500 | | with a few exceptions, uses 1 character of lookahead. |
501 | | |
502 | | This may not be an exhaustive list, as the additions of attributes and reserved |
503 | | statements introduced several new ambiguities. |
504 | | |
505 | | All but three of the exceptions involve ambiguities about the meaning of whitespace. |
506 | | One ambiguity not involving whitespace is: |
507 | | identifier -> namespace ":" name |
508 | | vs. |
509 | | identifier -> name |
510 | | |
511 | | `namespace` and `name` can't be distinguished without arbitrary lookahead. |
512 | | (For how this is handled, see parseIdentifier()) |
513 | | |
514 | | The second ambiguity not involving whitespace is: |
515 | | complex-message -> *(declaration[s]) complex-body |
516 | | -> declaration *(declaration[s]) complex-body |
517 | | -> declaration complex-body |
518 | | -> reserved-statement complex-body |
519 | | -> .foo {$x} .match // ... |
520 | | When processing the '.', arbitrary lookahead is required to distinguish the |
521 | | arbitrary-length unsupported keyword from `.match`. |
522 | | (For how this is handled, see parseDeclarations()). |
523 | | |
524 | | The third ambiguity not involving whitespace is: |
525 | | complex-message -> *(declaration [s]) complex-body |
526 | | -> reserved-statement *(declaration [s]) complex-body |
527 | | -> reserved-statement complex-body |
528 | | -> reserved-statement quotedPattern |
529 | | -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern |
530 | | -> reserved-keyword expression quoted-pattern |
531 | | Example: .foo {1} {{1}} |
532 | | |
533 | | Without lookahead, the opening '{' of the quoted pattern can't be distinguished |
534 | | from the opening '{' of another expression in the unsupported statement. |
535 | | (Though this only requires 1 character of lookahead.) |
536 | | |
537 | | Otherwise: |
538 | | |
539 | | There are at least seven ambiguities in the grammar that can't be resolved with finite |
540 | | lookahead (since whitespace sequences can be arbitrarily long). They are resolved |
541 | | with a form of backtracking (early exit). No state needs to be saved/restored |
542 | | since whitespace doesn't affect the shape of the resulting parse tree, so it's |
543 | | not true backtracking. |
544 | | |
545 | | In addition, the grammar has been refactored |
546 | | in a semantics-preserving way in some cases to make the code easier to structure. |
547 | | |
548 | | First: variant = when 1*(s key) [s] pattern |
549 | | Example: when k {a} |
550 | | When reading the first space after 'k', it's ambiguous whether it's the |
551 | | required space before another key, or the optional space before `pattern`. |
552 | | (See comments in parseNonEmptyKeys()) |
553 | | |
554 | | Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
555 | | annotation = (function *(s option)) / reserved |
556 | | Example: {:f } |
557 | | When reading the first space after 'f', it's ambiguous whether it's the |
558 | | required space before an option, or the optional trailing space after an options list |
559 | | (in this case, the options list is empty). |
560 | | (See comments in parseOptions() -- handling this case also meant it was easier to base |
561 | | the code on a slightly refactored grammar, which should be semantically equivalent.) |
562 | | |
563 | | Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
564 | | annotation = (function *(s option)) / reserved |
565 | | Example: {@a } |
566 | | Similar to the previous case; see comments in parseReserved() |
567 | | |
568 | | Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
569 | | Example: {|foo| } |
570 | | When reading the first space after the '|', it's ambiguous whether it's the required |
571 | | space before an annotation, or the optional trailing space before the '}'. |
572 | | (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on |
573 | | the same grammar refactoring as the second exception.) |
574 | | |
575 | | Most functions match a non-terminal in the grammar, except as explained |
576 | | in comments. |
577 | | |
578 | | Fifth: matcher = match-statement 1*([s] variant) |
579 | | -> match 1 *([s] selector) 1*([s] variant) |
580 | | Example: match {42} * {{_}} |
581 | | When reading the space after the first '}', it's unclear whether |
582 | | it's the optional space before another selector, or the optional space |
583 | | before a variant. |
584 | | |
585 | | Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}" |
586 | | -> "{" [s] function *(s attribute) [s] "}" |
587 | | -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}" |
588 | | -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}" |
589 | | |
590 | | Example: {:func @foo} |
591 | | (Note: the same ambiguity is present with variable-expression and literal-expression) |
592 | | |
593 | | Seventh: |
594 | | |
595 | | |
596 | | When parsing the space, it's unclear whether it's the optional space before an |
597 | | option, or the optional space before an attribute. |
598 | | |
599 | | Unless otherwise noted in a comment, all helper functions that take |
600 | | a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode` |
601 | | have the precondition: |
602 | | `index` < `len()` |
603 | | and the postcondition: |
  `U_FAILURE(errorCode)` || `index` < `len()`
605 | | */ |
606 | | |
607 | | /* |
608 | | No pre, no post. |
609 | | A message may end with whitespace, so `index` may equal `len()` on exit. |
610 | | */ |
611 | 4.28M | void Parser::parseRequiredWS(UErrorCode& errorCode) { |
612 | 4.28M | bool sawWhitespace = false; |
613 | | |
614 | | // The loop exits either when we consume all the input, |
615 | | // or when we see a non-whitespace character. |
616 | 4.37M | while (true) { |
617 | | // Check if all input has been consumed |
618 | 4.37M | if (!inBounds()) { |
619 | | // If whitespace isn't required -- or if we saw it already -- |
620 | | // then the caller is responsible for checking this case and |
621 | | // setting an error if necessary. |
622 | 235 | if (sawWhitespace) { |
623 | | // Not an error. |
624 | 87 | return; |
625 | 87 | } |
626 | | // Otherwise, whitespace is required; the end of the input has |
627 | | // been reached without whitespace. This is an error. |
628 | 148 | ERROR(errorCode); |
629 | 148 | return; |
630 | 235 | } |
631 | | |
632 | | // Input remains; process the next character if it's whitespace, |
633 | | // exit the loop otherwise |
634 | 4.36M | if (isWhitespace(peek())) { |
635 | 84.5k | sawWhitespace = true; |
636 | | // Increment line number in parse error if we consume a newline |
637 | 84.5k | maybeAdvanceLine(); |
638 | 84.5k | next(); |
639 | 4.28M | } else { |
640 | 4.28M | break; |
641 | 4.28M | } |
642 | 4.36M | } |
643 | | |
644 | 4.28M | if (!sawWhitespace) { |
645 | 4.20M | ERROR(errorCode); |
646 | 4.20M | } |
647 | 4.28M | } |
648 | | |
649 | 4.42M | void Parser::parseOptionalBidi() { |
650 | 4.43M | while (true) { |
651 | 4.43M | if (!inBounds()) { |
652 | 632 | return; |
653 | 632 | } |
654 | 4.42M | if (isBidiControl(peek())) { |
655 | 2.78k | next(); |
656 | 4.42M | } else { |
657 | 4.42M | break; |
658 | 4.42M | } |
659 | 4.42M | } |
660 | 4.42M | } |
661 | | |
/*
  No pre, no post, because a message may end with whitespace
  Matches `s` in the MF2 grammar

  Consumes optional bidi controls, then at least one whitespace character
  (signaling a syntax error if there is none), then any further whitespace
  and bidi controls. A single SPACE is appended to `normalizedInput`
  regardless of how much was consumed or whether an error was signaled.
*/
void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
    // Leading bidi controls
    parseOptionalBidi();
    // The mandatory whitespace itself
    parseRequiredWS(errorCode);
    // Any remaining whitespace / bidi controls
    parseOptionalWhitespace();
    // The whole run is normalized to one space
    normalizedInput += SPACE;
}
672 | | |
673 | | /* |
674 | | No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`. |
675 | | */ |
676 | 4.63M | void Parser::parseOptionalWhitespace() { |
677 | 4.64M | while (true) { |
678 | 4.64M | if (!inBounds()) { |
679 | 2.82k | return; |
680 | 2.82k | } |
681 | 4.64M | auto cp = peek(); |
682 | 4.64M | if (isWhitespace(cp) || isBidiControl(cp)) { |
683 | 15.7k | maybeAdvanceLine(); |
684 | 15.7k | next(); |
685 | 4.62M | } else { |
686 | 4.62M | break; |
687 | 4.62M | } |
688 | 4.64M | } |
689 | 4.63M | } |
690 | | |
691 | | // Consumes a single character, signaling an error if `peek()` != `c` |
692 | | // No postcondition -- a message can end with a '}' token |
693 | 470k | void Parser::parseToken(UChar32 c, UErrorCode& errorCode) { |
694 | 470k | CHECK_BOUNDS(errorCode); |
695 | | |
696 | 470k | if (peek() == c) { |
697 | 325k | next(); |
698 | 325k | normalizedInput += c; |
699 | 325k | return; |
700 | 325k | } |
701 | | // Next character didn't match -- error out |
702 | 144k | ERROR(errorCode); |
703 | 144k | } |
704 | | |
705 | | /* |
706 | | Consumes a fixed-length token, signaling an error if the token isn't a prefix of |
707 | | the string beginning at `peek()` |
708 | | No postcondition -- a message can end with a '}' token |
709 | | */ |
710 | 56.5k | void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) { |
711 | 56.5k | U_ASSERT(inBounds()); |
712 | | |
713 | 56.5k | int32_t tokenPos = 0; |
714 | 169k | while (tokenPos < static_cast<int32_t>(token.length())) { |
715 | 169k | if (peek() != token[tokenPos]) { |
716 | 56.5k | ERROR(errorCode); |
717 | 56.5k | return; |
718 | 56.5k | } |
719 | 112k | normalizedInput += token[tokenPos]; |
720 | 112k | next(); |
721 | 112k | tokenPos++; |
722 | 112k | } |
723 | 56.5k | } |
724 | | |
/*
  Consumes optional whitespace, possibly advancing `index` to `index'`,
  then consumes a fixed-length token (signaling an error if the token isn't a prefix of
  the string beginning at `source[index']`),
  then consumes optional whitespace again

  A syntax error is also signaled (and the function returns early) if the
  input is exhausted either before the token or after the trailing
  whitespace.
*/
void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
    // No need for error check or bounds check before parseOptionalWhitespace
    parseOptionalWhitespace();
    // Establish precondition
    // (`parseToken()` asserts `inBounds()`, so bail out if input ran out)
    CHECK_BOUNDS(errorCode);
    parseToken(token, errorCode);
    parseOptionalWhitespace();
    // Guarantee postcondition
    // (input that ends right after the token is reported as an error)
    CHECK_BOUNDS(errorCode);
}
741 | | |
/*
  Consumes optional whitespace, possibly advancing `index` to `index'`,
  then consumes a single character (signaling an error if it doesn't match
  `source[index']`),
  then consumes optional whitespace again

  A syntax error is also signaled (and the function returns early) if the
  input is exhausted either before the character or after the trailing
  whitespace.
*/
void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
    // No need for error check or bounds check before parseOptionalWhitespace()
    parseOptionalWhitespace();
    // Establish precondition
    // (returns here, with an error, if only whitespace remained)
    CHECK_BOUNDS(errorCode);
    parseToken(c, errorCode);
    parseOptionalWhitespace();
    // Guarantee postcondition
    // (input that ends right after the character is reported as an error)
    CHECK_BOUNDS(errorCode);
}
758 | | |
759 | | /* |
760 | | Consumes a possibly-empty sequence of name-chars. Appends to `str` |
761 | | and returns `str`. |
762 | | */ |
763 | 222k | UnicodeString Parser::parseNameChars(UnicodeString& str, UErrorCode& errorCode) { |
764 | 222k | if (U_FAILURE(errorCode)) { |
765 | 0 | return {}; |
766 | 0 | } |
767 | | |
768 | 19.7M | while (isNameChar(peek())) { |
769 | 19.5M | UChar32 c = peek(); |
770 | 19.5M | str += c; |
771 | 19.5M | normalizedInput += c; |
772 | 19.5M | next(); |
773 | 19.5M | if (!inBounds()) { |
774 | 824 | ERROR(errorCode); |
775 | 824 | break; |
776 | 824 | } |
777 | 19.5M | } |
778 | | |
779 | 222k | return str; |
780 | 222k | } |
781 | | |
782 | | /* |
783 | | Consumes a non-empty sequence of `name-char`s, the first of which is |
784 | | also a `name-start`. |
785 | | that begins with a character `start` such that `isNameStart(start)`. |
786 | | |
787 | | Returns this sequence. |
788 | | |
789 | | (Matches the `name` nonterminal in the grammar.) |
790 | | */ |
791 | 359k | UnicodeString Parser::parseName(UErrorCode& errorCode) { |
792 | 359k | UnicodeString name; |
793 | | |
794 | 359k | U_ASSERT(inBounds()); |
795 | | |
796 | 359k | if (!(isNameStart(peek()) || isBidiControl(peek()))) { |
797 | 289k | ERROR(errorCode); |
798 | 289k | return name; |
799 | 289k | } |
800 | | |
801 | | // name = [bidi] name-start *name-char [bidi] |
802 | | |
803 | | // [bidi] |
804 | 70.8k | parseOptionalBidi(); |
805 | | |
806 | | // name-start *name-char |
807 | 70.8k | parseNameChars(name, errorCode); |
808 | | |
809 | | // [bidi] |
810 | 70.8k | parseOptionalBidi(); |
811 | | |
812 | 70.8k | return name; |
813 | 359k | } |
814 | | |
815 | | /* |
816 | | Consumes a '$' followed by a `name`, returning a VariableName |
817 | | with `name` as its name |
818 | | |
819 | | (Matches the `variable` nonterminal in the grammar.) |
820 | | */ |
821 | 180k | VariableName Parser::parseVariableName(UErrorCode& errorCode) { |
822 | 180k | VariableName result; |
823 | | |
824 | 180k | U_ASSERT(inBounds()); |
825 | | |
826 | 180k | parseToken(DOLLAR, errorCode); |
827 | 180k | if (!inBounds()) { |
828 | 58 | ERROR(errorCode); |
829 | 58 | return result; |
830 | 58 | } |
831 | 180k | return VariableName(parseName(errorCode)); |
832 | 180k | } |
833 | | |
834 | | /* |
835 | | Corresponds to the `identifier` nonterminal in the grammar |
836 | | */ |
837 | 98.7k | UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { |
838 | 98.7k | U_ASSERT(inBounds()); |
839 | | |
840 | 98.7k | UnicodeString result; |
841 | | // The following is a hack to get around ambiguity in the grammar: |
842 | | // identifier -> namespace ":" name |
843 | | // vs. |
844 | | // identifier -> name |
845 | | // can't be distinguished without arbitrary lookahead. |
846 | | // Instead, we treat the production as: |
847 | | // identifier -> namespace *(":"name) |
848 | | // and then check for multiple colons. |
849 | | |
850 | | // Parse namespace |
851 | 98.7k | result += parseName(errorCode); |
852 | 98.7k | int32_t firstColon = -1; |
853 | 179k | while (inBounds() && peek() == COLON) { |
854 | | // Parse ':' separator |
855 | 80.4k | if (firstColon == -1) { |
856 | 46.7k | firstColon = index; |
857 | 46.7k | } |
858 | 80.4k | parseToken(COLON, errorCode); |
859 | 80.4k | result += COLON; |
860 | | // Check for message ending with something like "foo:" |
861 | 80.4k | if (!inBounds()) { |
862 | 58 | ERROR(errorCode); |
863 | 80.4k | } else { |
864 | | // Parse name part |
865 | 80.4k | result += parseName(errorCode); |
866 | 80.4k | } |
867 | 80.4k | } |
868 | | |
869 | | // If there's at least one ':', scan from the first ':' |
870 | | // to the end of the name to check for multiple ':'s |
871 | 98.7k | if (firstColon != -1) { |
872 | 7.62M | for (int32_t i = firstColon + 1; i < result.length(); i++) { |
873 | 7.57M | if (result[i] == COLON) { |
874 | 130 | ERROR_AT(errorCode, i); |
875 | 130 | return {}; |
876 | 130 | } |
877 | 7.57M | } |
878 | 46.7k | } |
879 | | |
880 | 98.6k | return result; |
881 | 98.7k | } |
882 | | |
883 | | /* |
884 | | Consumes a reference to a function, matching the ": identifier" |
885 | | in the `function` nonterminal in the grammar. |
886 | | |
887 | | Returns the function name. |
888 | | */ |
889 | 53.8k | FunctionName Parser::parseFunction(UErrorCode& errorCode) { |
890 | 53.8k | U_ASSERT(inBounds()); |
891 | 53.8k | if (!isFunctionStart(peek())) { |
892 | 0 | ERROR(errorCode); |
893 | 0 | return FunctionName(); |
894 | 0 | } |
895 | | |
896 | 53.8k | normalizedInput += peek(); |
897 | 53.8k | next(); // Consume the function start character |
898 | 53.8k | if (!inBounds()) { |
899 | 34 | ERROR(errorCode); |
900 | 34 | return FunctionName(); |
901 | 34 | } |
902 | 53.7k | return parseIdentifier(errorCode); |
903 | 53.8k | } |
904 | | |
905 | | |
906 | | /* |
907 | | Precondition: peek() == BACKSLASH |
908 | | |
909 | | Consume an escaped character. |
910 | | Corresponds to `escaped-char` in the grammar. |
911 | | |
912 | | No postcondition (a message can end with an escaped char) |
913 | | */ |
914 | 8.52k | UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { |
915 | 8.52k | U_ASSERT(inBounds()); |
916 | 8.52k | U_ASSERT(peek() == BACKSLASH); |
917 | 8.52k | normalizedInput += BACKSLASH; |
918 | 8.52k | next(); // Skip the initial backslash |
919 | 8.52k | UnicodeString str; |
920 | 8.52k | if (inBounds()) { |
921 | | // Expect a '{', '|' or '}' |
922 | 8.50k | switch (peek()) { |
923 | 1.07k | case LEFT_CURLY_BRACE: |
924 | 1.75k | case RIGHT_CURLY_BRACE: |
925 | 6.83k | case PIPE: |
926 | 7.42k | case BACKSLASH: { |
927 | | /* Append to the output string */ |
928 | 7.42k | str += peek(); |
929 | | /* Update normalizedInput */ |
930 | 7.42k | normalizedInput += peek(); |
931 | | /* Consume the character */ |
932 | 7.42k | next(); |
933 | 7.42k | return str; |
934 | 6.83k | } |
935 | 1.07k | default: { |
936 | | // No other characters are allowed here |
937 | 1.07k | break; |
938 | 6.83k | } |
939 | 8.50k | } |
940 | 8.50k | } |
941 | | // If control reaches here, there was an error |
942 | 1.09k | ERROR(errorCode); |
943 | 1.09k | return str; |
944 | 8.52k | } |
945 | | |
946 | | |
947 | | /* |
948 | | Consume and return a quoted literal, matching the `literal` nonterminal in the grammar. |
949 | | */ |
950 | 4.76k | Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) { |
951 | 4.76k | bool error = false; |
952 | | |
953 | 4.76k | UnicodeString contents; |
954 | 4.76k | if (U_SUCCESS(errorCode)) { |
955 | | // Parse the opening '|' |
956 | 4.76k | parseToken(PIPE, errorCode); |
957 | 4.76k | if (!inBounds()) { |
958 | 16 | ERROR(errorCode); |
959 | 16 | error = true; |
960 | 4.75k | } else { |
961 | | // Parse the contents |
962 | 4.75k | bool done = false; |
963 | 131k | while (!done) { |
964 | 126k | if (peek() == BACKSLASH) { |
965 | 7.13k | contents += parseEscapeSequence(errorCode); |
966 | 119k | } else if (isQuotedChar(peek())) { |
967 | 114k | contents += peek(); |
968 | | // Handle cases like: |
969 | | // |}{| -- we want to escape everywhere that |
970 | | // can be escaped, to make round-trip checking |
971 | | // easier -- so this case normalizes to |
972 | | // |\}\{| |
973 | 114k | if (isEscapableChar(peek())) { |
974 | 5.71k | normalizedInput += BACKSLASH; |
975 | 5.71k | } |
976 | 114k | normalizedInput += peek(); |
977 | 114k | next(); // Consume this character |
978 | 114k | maybeAdvanceLine(); |
979 | 114k | } else { |
980 | | // Assume the sequence of literal characters ends here |
981 | 4.67k | done = true; |
982 | 4.67k | } |
983 | 126k | if (!inBounds()) { |
984 | 82 | ERROR(errorCode); |
985 | 82 | error = true; |
986 | 82 | break; |
987 | 82 | } |
988 | 126k | } |
989 | 4.75k | } |
990 | 4.76k | } |
991 | | |
992 | 4.76k | if (error) { |
993 | 98 | return {}; |
994 | 98 | } |
995 | | |
996 | | // Parse the closing '|' |
997 | 4.67k | parseToken(PIPE, errorCode); |
998 | | |
999 | 4.67k | return Literal(true, contents); |
1000 | 4.76k | } |
1001 | | |
1002 | | // Parse (1*DIGIT) |
1003 | 0 | UnicodeString Parser::parseDigits(UErrorCode& errorCode) { |
1004 | 0 | if (U_FAILURE(errorCode)) { |
1005 | 0 | return {}; |
1006 | 0 | } |
1007 | | |
1008 | 0 | U_ASSERT(isDigit(peek())); |
1009 | |
|
1010 | 0 | UnicodeString contents; |
1011 | 0 | do { |
1012 | 0 | contents += peek(); |
1013 | 0 | normalizedInput += peek(); |
1014 | 0 | next(); |
1015 | 0 | if (!inBounds()) { |
1016 | 0 | ERROR(errorCode); |
1017 | 0 | return {}; |
1018 | 0 | } |
1019 | 0 | } while (isDigit(peek())); |
1020 | | |
1021 | 0 | return contents; |
1022 | 0 | } |
1023 | | /* |
1024 | | Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar. |
1025 | | */ |
1026 | 4.05M | Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { |
1027 | 4.05M | if (U_FAILURE(errorCode)) { |
1028 | 0 | return {}; |
1029 | 0 | } |
1030 | | // unquoted-literal = 1*name-char |
1031 | | |
1032 | 4.05M | if (!(isNameChar(peek()))) { |
1033 | 3.90M | ERROR(errorCode); |
1034 | 3.90M | return {}; |
1035 | 3.90M | } |
1036 | | |
1037 | 151k | UnicodeString contents; |
1038 | 151k | parseNameChars(contents, errorCode); |
1039 | 151k | return Literal(false, contents); |
1040 | 4.05M | } |
1041 | | |
1042 | | /* |
1043 | | Consume and return a literal, matching the `literal` nonterminal in the grammar. |
1044 | | */ |
1045 | 4.05M | Literal Parser::parseLiteral(UErrorCode& errorCode) { |
1046 | 4.05M | Literal result; |
1047 | 4.05M | if (!inBounds()) { |
1048 | 276 | ERROR(errorCode); |
1049 | 4.05M | } else { |
1050 | 4.05M | if (peek() == PIPE) { |
1051 | 4.76k | result = parseQuotedLiteral(errorCode); |
1052 | 4.05M | } else { |
1053 | 4.05M | result = parseUnquotedLiteral(errorCode); |
1054 | 4.05M | } |
1055 | | // Guarantee postcondition |
1056 | 4.05M | if (!inBounds()) { |
1057 | 492 | ERROR(errorCode); |
1058 | 492 | } |
1059 | 4.05M | } |
1060 | | |
1061 | 4.05M | return result; |
1062 | 4.05M | } |
1063 | | |
1064 | | /* |
1065 | | Consume a @name-value pair, matching the `attribute` nonterminal in the grammar. |
1066 | | |
1067 | | Adds the option to `options` |
1068 | | */ |
1069 | | template<class T> |
1070 | 23.6k | void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { |
1071 | 23.6k | U_ASSERT(inBounds()); |
1072 | | |
1073 | 23.6k | U_ASSERT(peek() == AT); |
1074 | | // Consume the '@' |
1075 | 23.6k | parseToken(AT, errorCode); |
1076 | | |
1077 | | // Parse LHS |
1078 | 23.6k | UnicodeString lhs = parseIdentifier(errorCode); |
1079 | | |
1080 | | // Prepare to "backtrack" to resolve ambiguity |
1081 | | // about whether whitespace precedes another |
1082 | | // attribute, or the '=' sign |
1083 | 23.6k | int32_t savedIndex = index; |
1084 | 23.6k | parseOptionalWhitespace(); |
1085 | | |
1086 | 23.6k | Operand rand; |
1087 | 23.6k | if (peek() == EQUALS) { |
1088 | | // Parse '=' |
1089 | 8.47k | parseTokenWithWhitespace(EQUALS, errorCode); |
1090 | | |
1091 | 8.47k | UnicodeString rhsStr; |
1092 | | // Parse RHS, which must be a literal |
1093 | | // attribute = "@" identifier [o "=" o literal] |
1094 | 8.47k | rand = Operand(parseLiteral(errorCode)); |
1095 | 15.1k | } else { |
1096 | | // attribute -> "@" identifier [[s] "=" [s]] |
1097 | | // Use null operand, which `rand` is already set to |
1098 | | // "Backtrack" by restoring the whitespace (if there was any) |
1099 | 15.1k | index = savedIndex; |
1100 | 15.1k | } |
1101 | | |
1102 | 23.6k | attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode); |
1103 | 23.6k | } void icu_79::message2::Parser::parseAttribute<icu_79::message2::data_model::Expression::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Expression::Builder>&, UErrorCode&) Line | Count | Source | 1070 | 14.2k | void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { | 1071 | 14.2k | U_ASSERT(inBounds()); | 1072 | | | 1073 | 14.2k | U_ASSERT(peek() == AT); | 1074 | | // Consume the '@' | 1075 | 14.2k | parseToken(AT, errorCode); | 1076 | | | 1077 | | // Parse LHS | 1078 | 14.2k | UnicodeString lhs = parseIdentifier(errorCode); | 1079 | | | 1080 | | // Prepare to "backtrack" to resolve ambiguity | 1081 | | // about whether whitespace precedes another | 1082 | | // attribute, or the '=' sign | 1083 | 14.2k | int32_t savedIndex = index; | 1084 | 14.2k | parseOptionalWhitespace(); | 1085 | | | 1086 | 14.2k | Operand rand; | 1087 | 14.2k | if (peek() == EQUALS) { | 1088 | | // Parse '=' | 1089 | 1.55k | parseTokenWithWhitespace(EQUALS, errorCode); | 1090 | | | 1091 | 1.55k | UnicodeString rhsStr; | 1092 | | // Parse RHS, which must be a literal | 1093 | | // attribute = "@" identifier [o "=" o literal] | 1094 | 1.55k | rand = Operand(parseLiteral(errorCode)); | 1095 | 12.7k | } else { | 1096 | | // attribute -> "@" identifier [[s] "=" [s]] | 1097 | | // Use null operand, which `rand` is already set to | 1098 | | // "Backtrack" by restoring the whitespace (if there was any) | 1099 | 12.7k | index = savedIndex; | 1100 | 12.7k | } | 1101 | | | 1102 | 14.2k | attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode); | 1103 | 14.2k | } |
void icu_79::message2::Parser::parseAttribute<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&) Line | Count | Source | 1070 | 9.37k | void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { | 1071 | 9.37k | U_ASSERT(inBounds()); | 1072 | | | 1073 | 9.37k | U_ASSERT(peek() == AT); | 1074 | | // Consume the '@' | 1075 | 9.37k | parseToken(AT, errorCode); | 1076 | | | 1077 | | // Parse LHS | 1078 | 9.37k | UnicodeString lhs = parseIdentifier(errorCode); | 1079 | | | 1080 | | // Prepare to "backtrack" to resolve ambiguity | 1081 | | // about whether whitespace precedes another | 1082 | | // attribute, or the '=' sign | 1083 | 9.37k | int32_t savedIndex = index; | 1084 | 9.37k | parseOptionalWhitespace(); | 1085 | | | 1086 | 9.37k | Operand rand; | 1087 | 9.37k | if (peek() == EQUALS) { | 1088 | | // Parse '=' | 1089 | 6.92k | parseTokenWithWhitespace(EQUALS, errorCode); | 1090 | | | 1091 | 6.92k | UnicodeString rhsStr; | 1092 | | // Parse RHS, which must be a literal | 1093 | | // attribute = "@" identifier [o "=" o literal] | 1094 | 6.92k | rand = Operand(parseLiteral(errorCode)); | 1095 | 6.92k | } else { | 1096 | | // attribute -> "@" identifier [[s] "=" [s]] | 1097 | | // Use null operand, which `rand` is already set to | 1098 | | // "Backtrack" by restoring the whitespace (if there was any) | 1099 | 2.45k | index = savedIndex; | 1100 | 2.45k | } | 1101 | | | 1102 | 9.37k | attrAdder.addAttribute(lhs, std::move(Operand(rand)), errorCode); | 1103 | 9.37k | } |
|
1104 | | |
1105 | | /* |
1106 | | Consume a name-value pair, matching the `option` nonterminal in the grammar. |
1107 | | |
1108 | | Adds the option to `optionList` |
1109 | | */ |
1110 | | template<class T> |
1111 | 6.11k | void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) { |
1112 | 6.11k | U_ASSERT(inBounds()); |
1113 | | |
1114 | | // Parse LHS |
1115 | 6.11k | UnicodeString lhs = parseIdentifier(errorCode); |
1116 | | |
1117 | | // Parse '=' |
1118 | 6.11k | parseTokenWithWhitespace(EQUALS, errorCode); |
1119 | | |
1120 | 6.11k | UnicodeString rhsStr; |
1121 | 6.11k | Operand rand; |
1122 | | // Parse RHS, which is either a literal or variable |
1123 | 6.11k | switch (peek()) { |
1124 | 1.62k | case DOLLAR: { |
1125 | 1.62k | rand = Operand(parseVariableName(errorCode)); |
1126 | 1.62k | break; |
1127 | 0 | } |
1128 | 4.49k | default: { |
1129 | | // Must be a literal |
1130 | 4.49k | rand = Operand(parseLiteral(errorCode)); |
1131 | 4.49k | break; |
1132 | 0 | } |
1133 | 6.11k | } |
1134 | 6.11k | U_ASSERT(!rand.isNull()); |
1135 | | |
1136 | | // Finally, add the key=value mapping |
1137 | | // Use a local error code, check for duplicate option error and |
1138 | | // record it as with other errors |
1139 | 6.11k | UErrorCode status = U_ZERO_ERROR; |
1140 | 6.11k | addOption.addOption(lhs, std::move(rand), status); |
1141 | 6.11k | if (U_FAILURE(status)) { |
1142 | 1.86k | U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR); |
1143 | 1.86k | errors.setDuplicateOptionName(errorCode); |
1144 | 1.86k | } |
1145 | 6.11k | } void icu_79::message2::Parser::parseOption<icu_79::message2::data_model::Operator::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Operator::Builder>&, UErrorCode&) Line | Count | Source | 1111 | 1.68k | void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) { | 1112 | 1.68k | U_ASSERT(inBounds()); | 1113 | | | 1114 | | // Parse LHS | 1115 | 1.68k | UnicodeString lhs = parseIdentifier(errorCode); | 1116 | | | 1117 | | // Parse '=' | 1118 | 1.68k | parseTokenWithWhitespace(EQUALS, errorCode); | 1119 | | | 1120 | 1.68k | UnicodeString rhsStr; | 1121 | 1.68k | Operand rand; | 1122 | | // Parse RHS, which is either a literal or variable | 1123 | 1.68k | switch (peek()) { | 1124 | 891 | case DOLLAR: { | 1125 | 891 | rand = Operand(parseVariableName(errorCode)); | 1126 | 891 | break; | 1127 | 0 | } | 1128 | 795 | default: { | 1129 | | // Must be a literal | 1130 | 795 | rand = Operand(parseLiteral(errorCode)); | 1131 | 795 | break; | 1132 | 0 | } | 1133 | 1.68k | } | 1134 | 1.68k | U_ASSERT(!rand.isNull()); | 1135 | | | 1136 | | // Finally, add the key=value mapping | 1137 | | // Use a local error code, check for duplicate option error and | 1138 | | // record it as with other errors | 1139 | 1.68k | UErrorCode status = U_ZERO_ERROR; | 1140 | 1.68k | addOption.addOption(lhs, std::move(rand), status); | 1141 | 1.68k | if (U_FAILURE(status)) { | 1142 | 666 | U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR); | 1143 | 666 | errors.setDuplicateOptionName(errorCode); | 1144 | 666 | } | 1145 | 1.68k | } |
void icu_79::message2::Parser::parseOption<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&) Line | Count | Source | 1111 | 4.43k | void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) { | 1112 | 4.43k | U_ASSERT(inBounds()); | 1113 | | | 1114 | | // Parse LHS | 1115 | 4.43k | UnicodeString lhs = parseIdentifier(errorCode); | 1116 | | | 1117 | | // Parse '=' | 1118 | 4.43k | parseTokenWithWhitespace(EQUALS, errorCode); | 1119 | | | 1120 | 4.43k | UnicodeString rhsStr; | 1121 | 4.43k | Operand rand; | 1122 | | // Parse RHS, which is either a literal or variable | 1123 | 4.43k | switch (peek()) { | 1124 | 730 | case DOLLAR: { | 1125 | 730 | rand = Operand(parseVariableName(errorCode)); | 1126 | 730 | break; | 1127 | 0 | } | 1128 | 3.70k | default: { | 1129 | | // Must be a literal | 1130 | 3.70k | rand = Operand(parseLiteral(errorCode)); | 1131 | 3.70k | break; | 1132 | 0 | } | 1133 | 4.43k | } | 1134 | 4.43k | U_ASSERT(!rand.isNull()); | 1135 | | | 1136 | | // Finally, add the key=value mapping | 1137 | | // Use a local error code, check for duplicate option error and | 1138 | | // record it as with other errors | 1139 | 4.43k | UErrorCode status = U_ZERO_ERROR; | 1140 | 4.43k | addOption.addOption(lhs, std::move(rand), status); | 1141 | 4.43k | if (U_FAILURE(status)) { | 1142 | 1.20k | U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR); | 1143 | 1.20k | errors.setDuplicateOptionName(errorCode); | 1144 | 1.20k | } | 1145 | 4.43k | } |
|
1146 | | |
1147 | | /* |
1148 | | Note: there are multiple overloads of parseOptions() for parsing |
1149 | | options within markup, vs. within an expression, vs. parsing |
1150 | | attributes. This should be refactored. TODO |
1151 | | */ |
1152 | | |
1153 | | /* |
1154 | | Consume optional whitespace followed by a sequence of options |
1155 | | (possibly empty), separated by whitespace |
1156 | | */ |
1157 | | template <class T> |
1158 | 63.7k | void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) { |
1159 | | // Early exit if out of bounds -- no more work is possible |
1160 | 63.7k | CHECK_BOUNDS(errorCode); |
1161 | | |
1162 | | /* |
1163 | | Arbitrary lookahead is required to parse option lists. To see why, consider |
1164 | | these rules from the grammar: |
1165 | | |
1166 | | expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
1167 | | annotation = (function *(s option)) / reserved |
1168 | | |
1169 | | And this example: |
1170 | | {:foo } |
1171 | | |
1172 | | Derivation: |
1173 | | expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
1174 | | -> "{" [s] annotation [s] "}" |
1175 | | -> "{" [s] ((function *(s option)) / reserved) [s] "}" |
1176 | | -> "{" [s] function *(s option) [s] "}" |
1177 | | |
1178 | | In this example, knowing whether to expect a '}' or the start of another option |
1179 | | after the whitespace would require arbitrary lookahead -- in other words, which |
1180 | | rule should we apply? |
1181 | | *(s option) -> s option *(s option) |
1182 | | or |
1183 | | *(s option) -> |
1184 | | |
1185 | | The same would apply to the example {:foo k=v } (note the trailing space after "v"). |
1186 | | |
1187 | | This is addressed using a form of backtracking and (to make the backtracking easier |
1188 | | to apply) a slight refactoring to the grammar. |
1189 | | |
1190 | | This code is written as if the grammar is: |
1191 | | expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}" |
1192 | | annotation = (function *(s option) [s]) / (reserved [s]) |
1193 | | |
1194 | | Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning |
1195 | | that `parseExpression()` can safely require a '}' after `parseOptions()` finishes. |
1196 | | |
1197 | | Note that when "backtracking" really just means early exit, since only whitespace |
1198 | | is involved and there's no state to save. |
1199 | | |
1200 | | There is a separate but similar ambiguity as to whether the space precedes |
1201 | | an option or an attribute. |
1202 | | */ |
1203 | | |
1204 | 69.7k | while(true) { |
1205 | | // If the next character is not whitespace, that means we've already |
1206 | | // parsed the entire options list (which may have been empty) and there's |
1207 | | // no trailing whitespace. In that case, exit. |
1208 | 69.7k | if (!isWhitespace(peek())) { |
1209 | 39.5k | break; |
1210 | 39.5k | } |
1211 | 30.1k | int32_t firstWhitespace = index; |
1212 | | |
1213 | | // In any case other than an empty options list, there must be at least |
1214 | | // one whitespace character. |
1215 | 30.1k | parseRequiredWhitespace(errorCode); |
1216 | | // Restore precondition |
1217 | 30.1k | CHECK_BOUNDS(errorCode); |
1218 | | |
1219 | | // If a name character follows, then at least one more option remains |
1220 | | // in the list. |
1221 | | // Otherwise, we've consumed all the options and any trailing whitespace, |
1222 | | // and can exit. |
1223 | | // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, |
1224 | | // so we back out to [s]. |
1225 | 30.1k | if (!isNameStart(peek())) { |
1226 | | // We've consumed all the options (meaning that either we consumed non-empty |
1227 | | // whitespace, or consumed at least one option.) |
1228 | | // Done. |
1229 | | // Remove the required whitespace from normalizedInput |
1230 | 24.0k | normalizedInput.truncate(normalizedInput.length() - 1); |
1231 | | // "Backtrack" so as to leave the optional whitespace there |
1232 | | // when parsing attributes |
1233 | 24.0k | index = firstWhitespace; |
1234 | 24.0k | break; |
1235 | 24.0k | } |
1236 | 6.11k | parseOption(addOption, errorCode); |
1237 | 6.11k | } |
1238 | 63.6k | } void icu_79::message2::Parser::parseOptions<icu_79::message2::data_model::Operator::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Operator::Builder>&, UErrorCode&) Line | Count | Source | 1158 | 53.8k | void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) { | 1159 | | // Early exit if out of bounds -- no more work is possible | 1160 | 53.8k | CHECK_BOUNDS(errorCode); | 1161 | | | 1162 | | /* | 1163 | | Arbitrary lookahead is required to parse option lists. To see why, consider | 1164 | | these rules from the grammar: | 1165 | | | 1166 | | expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" | 1167 | | annotation = (function *(s option)) / reserved | 1168 | | | 1169 | | And this example: | 1170 | | {:foo } | 1171 | | | 1172 | | Derivation: | 1173 | | expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" | 1174 | | -> "{" [s] annotation [s] "}" | 1175 | | -> "{" [s] ((function *(s option)) / reserved) [s] "}" | 1176 | | -> "{" [s] function *(s option) [s] "}" | 1177 | | | 1178 | | In this example, knowing whether to expect a '}' or the start of another option | 1179 | | after the whitespace would require arbitrary lookahead -- in other words, which | 1180 | | rule should we apply? | 1181 | | *(s option) -> s option *(s option) | 1182 | | or | 1183 | | *(s option) -> | 1184 | | | 1185 | | The same would apply to the example {:foo k=v } (note the trailing space after "v"). | 1186 | | | 1187 | | This is addressed using a form of backtracking and (to make the backtracking easier | 1188 | | to apply) a slight refactoring to the grammar. 
| 1189 | | | 1190 | | This code is written as if the grammar is: | 1191 | | expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}" | 1192 | | annotation = (function *(s option) [s]) / (reserved [s]) | 1193 | | | 1194 | | Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning | 1195 | | that `parseExpression()` can safely require a '}' after `parseOptions()` finishes. | 1196 | | | 1197 | | Note that when "backtracking" really just means early exit, since only whitespace | 1198 | | is involved and there's no state to save. | 1199 | | | 1200 | | There is a separate but similar ambiguity as to whether the space precedes | 1201 | | an option or an attribute. | 1202 | | */ | 1203 | | | 1204 | 55.3k | while(true) { | 1205 | | // If the next character is not whitespace, that means we've already | 1206 | | // parsed the entire options list (which may have been empty) and there's | 1207 | | // no trailing whitespace. In that case, exit. | 1208 | 55.3k | if (!isWhitespace(peek())) { | 1209 | 38.9k | break; | 1210 | 38.9k | } | 1211 | 16.3k | int32_t firstWhitespace = index; | 1212 | | | 1213 | | // In any case other than an empty options list, there must be at least | 1214 | | // one whitespace character. | 1215 | 16.3k | parseRequiredWhitespace(errorCode); | 1216 | | // Restore precondition | 1217 | 16.3k | CHECK_BOUNDS(errorCode); | 1218 | | | 1219 | | // If a name character follows, then at least one more option remains | 1220 | | // in the list. | 1221 | | // Otherwise, we've consumed all the options and any trailing whitespace, | 1222 | | // and can exit. | 1223 | | // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, | 1224 | | // so we back out to [s]. | 1225 | 16.3k | if (!isNameStart(peek())) { | 1226 | | // We've consumed all the options (meaning that either we consumed non-empty | 1227 | | // whitespace, or consumed at least one option.) | 1228 | | // Done. 
| 1229 | | // Remove the required whitespace from normalizedInput | 1230 | 14.6k | normalizedInput.truncate(normalizedInput.length() - 1); | 1231 | | // "Backtrack" so as to leave the optional whitespace there | 1232 | | // when parsing attributes | 1233 | 14.6k | index = firstWhitespace; | 1234 | 14.6k | break; | 1235 | 14.6k | } | 1236 | 1.68k | parseOption(addOption, errorCode); | 1237 | 1.68k | } | 1238 | 53.6k | } |
void icu_79::message2::Parser::parseOptions<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::OptionAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&) Line | Count | Source | 1158 | 9.95k | void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) { | 1159 | | // Early exit if out of bounds -- no more work is possible | 1160 | 9.95k | CHECK_BOUNDS(errorCode); | 1161 | | | 1162 | | /* | 1163 | | Arbitrary lookahead is required to parse option lists. To see why, consider | 1164 | | these rules from the grammar: | 1165 | | | 1166 | | expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" | 1167 | | annotation = (function *(s option)) / reserved | 1168 | | | 1169 | | And this example: | 1170 | | {:foo } | 1171 | | | 1172 | | Derivation: | 1173 | | expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" | 1174 | | -> "{" [s] annotation [s] "}" | 1175 | | -> "{" [s] ((function *(s option)) / reserved) [s] "}" | 1176 | | -> "{" [s] function *(s option) [s] "}" | 1177 | | | 1178 | | In this example, knowing whether to expect a '}' or the start of another option | 1179 | | after the whitespace would require arbitrary lookahead -- in other words, which | 1180 | | rule should we apply? | 1181 | | *(s option) -> s option *(s option) | 1182 | | or | 1183 | | *(s option) -> | 1184 | | | 1185 | | The same would apply to the example {:foo k=v } (note the trailing space after "v"). | 1186 | | | 1187 | | This is addressed using a form of backtracking and (to make the backtracking easier | 1188 | | to apply) a slight refactoring to the grammar. 
| 1189 | | | 1190 | | This code is written as if the grammar is: | 1191 | | expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}" | 1192 | | annotation = (function *(s option) [s]) / (reserved [s]) | 1193 | | | 1194 | | Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning | 1195 | | that `parseExpression()` can safely require a '}' after `parseOptions()` finishes. | 1196 | | | 1197 | | Note that when "backtracking" really just means early exit, since only whitespace | 1198 | | is involved and there's no state to save. | 1199 | | | 1200 | | There is a separate but similar ambiguity as to whether the space precedes | 1201 | | an option or an attribute. | 1202 | | */ | 1203 | | | 1204 | 14.3k | while(true) { | 1205 | | // If the next character is not whitespace, that means we've already | 1206 | | // parsed the entire options list (which may have been empty) and there's | 1207 | | // no trailing whitespace. In that case, exit. | 1208 | 14.3k | if (!isWhitespace(peek())) { | 1209 | 610 | break; | 1210 | 610 | } | 1211 | 13.7k | int32_t firstWhitespace = index; | 1212 | | | 1213 | | // In any case other than an empty options list, there must be at least | 1214 | | // one whitespace character. | 1215 | 13.7k | parseRequiredWhitespace(errorCode); | 1216 | | // Restore precondition | 1217 | 13.7k | CHECK_BOUNDS(errorCode); | 1218 | | | 1219 | | // If a name character follows, then at least one more option remains | 1220 | | // in the list. | 1221 | | // Otherwise, we've consumed all the options and any trailing whitespace, | 1222 | | // and can exit. | 1223 | | // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, | 1224 | | // so we back out to [s]. | 1225 | 13.7k | if (!isNameStart(peek())) { | 1226 | | // We've consumed all the options (meaning that either we consumed non-empty | 1227 | | // whitespace, or consumed at least one option.) | 1228 | | // Done. 
| 1229 | | // Remove the required whitespace from normalizedInput | 1230 | 9.33k | normalizedInput.truncate(normalizedInput.length() - 1); | 1231 | | // "Backtrack" so as to leave the optional whitespace there | 1232 | | // when parsing attributes | 1233 | 9.33k | index = firstWhitespace; | 1234 | 9.33k | break; | 1235 | 9.33k | } | 1236 | 4.43k | parseOption(addOption, errorCode); | 1237 | 4.43k | } | 1238 | 9.95k | } |
|
1239 | | |
1240 | | /* |
1241 | | Consume optional whitespace followed by a sequence of attributes |
1242 | | (possibly empty), separated by whitespace |
1243 | | */ |
1244 | | template<class T> |
1245 | 75.5k | void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { |
1246 | | |
1247 | | // Early exit if out of bounds -- no more work is possible |
1248 | 75.5k | if (!inBounds()) { |
1249 | 747 | ERROR(errorCode); |
1250 | 747 | return; |
1251 | 747 | } |
1252 | | |
1253 | | /* |
1254 | | Arbitrary lookahead is required to parse attribute lists, similarly to option lists. |
1255 | | (See comment in parseOptions()). |
1256 | | */ |
1257 | | |
1258 | 98.4k | while(true) { |
1259 | | // If the next character is not whitespace, that means we've already |
1260 | | // parsed the entire attributes list (which may have been empty) and there's |
1261 | | // no trailing whitespace. In that case, exit. |
1262 | 98.4k | if (!isWhitespace(peek())) { |
1263 | 58.8k | break; |
1264 | 58.8k | } |
1265 | | |
1266 | | // In any case other than an empty attributes list, there must be at least |
1267 | | // one whitespace character. |
1268 | 39.5k | parseRequiredWhitespace(errorCode); |
1269 | | // Restore precondition |
1270 | 39.5k | if (!inBounds()) { |
1271 | 29 | ERROR(errorCode); |
1272 | 29 | break; |
1273 | 29 | } |
1274 | | |
1275 | | // If an '@' follows, then at least one more attribute remains |
1276 | | // in the list. |
1277 | | // Otherwise, we've consumed all the attributes and any trailing whitespace, |
1278 | | // and can exit. |
1279 | | // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, |
1280 | | // so we back out to [s]. |
1281 | 39.5k | if (peek() != AT) { |
1282 | | // We've consumed all the attributes (meaning that either we consumed non-empty |
1283 | | // whitespace, or consumed at least one attribute.) |
1284 | | // Done. |
1285 | | // Remove the whitespace from normalizedInput |
1286 | 15.8k | normalizedInput.truncate(normalizedInput.length() - 1); |
1287 | 15.8k | break; |
1288 | 15.8k | } |
1289 | 23.6k | parseAttribute(attrAdder, errorCode); |
1290 | 23.6k | } |
1291 | 74.7k | } void icu_79::message2::Parser::parseAttributes<icu_79::message2::data_model::Expression::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Expression::Builder>&, UErrorCode&) Line | Count | Source | 1245 | 66.1k | void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { | 1246 | | | 1247 | | // Early exit if out of bounds -- no more work is possible | 1248 | 66.1k | if (!inBounds()) { | 1249 | 747 | ERROR(errorCode); | 1250 | 747 | return; | 1251 | 747 | } | 1252 | | | 1253 | | /* | 1254 | | Arbitrary lookahead is required to parse attribute lists, similarly to option lists. | 1255 | | (See comment in parseOptions()). | 1256 | | */ | 1257 | | | 1258 | 79.6k | while(true) { | 1259 | | // If the next character is not whitespace, that means we've already | 1260 | | // parsed the entire attributes list (which may have been empty) and there's | 1261 | | // no trailing whitespace. In that case, exit. | 1262 | 79.6k | if (!isWhitespace(peek())) { | 1263 | 54.0k | break; | 1264 | 54.0k | } | 1265 | | | 1266 | | // In any case other than an empty attributes list, there must be at least | 1267 | | // one whitespace character. | 1268 | 25.6k | parseRequiredWhitespace(errorCode); | 1269 | | // Restore precondition | 1270 | 25.6k | if (!inBounds()) { | 1271 | 18 | ERROR(errorCode); | 1272 | 18 | break; | 1273 | 18 | } | 1274 | | | 1275 | | // If an '@' follows, then at least one more attribute remains | 1276 | | // in the list. | 1277 | | // Otherwise, we've consumed all the attributes and any trailing whitespace, | 1278 | | // and can exit. | 1279 | | // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, | 1280 | | // so we back out to [s]. | 1281 | 25.6k | if (peek() != AT) { | 1282 | | // We've consumed all the attributes (meaning that either we consumed non-empty | 1283 | | // whitespace, or consumed at least one attribute.) | 1284 | | // Done. 
| 1285 | | // Remove the whitespace from normalizedInput | 1286 | 11.3k | normalizedInput.truncate(normalizedInput.length() - 1); | 1287 | 11.3k | break; | 1288 | 11.3k | } | 1289 | 14.2k | parseAttribute(attrAdder, errorCode); | 1290 | 14.2k | } | 1291 | 65.3k | } |
void icu_79::message2::Parser::parseAttributes<icu_79::message2::data_model::Markup::Builder>(icu_79::message2::AttributeAdder<icu_79::message2::data_model::Markup::Builder>&, UErrorCode&) Line | Count | Source | 1245 | 9.39k | void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) { | 1246 | | | 1247 | | // Early exit if out of bounds -- no more work is possible | 1248 | 9.39k | if (!inBounds()) { | 1249 | 0 | ERROR(errorCode); | 1250 | 0 | return; | 1251 | 0 | } | 1252 | | | 1253 | | /* | 1254 | | Arbitrary lookahead is required to parse attribute lists, similarly to option lists. | 1255 | | (See comment in parseOptions()). | 1256 | | */ | 1257 | | | 1258 | 18.7k | while(true) { | 1259 | | // If the next character is not whitespace, that means we've already | 1260 | | // parsed the entire attributes list (which may have been empty) and there's | 1261 | | // no trailing whitespace. In that case, exit. | 1262 | 18.7k | if (!isWhitespace(peek())) { | 1263 | 4.86k | break; | 1264 | 4.86k | } | 1265 | | | 1266 | | // In any case other than an empty attributes list, there must be at least | 1267 | | // one whitespace character. | 1268 | 13.9k | parseRequiredWhitespace(errorCode); | 1269 | | // Restore precondition | 1270 | 13.9k | if (!inBounds()) { | 1271 | 11 | ERROR(errorCode); | 1272 | 11 | break; | 1273 | 11 | } | 1274 | | | 1275 | | // If an '@' follows, then at least one more attribute remains | 1276 | | // in the list. | 1277 | | // Otherwise, we've consumed all the attributes and any trailing whitespace, | 1278 | | // and can exit. | 1279 | | // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, | 1280 | | // so we back out to [s]. | 1281 | 13.9k | if (peek() != AT) { | 1282 | | // We've consumed all the attributes (meaning that either we consumed non-empty | 1283 | | // whitespace, or consumed at least one attribute.) | 1284 | | // Done. 
| 1285 | | // Remove the whitespace from normalizedInput | 1286 | 4.52k | normalizedInput.truncate(normalizedInput.length() - 1); | 1287 | 4.52k | break; | 1288 | 4.52k | } | 1289 | 9.37k | parseAttribute(attrAdder, errorCode); | 1290 | 9.37k | } | 1291 | 9.39k | } |
|
1292 | | |
1293 | | /* |
1294 | | Consume a function call, matching the `annotation` |
1295 | | nonterminal in the grammar |
1296 | | |
1297 | | Returns an `Operator` representing this (a reserved is a parse error) |
1298 | | */ |
1299 | 53.8k | Operator Parser::parseAnnotation(UErrorCode& status) { |
1300 | 53.8k | U_ASSERT(inBounds()); |
1301 | 53.8k | Operator::Builder ratorBuilder(status); |
1302 | 53.8k | if (U_FAILURE(status)) { |
1303 | 0 | return {}; |
1304 | 0 | } |
1305 | 53.8k | if (isFunctionStart(peek())) { |
1306 | | // Consume the function name |
1307 | 53.8k | FunctionName func = parseFunction(status); |
1308 | 53.8k | ratorBuilder.setFunctionName(std::move(func)); |
1309 | | |
1310 | 53.8k | OptionAdder<Operator::Builder> addOptions(ratorBuilder); |
1311 | | // Consume the options (which may be empty) |
1312 | 53.8k | parseOptions(addOptions, status); |
1313 | 53.8k | } else { |
1314 | 0 | ERROR(status); |
1315 | 0 | } |
1316 | 53.8k | return ratorBuilder.build(status); |
1317 | 53.8k | } |
1318 | | |
1319 | | /* |
1320 | | Consume a literal or variable (depending on `isVariable`), |
1321 | | followed by either required whitespace followed by an annotation, |
1322 | | or optional whitespace. |
1323 | | */ |
1324 | | void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable, |
1325 | | Expression::Builder& builder, |
1326 | 11.6k | UErrorCode& status) { |
1327 | 11.6k | CHECK_ERROR(status); |
1328 | | |
1329 | 11.6k | U_ASSERT(inBounds()); |
1330 | | |
1331 | 11.6k | Operand rand; |
1332 | 11.6k | if (isVariable) { |
1333 | 466 | rand = Operand(parseVariableName(status)); |
1334 | 11.1k | } else { |
1335 | 11.1k | rand = Operand(parseLiteral(status)); |
1336 | 11.1k | } |
1337 | | |
1338 | 11.6k | builder.setOperand(std::move(rand)); |
1339 | | |
1340 | | /* |
1341 | | Parsing a literal or variable with an optional annotation requires arbitrary lookahead. |
1342 | | To see why, consider this rule from the grammar: |
1343 | | |
1344 | | expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
1345 | | |
1346 | | And this example: |
1347 | | |
1348 | | {|foo| } |
1349 | | |
1350 | | Derivation: |
1351 | | expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}" |
1352 | | -> "{" [s] ((literal / variable) [s annotation]) [s] "}" |
1353 | | -> "{" [s] (literal [s annotation]) [s] "}" |
1354 | | |
1355 | | When reading the ' ' after the second '|', it's ambiguous whether that's the required |
1356 | | space before an annotation, or the optional space before the '}'. |
1357 | | |
1358 | | To make this ambiguity easier to handle, this code is based on the same grammar |
1359 | | refactoring for the `expression` nonterminal that `parseOptions()` relies on. See |
1360 | | the comment in `parseOptions()` for details. |
1361 | | */ |
1362 | | |
1363 | 11.6k | if (isWhitespace(peek())) { |
1364 | 3.24k | int32_t firstWhitespace = index; |
1365 | | |
1366 | | // If the next character is whitespace, either [s annotation] or [s] applies |
1367 | | // (the character is either the required space before an annotation, or optional |
1368 | | // trailing space after the literal or variable). It's still ambiguous which |
1369 | | // one does apply. |
1370 | 3.24k | parseOptionalWhitespace(); |
1371 | | // Restore precondition |
1372 | 3.24k | CHECK_BOUNDS(status); |
1373 | | |
1374 | | // This next check resolves the ambiguity between [s annotation] and [s] |
1375 | 3.23k | bool isSAnnotation = isAnnotationStart(peek()); |
1376 | | |
1377 | 3.23k | if (isSAnnotation) { |
1378 | 709 | normalizedInput += SPACE; |
1379 | 709 | } |
1380 | | |
1381 | 3.23k | if (isSAnnotation) { |
1382 | | // The previously consumed whitespace precedes an annotation |
1383 | 709 | builder.setOperator(parseAnnotation(status)); |
1384 | 2.52k | } else { |
1385 | | // Either there's a right curly brace (will be consumed by the caller), |
1386 | | // or there's an error and the trailing whitespace should be |
1387 | | // handled by the caller. However, this is not an error |
1388 | | // here because we're just parsing `literal [s annotation]`. |
1389 | 2.52k | index = firstWhitespace; |
1390 | 2.52k | } |
1391 | 8.36k | } else { |
1392 | | // Either there was never whitespace, or |
1393 | | // the previously consumed whitespace is the optional trailing whitespace; |
1394 | | // either the next character is '}' or the error will be handled by parseExpression. |
1395 | | // Do nothing, since the operand was already set |
1396 | 8.36k | } |
1397 | | |
1398 | | // At the end of this code, the next character should either be '}', |
1399 | | // whitespace followed by a '}', |
1400 | | // or end-of-input |
1401 | 11.6k | } |
1402 | | |
1403 | | /* |
1404 | | Consume an expression, matching the `expression` nonterminal in the grammar |
1405 | | */ |
1406 | | |
1407 | 1.42k | static void exprFallback(Expression::Builder& exprBuilder) { |
1408 | | // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER |
1409 | | // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution |
1410 | 1.42k | exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT)))); |
1411 | 1.42k | } |
1412 | | |
1413 | 0 | static Expression exprFallback(UErrorCode& status) { |
1414 | 0 | Expression result; |
1415 | 0 | if (U_SUCCESS(status)) { |
1416 | 0 | Expression::Builder exprBuilder(status); |
1417 | 0 | if (U_SUCCESS(status)) { |
1418 | | // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER |
1419 | | // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution |
1420 | 0 | exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT)))); |
1421 | 0 | UErrorCode status = U_ZERO_ERROR; |
1422 | 0 | result = exprBuilder.build(status); |
1423 | | // An operand was set, so there can't be an error |
1424 | 0 | U_ASSERT(U_SUCCESS(status)); |
1425 | 0 | } |
1426 | 0 | } |
1427 | 0 | return result; |
1428 | 0 | } |
1429 | | |
1430 | 66.1k | Expression Parser::parseExpression(UErrorCode& status) { |
1431 | 66.1k | if (U_FAILURE(status)) { |
1432 | 0 | return {}; |
1433 | 0 | } |
1434 | | |
1435 | | // Early return if out of input -- no more work is possible |
1436 | 66.1k | U_ASSERT(inBounds()); |
1437 | | |
1438 | | // Parse opening brace |
1439 | 66.1k | parseToken(LEFT_CURLY_BRACE, status); |
1440 | | // Optional whitespace after opening brace |
1441 | 66.1k | parseOptionalWhitespace(); |
1442 | | |
1443 | 66.1k | Expression::Builder exprBuilder(status); |
1444 | | // Restore precondition |
1445 | 66.1k | if (!inBounds()) { |
1446 | 58 | exprFallback(exprBuilder); |
1447 | 66.0k | } else { |
1448 | | // literal '|', variable '$' or annotation |
1449 | 66.0k | switch (peek()) { |
1450 | 191 | case PIPE: { |
1451 | | // Quoted literal |
1452 | 191 | parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); |
1453 | 191 | break; |
1454 | 0 | } |
1455 | 466 | case DOLLAR: { |
1456 | | // Variable |
1457 | 466 | parseLiteralOrVariableWithAnnotation(true, exprBuilder, status); |
1458 | 466 | break; |
1459 | 0 | } |
1460 | 65.4k | default: { |
1461 | 65.4k | if (isAnnotationStart(peek())) { |
1462 | 53.1k | Operator rator = parseAnnotation(status); |
1463 | 53.1k | exprBuilder.setOperator(std::move(rator)); |
1464 | 53.1k | } else if (isUnquotedStart(peek())) { |
1465 | | // Unquoted literal |
1466 | 10.9k | parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); |
1467 | 10.9k | } else { |
1468 | | // Not a literal, variable or annotation -- error out |
1469 | 1.36k | ERROR(status); |
1470 | 1.36k | exprFallback(exprBuilder); |
1471 | 1.36k | break; |
1472 | 1.36k | } |
1473 | 64.0k | break; |
1474 | 65.4k | } |
1475 | 66.0k | } |
1476 | 66.0k | } |
1477 | | |
1478 | | // Parse attributes |
1479 | 66.1k | AttributeAdder<Expression::Builder> attrAdder(exprBuilder); |
1480 | 66.1k | parseAttributes(attrAdder, status); |
1481 | | |
1482 | | // Parse optional space |
1483 | | // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}") |
1484 | 66.1k | parseOptionalWhitespace(); |
1485 | | |
1486 | | // Either an operand or operator (or both) must have been set already, |
1487 | | // so there can't be an error |
1488 | 66.1k | UErrorCode localStatus = U_ZERO_ERROR; |
1489 | 66.1k | Expression result = exprBuilder.build(localStatus); |
1490 | 66.1k | U_ASSERT(U_SUCCESS(localStatus)); |
1491 | | |
1492 | | // Check for end-of-input and missing '}' |
1493 | 66.1k | if (!inBounds()) { |
1494 | 825 | ERROR(status); |
1495 | 65.3k | } else { |
1496 | | // Otherwise, it's safe to check for the '}' |
1497 | 65.3k | parseToken(RIGHT_CURLY_BRACE, status); |
1498 | 65.3k | } |
1499 | 66.1k | return result; |
1500 | 66.1k | } |
1501 | | |
1502 | | /* |
1503 | | Parse a .local declaration, matching the `local-declaration` |
1504 | | production in the grammar |
1505 | | */ |
1506 | 14.2k | void Parser::parseLocalDeclaration(UErrorCode& status) { |
1507 | | // End-of-input here would be an error; even empty |
1508 | | // declarations must be followed by a body |
1509 | 14.2k | CHECK_BOUNDS(status); |
1510 | | |
1511 | 14.2k | parseToken(ID_LOCAL, status); |
1512 | 14.2k | parseRequiredWhitespace(status); |
1513 | | |
1514 | | // Restore precondition |
1515 | 14.2k | CHECK_BOUNDS(status); |
1516 | 14.2k | VariableName lhs = parseVariableName(status); |
1517 | 14.2k | parseTokenWithWhitespace(EQUALS, status); |
1518 | | // Restore precondition before calling parseExpression() |
1519 | 14.2k | CHECK_BOUNDS(status); |
1520 | | |
1521 | 14.2k | Expression rhs = parseExpression(status); |
1522 | | |
1523 | | // Add binding from lhs to rhs, unless there was an error |
1524 | | // (This ensures that if there was a correct lhs but a |
1525 | | // parse error in rhs, the fallback for uses of the |
1526 | | // lhs will be its own name rather than the rhs) |
1527 | | /* This affects the behavior of this test case, which the spec |
1528 | | is ambiguous about: |
1529 | | |
1530 | | .local $bar {|foo|} {{{$bar}}} |
1531 | | |
1532 | | Should `$bar` still be bound to a value although |
1533 | | its declaration is syntactically incorrect (missing the '=')? |
1534 | | This code says no, but it needs to change if |
1535 | | https://github.com/unicode-org/message-format-wg/issues/703 |
1536 | | is resolved differently. |
1537 | | */ |
1538 | 14.2k | CHECK_ERROR(status); |
1539 | 14.2k | if (!errors.hasSyntaxError()) { |
1540 | 0 | dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status); |
1541 | | // Check if status is U_DUPLICATE_DECLARATION_ERROR |
1542 | | // and add that as an internal error if so |
1543 | 0 | if (status == U_MF_DUPLICATE_DECLARATION_ERROR) { |
1544 | 0 | status = U_ZERO_ERROR; |
1545 | 0 | errors.addError(StaticErrorType::DuplicateDeclarationError, status); |
1546 | 0 | } |
1547 | 0 | } |
1548 | 14.2k | } |
1549 | | |
1550 | | /* |
1551 | | Parse an .input declaration, matching the `local-declaration` |
1552 | | production in the grammar |
1553 | | */ |
1554 | 41.5k | void Parser::parseInputDeclaration(UErrorCode& status) { |
1555 | | // End-of-input here would be an error; even empty |
1556 | | // declarations must be followed by a body |
1557 | 41.5k | CHECK_BOUNDS(status); |
1558 | | |
1559 | 41.5k | parseToken(ID_INPUT, status); |
1560 | 41.5k | parseOptionalWhitespace(); |
1561 | | |
1562 | | // Restore precondition before calling parseExpression() |
1563 | 41.5k | CHECK_BOUNDS(status); |
1564 | | |
1565 | | // Save the index for error diagnostics |
1566 | 41.5k | int32_t exprIndex = index; |
1567 | 41.5k | Expression rhs = parseExpression(status); |
1568 | | |
1569 | | // Here we have to check that the rhs is a variable-expression |
1570 | 41.5k | if (!rhs.getOperand().isVariable()) { |
1571 | | // This case is a syntax error; report it at the beginning |
1572 | | // of the expression |
1573 | 41.1k | ERROR_AT(status, exprIndex); |
1574 | 41.1k | return; |
1575 | 41.1k | } |
1576 | | |
1577 | 423 | VariableName lhs = rhs.getOperand().asVariable(); |
1578 | | |
1579 | | // Add binding from lhs to rhs |
1580 | | // This just adds a new local variable that shadows the message |
1581 | | // argument referred to, which is harmless. |
1582 | | // When evaluating the RHS, the new local is not in scope |
1583 | | // and the message argument will be correctly referred to. |
1584 | 423 | CHECK_ERROR(status); |
1585 | 423 | if (!errors.hasSyntaxError()) { |
1586 | 0 | dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status); |
1587 | | // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR |
1588 | | // and add that as an internal error if so |
1589 | 0 | if (status == U_MF_DUPLICATE_DECLARATION_ERROR) { |
1590 | 0 | status = U_ZERO_ERROR; |
1591 | 0 | errors.addError(StaticErrorType::DuplicateDeclarationError, status); |
1592 | 0 | } |
1593 | 0 | } |
1594 | 423 | } |
1595 | | |
1596 | | /* |
1597 | | Consume a possibly-empty sequence of declarations separated by whitespace; |
1598 | | each declaration matches the `declaration` nonterminal in the grammar |
1599 | | |
1600 | | Builds up an environment representing those declarations |
1601 | | */ |
1602 | 1.06k | void Parser::parseDeclarations(UErrorCode& status) { |
1603 | | // End-of-input here would be an error; even empty |
1604 | | // declarations must be followed by a body |
1605 | 1.06k | CHECK_BOUNDS(status); |
1606 | | |
1607 | 56.7k | while (peek() == PERIOD) { |
1608 | 56.5k | CHECK_BOUNDS_1(status); |
1609 | 56.5k | if (peek(1) == ID_LOCAL[1]) { |
1610 | 14.2k | parseLocalDeclaration(status); |
1611 | 42.2k | } else if (peek(1) == ID_INPUT[1]) { |
1612 | 41.5k | parseInputDeclaration(status); |
1613 | 41.5k | } else { |
1614 | | // Done parsing declarations |
1615 | 709 | break; |
1616 | 709 | } |
1617 | | |
1618 | | // Avoid looping infinitely |
1619 | 55.8k | CHECK_ERROR(status); |
1620 | | |
1621 | 55.8k | parseOptionalWhitespace(); |
1622 | | // Restore precondition |
1623 | 55.8k | CHECK_BOUNDS(status); |
1624 | 55.6k | } |
1625 | 1.06k | } |
1626 | | |
1627 | | /* |
1628 | | Consume a text character |
1629 | | matching the `text-char` nonterminal in the grammar |
1630 | | |
1631 | | No postcondition (a message can end with a text-char) |
1632 | | */ |
1633 | 11.5M | UnicodeString Parser::parseTextChar(UErrorCode& status) { |
1634 | 11.5M | UnicodeString str; |
1635 | 11.5M | if (!inBounds() || !(isTextChar(peek()))) { |
1636 | | // Error -- text-char is expected here |
1637 | 235 | ERROR(status); |
1638 | 11.5M | } else { |
1639 | | // See comment in parseQuotedLiteral() |
1640 | 11.5M | if (isEscapableChar(peek())) { |
1641 | 76.2k | normalizedInput += BACKSLASH; |
1642 | 76.2k | } |
1643 | 11.5M | normalizedInput += peek(); |
1644 | 11.5M | str += peek(); |
1645 | 11.5M | next(); |
1646 | 11.5M | maybeAdvanceLine(); |
1647 | 11.5M | } |
1648 | 11.5M | return str; |
1649 | 11.5M | } |
1650 | | |
1651 | | /* |
1652 | | Consume an `nmtoken`, `literal`, or the string "*", matching |
1653 | | the `key` nonterminal in the grammar |
1654 | | */ |
1655 | 4.03M | Key Parser::parseKey(UErrorCode& status) { |
1656 | 4.03M | U_ASSERT(inBounds()); |
1657 | | |
1658 | 4.03M | Key k; // wildcard by default |
1659 | | // Literal | '*' |
1660 | 4.03M | switch (peek()) { |
1661 | 1.07k | case ASTERISK: { |
1662 | 1.07k | next(); |
1663 | 1.07k | normalizedInput += ASTERISK; |
1664 | | // Guarantee postcondition |
1665 | 1.07k | if (!inBounds()) { |
1666 | 9 | ERROR(status); |
1667 | 9 | return k; |
1668 | 9 | } |
1669 | 1.06k | break; |
1670 | 1.07k | } |
1671 | 4.03M | default: { |
1672 | | // Literal |
1673 | 4.03M | k = Key(parseLiteral(status)); |
1674 | 4.03M | break; |
1675 | 1.07k | } |
1676 | 4.03M | } |
1677 | 4.03M | return k; |
1678 | 4.03M | } |
1679 | | |
1680 | | /* |
1681 | | Consume a non-empty sequence of `key`s separated by whitespace |
1682 | | |
1683 | | Takes ownership of `keys` |
1684 | | */ |
1685 | 553 | SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) { |
1686 | 553 | SelectorKeys result; |
1687 | | |
1688 | 553 | if (U_FAILURE(status)) { |
1689 | 0 | return result; |
1690 | 0 | } |
1691 | | |
1692 | 553 | U_ASSERT(inBounds()); |
1693 | | |
1694 | | /* |
1695 | | Arbitrary lookahead is required to parse key lists. To see why, consider |
1696 | | this rule from the grammar: |
1697 | | |
1698 | | variant = key *(s key) [s] quoted-pattern |
1699 | | |
1700 | | And this example: |
1701 | | when k1 k2 {a} |
1702 | | |
1703 | | Derivation: |
1704 | | variant -> key *(s key) [s] quoted-pattern |
1705 | | -> key s key *(s key) quoted-pattern |
1706 | | |
1707 | | After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead |
1708 | | to know whether to expect the start of a pattern or the start of another key. |
1709 | | In other words: is the second whitespace sequence the required space in *(s key), |
1710 | | or the optional space in [s] quoted-pattern? |
1711 | | |
1712 | | This is addressed using "backtracking" (similarly to `parseOptions()`). |
1713 | | */ |
1714 | | |
1715 | 553 | SelectorKeys::Builder keysBuilder(status); |
1716 | 553 | if (U_FAILURE(status)) { |
1717 | 0 | return result; |
1718 | 0 | } |
1719 | | |
1720 | | // Since the first key is required, it's simplest to parse it separately. |
1721 | 553 | keysBuilder.add(parseKey(status), status); |
1722 | | |
1723 | | // Restore precondition |
1724 | 553 | if (!inBounds()) { |
1725 | 47 | ERROR(status); |
1726 | 47 | return result; |
1727 | 47 | } |
1728 | | |
1729 | | // We've seen at least one whitespace-key pair, so now we can parse |
1730 | | // *(s key) [s] |
1731 | 4.03M | while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) { |
1732 | 4.03M | bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek()); |
1733 | 4.03M | parseRequiredWhitespace(status); |
1734 | 4.03M | if (!wasWhitespace) { |
1735 | | // Avoid infinite loop when parsing something like: |
1736 | | // when * @{!... |
1737 | 4.02M | next(); |
1738 | 4.02M | } |
1739 | | |
1740 | | // Restore precondition |
1741 | 4.03M | if (!inBounds()) { |
1742 | 224 | ERROR(status); |
1743 | 224 | return result; |
1744 | 224 | } |
1745 | | |
1746 | | // At this point, it's ambiguous whether we are inside (s key) or [s]. |
1747 | | // This check resolves that ambiguity. |
1748 | 4.03M | if (peek() == LEFT_CURLY_BRACE) { |
1749 | | // A pattern follows, so what we just parsed was the optional |
1750 | | // trailing whitespace. All the keys have been parsed. |
1751 | | |
1752 | | // Unpush the whitespace from `normalizedInput` |
1753 | 107 | normalizedInput.truncate(normalizedInput.length() - 1); |
1754 | 107 | break; |
1755 | 107 | } |
1756 | 4.03M | keysBuilder.add(parseKey(status), status); |
1757 | 4.03M | } |
1758 | | |
1759 | 282 | return keysBuilder.build(status); |
1760 | 506 | } |
1761 | | |
1762 | 296 | Pattern Parser::parseQuotedPattern(UErrorCode& status) { |
1763 | 296 | U_ASSERT(inBounds()); |
1764 | | |
1765 | 296 | parseToken(LEFT_CURLY_BRACE, status); |
1766 | 296 | parseToken(LEFT_CURLY_BRACE, status); |
1767 | 296 | Pattern p = parseSimpleMessage(status); |
1768 | 296 | parseToken(RIGHT_CURLY_BRACE, status); |
1769 | 296 | parseToken(RIGHT_CURLY_BRACE, status); |
1770 | 296 | return p; |
1771 | 296 | } |
1772 | | |
1773 | | /* |
1774 | | Consume a `placeholder`, matching the nonterminal in the grammar |
1775 | | No postcondition (a markup can end a message) |
1776 | | */ |
1777 | 15.2k | Markup Parser::parseMarkup(UErrorCode& status) { |
1778 | 15.2k | U_ASSERT(inBounds(1)); |
1779 | | |
1780 | 15.2k | U_ASSERT(peek() == LEFT_CURLY_BRACE); |
1781 | | |
1782 | 15.2k | Markup::Builder builder(status); |
1783 | 15.2k | if (U_FAILURE(status)) { |
1784 | 0 | return {}; |
1785 | 0 | } |
1786 | | |
1787 | | // Consume the '{' |
1788 | 15.2k | next(); |
1789 | 15.2k | normalizedInput += LEFT_CURLY_BRACE; |
1790 | 15.2k | parseOptionalWhitespace(); |
1791 | 15.2k | bool closing = false; |
1792 | 15.2k | switch (peek()) { |
1793 | 13.2k | case NUMBER_SIGN: { |
1794 | | // Open or standalone; consume the '#' |
1795 | 13.2k | normalizedInput += peek(); |
1796 | 13.2k | next(); |
1797 | 13.2k | break; |
1798 | 0 | } |
1799 | 1.94k | case SLASH: { |
1800 | | // Closing |
1801 | 1.94k | normalizedInput += peek(); |
1802 | 1.94k | closing = true; |
1803 | 1.94k | next(); |
1804 | 1.94k | break; |
1805 | 0 | } |
1806 | 0 | default: { |
1807 | 0 | ERROR(status); |
1808 | 0 | return {}; |
1809 | 0 | } |
1810 | 15.2k | } |
1811 | | |
1812 | | // Parse the markup identifier |
1813 | 15.2k | builder.setName(parseIdentifier(status)); |
1814 | | |
1815 | | // Parse the options, which must begin with a ' ' |
1816 | | // if present |
1817 | 15.2k | if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { |
1818 | 9.95k | OptionAdder<Markup::Builder> optionAdder(builder); |
1819 | 9.95k | parseOptions(optionAdder, status); |
1820 | 9.95k | } |
1821 | | |
1822 | | // Parse the attributes, which also must begin |
1823 | | // with a ' ' |
1824 | 15.2k | if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { |
1825 | 9.39k | AttributeAdder<Markup::Builder> attrAdder(builder); |
1826 | 9.39k | parseAttributes(attrAdder, status); |
1827 | 9.39k | } |
1828 | | |
1829 | 15.2k | parseOptionalWhitespace(); |
1830 | | |
1831 | 15.2k | bool standalone = false; |
1832 | | // Check if this is a standalone or not |
1833 | 15.2k | if (!closing) { |
1834 | 13.2k | if (inBounds() && peek() == SLASH) { |
1835 | 311 | standalone = true; |
1836 | 311 | normalizedInput += SLASH; |
1837 | 311 | next(); |
1838 | 311 | } |
1839 | 13.2k | } |
1840 | | |
1841 | 15.2k | parseToken(RIGHT_CURLY_BRACE, status); |
1842 | | |
1843 | 15.2k | if (standalone) { |
1844 | 311 | builder.setStandalone(); |
1845 | 14.9k | } else if (closing) { |
1846 | 1.94k | builder.setClose(); |
1847 | 12.9k | } else { |
1848 | 12.9k | builder.setOpen(); |
1849 | 12.9k | } |
1850 | | |
1851 | 15.2k | return builder.build(status); |
1852 | 15.2k | } |
1853 | | |
1854 | | /* |
1855 | | Consume a `placeholder`, matching the nonterminal in the grammar |
1856 | | No postcondition (a placeholder can end a message) |
1857 | | */ |
1858 | 25.5k | std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) { |
1859 | 25.5k | U_ASSERT(peek() == LEFT_CURLY_BRACE); |
1860 | | |
1861 | 25.5k | if (!inBounds()) { |
1862 | 0 | ERROR(status); |
1863 | 0 | return exprFallback(status); |
1864 | 0 | } |
1865 | | |
1866 | | // Need to look ahead arbitrarily since whitespace |
1867 | | // can appear before the '{' and '#' |
1868 | | // in markup |
1869 | 25.5k | int32_t tempIndex = 1; |
1870 | 25.5k | bool isMarkup = false; |
1871 | 26.3k | while (inBounds(1)) { |
1872 | 26.3k | UChar32 c = peek(tempIndex); |
1873 | 26.3k | if (c == NUMBER_SIGN || c == SLASH) { |
1874 | 15.2k | isMarkup = true; |
1875 | 15.2k | break; |
1876 | 15.2k | } |
1877 | 11.1k | if (!(isWhitespace(c) || isBidiControl(c))) { |
1878 | 10.3k | break; |
1879 | 10.3k | } |
1880 | 809 | tempIndex++; |
1881 | 809 | } |
1882 | | |
1883 | 25.5k | if (isMarkup) { |
1884 | 15.2k | return parseMarkup(status); |
1885 | 15.2k | } |
1886 | 10.3k | return parseExpression(status); |
1887 | 25.5k | } |
1888 | | |
1889 | | /* |
1890 | | Consume a `simple-message`, matching the nonterminal in the grammar |
1891 | | Postcondition: `index == len()` or U_FAILURE(status); |
1892 | | for a syntactically correct message, this will consume the entire input |
1893 | | */ |
1894 | 6.35k | Pattern Parser::parseSimpleMessage(UErrorCode& status) { |
1895 | 6.35k | Pattern::Builder result(status); |
1896 | | |
1897 | 6.35k | if (U_SUCCESS(status)) { |
1898 | 6.35k | Expression expression; |
1899 | 11.5M | while (inBounds()) { |
1900 | 11.5M | switch (peek()) { |
1901 | 25.5k | case LEFT_CURLY_BRACE: { |
1902 | | // Must be placeholder |
1903 | 25.5k | std::variant<Expression, Markup> piece = parsePlaceholder(status); |
1904 | 25.5k | if (std::holds_alternative<Expression>(piece)) { |
1905 | 10.3k | Expression expr = *std::get_if<Expression>(&piece); |
1906 | 10.3k | result.add(std::move(expr), status); |
1907 | 15.2k | } else { |
1908 | 15.2k | Markup markup = *std::get_if<Markup>(&piece); |
1909 | 15.2k | result.add(std::move(markup), status); |
1910 | 15.2k | } |
1911 | 25.5k | break; |
1912 | 0 | } |
1913 | 1.39k | case BACKSLASH: { |
1914 | | // Must be escaped-char |
1915 | 1.39k | result.add(parseEscapeSequence(status), status); |
1916 | 1.39k | break; |
1917 | 0 | } |
1918 | 10 | case RIGHT_CURLY_BRACE: { |
1919 | | // Distinguish unescaped '}' from end of quoted pattern |
1920 | 10 | break; |
1921 | 0 | } |
1922 | 11.5M | default: { |
1923 | | // Must be text-char |
1924 | 11.5M | result.add(parseTextChar(status), status); |
1925 | 11.5M | break; |
1926 | 0 | } |
1927 | 11.5M | } |
1928 | 11.5M | if (peek() == RIGHT_CURLY_BRACE) { |
1929 | | // End of quoted pattern |
1930 | 157 | break; |
1931 | 157 | } |
1932 | | // Don't loop infinitely |
1933 | 11.5M | if (errors.hasSyntaxError() || U_FAILURE(status)) { |
1934 | 5.64k | break; |
1935 | 5.64k | } |
1936 | 11.5M | } |
1937 | 6.35k | } |
1938 | 6.35k | return result.build(status); |
1939 | 6.35k | } |
1940 | | |
1941 | 555 | void Parser::parseVariant(UErrorCode& status) { |
1942 | 555 | CHECK_ERROR(status); |
1943 | | |
1944 | | // At least one key is required |
1945 | 553 | SelectorKeys keyList(parseNonEmptyKeys(status)); |
1946 | | |
1947 | | // parseNonEmptyKeys() consumes any trailing whitespace, |
1948 | | // so the pattern can be consumed next. |
1949 | | |
1950 | | // Restore precondition before calling parsePattern() |
1951 | | // (which must return a non-null value) |
1952 | 553 | CHECK_BOUNDS(status); |
1953 | 282 | Pattern rhs = parseQuotedPattern(status); |
1954 | | |
1955 | 282 | dataModel.addVariant(std::move(keyList), std::move(rhs), status); |
1956 | 282 | } |
1957 | | |
1958 | | /* |
1959 | | Consume a `selectors` (matching the nonterminal in the grammar), |
1960 | | followed by a non-empty sequence of `variant`s (matching the nonterminal |
1961 | | in the grammar) preceded by whitespace |
1962 | | No postcondition (on return, `index` might equal `len()` with no syntax error |
1963 | | because a message can end with a variant) |
1964 | | */ |
1965 | 730 | void Parser::parseSelectors(UErrorCode& status) { |
1966 | 730 | CHECK_ERROR(status); |
1967 | | |
1968 | 730 | U_ASSERT(inBounds()); |
1969 | | |
1970 | 730 | parseToken(ID_MATCH, status); |
1971 | | |
1972 | 730 | bool empty = true; |
1973 | | // Parse selectors |
1974 | | // "Backtracking" is required here. It's not clear if whitespace is |
1975 | | // (`[s]` selector) or (`[s]` variant) |
1976 | 165k | while (isWhitespace(peek()) || peek() == DOLLAR) { |
1977 | 164k | int32_t whitespaceStart = index; |
1978 | 164k | parseRequiredWhitespace(status); |
1979 | | // Restore precondition |
1980 | 164k | CHECK_BOUNDS(status); |
1981 | 164k | if (peek() != DOLLAR) { |
1982 | | // This is not necessarily an error, but rather, |
1983 | | // means the whitespace we parsed was the optional |
1984 | | // whitespace preceding the first variant, not the |
1985 | | // required whitespace preceding a subsequent variable. |
1986 | | // In that case, "push back" the whitespace. |
1987 | 84 | normalizedInput.truncate(normalizedInput.length() - 1); |
1988 | 84 | index = whitespaceStart; |
1989 | 84 | break; |
1990 | 84 | } |
1991 | 164k | VariableName var = parseVariableName(status); |
1992 | 164k | empty = false; |
1993 | | |
1994 | 164k | dataModel.addSelector(std::move(var), status); |
1995 | 164k | CHECK_ERROR(status); |
1996 | 164k | } |
1997 | | |
1998 | | // At least one selector is required |
1999 | 715 | if (empty) { |
2000 | 200 | ERROR(status); |
2001 | 200 | return; |
2002 | 200 | } |
2003 | | |
2004 | 515 | #define CHECK_END_OF_INPUT \ |
2005 | 515 | if (!inBounds()) { \ |
2006 | 62 | break; \ |
2007 | 62 | } \ |
2008 | 515 | |
2009 | | // Parse variants |
2010 | | // matcher = match-statement s variant *(o variant) |
2011 | | |
2012 | | // Parse first variant |
2013 | 515 | parseRequiredWhitespace(status); |
2014 | 515 | if (!inBounds()) { |
2015 | 53 | ERROR(status); |
2016 | 53 | return; |
2017 | 53 | } |
2018 | 462 | parseVariant(status); |
2019 | 462 | if (!inBounds()) { |
2020 | | // Not an error; there might be only one variant |
2021 | 291 | return; |
2022 | 291 | } |
2023 | | |
2024 | 171 | while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) { |
2025 | 95 | parseOptionalWhitespace(); |
2026 | | // Restore the precondition. |
2027 | | // Trailing whitespace is allowed. |
2028 | 95 | if (!inBounds()) { |
2029 | 2 | return; |
2030 | 2 | } |
2031 | | |
2032 | 93 | parseVariant(status); |
2033 | | |
2034 | | // Restore the precondition, *without* erroring out if we've |
2035 | | // reached the end of input. That's because it's valid for the |
2036 | | // message to end with a variant that has no trailing whitespace. |
2037 | | // Why do we need to check this condition twice inside the loop? |
2038 | | // Because if we don't check it here, the `isWhitespace()` call in |
2039 | | // the loop head will read off the end of the input string. |
2040 | 93 | CHECK_END_OF_INPUT |
2041 | | |
2042 | 31 | if (errors.hasSyntaxError() || U_FAILURE(status)) { |
2043 | 31 | break; |
2044 | 31 | } |
2045 | 31 | } |
2046 | 171 | } |
2047 | | |
/*
  Consume a `body` (matching the nonterminal in the grammar).
  No postcondition (on return, `index` might equal `len()` with no syntax error,
  because a message can end with a body; trailing whitespace is optional).
*/
2053 | | |
2054 | 318 | void Parser::errorPattern(UErrorCode& status) { |
2055 | 318 | errors.addSyntaxError(status); |
2056 | | // Set to empty pattern |
2057 | 318 | Pattern::Builder result = Pattern::Builder(status); |
2058 | 318 | CHECK_ERROR(status); |
2059 | | |
2060 | | // If still in bounds, then add the remaining input as a single text part |
2061 | | // to the pattern |
2062 | | /* |
2063 | | TODO: this behavior isn't documented in the spec, but it comes from |
2064 | | https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236 |
2065 | | and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify |
2066 | | whether this is the intent behind the spec |
2067 | | */ |
2068 | 318 | UnicodeString partStr(LEFT_CURLY_BRACE); |
2069 | 4.58M | while (inBounds()) { |
2070 | 4.58M | partStr += peek(); |
2071 | 4.58M | next(); |
2072 | 4.58M | } |
2073 | | // Add curly braces around the entire output (same comment as above) |
2074 | 318 | partStr += RIGHT_CURLY_BRACE; |
2075 | 318 | result.add(std::move(partStr), status); |
2076 | 318 | dataModel.setPattern(result.build(status)); |
2077 | 318 | } |
2078 | | |
2079 | 1.06k | void Parser::parseBody(UErrorCode& status) { |
2080 | 1.06k | CHECK_ERROR(status); |
2081 | | |
2082 | | // Out-of-input is a syntax warning |
2083 | 1.06k | if (!inBounds()) { |
2084 | 173 | errorPattern(status); |
2085 | 173 | return; |
2086 | 173 | } |
2087 | | |
2088 | | // Body must be either a pattern or selectors |
2089 | 889 | switch (peek()) { |
2090 | 14 | case LEFT_CURLY_BRACE: { |
2091 | | // Pattern |
2092 | 14 | dataModel.setPattern(parseQuotedPattern(status)); |
2093 | 14 | break; |
2094 | 0 | } |
2095 | 730 | case ID_MATCH[0]: { |
2096 | | // Selectors |
2097 | 730 | parseSelectors(status); |
2098 | 730 | return; |
2099 | 0 | } |
2100 | 145 | default: { |
2101 | 145 | ERROR(status); |
2102 | 145 | errorPattern(status); |
2103 | 145 | return; |
2104 | 0 | } |
2105 | 889 | } |
2106 | 889 | } |
2107 | | |
2108 | | // ------------------------------------- |
2109 | | // Parses the source pattern. |
2110 | | |
2111 | 7.12k | void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { |
2112 | 7.12k | CHECK_ERROR(status); |
2113 | | |
2114 | 7.12k | bool complex = false; |
2115 | | // First, "look ahead" to determine if this is a simple or complex |
2116 | | // message. To do that, check the first non-whitespace character. |
2117 | 8.64k | while (inBounds(index) && (isWhitespace(peek()) || isBidiControl(peek()))) { |
2118 | 1.51k | next(); |
2119 | 1.51k | } |
2120 | | |
2121 | | // Message can be empty, so we need to only look ahead |
2122 | | // if we know it's non-empty |
2123 | 7.12k | if (inBounds()) { |
2124 | 7.11k | if (peek() == PERIOD |
2125 | 6.06k | || (inBounds(1) |
2126 | 5.93k | && peek() == LEFT_CURLY_BRACE |
2127 | 4.85k | && peek(1) == LEFT_CURLY_BRACE)) { |
2128 | 1.06k | complex = true; |
2129 | 1.06k | } |
2130 | 7.11k | } |
2131 | | // Reset index |
2132 | 7.12k | index = 0; |
2133 | | |
2134 | | // Message can be empty, so we need to only look ahead |
2135 | | // if we know it's non-empty |
2136 | 7.12k | if (complex) { |
2137 | 1.06k | parseOptionalWhitespace(); |
2138 | 1.06k | parseDeclarations(status); |
2139 | 1.06k | parseBody(status); |
2140 | 1.06k | parseOptionalWhitespace(); |
2141 | 6.06k | } else { |
2142 | | // Simple message |
2143 | | // For normalization, quote the pattern |
2144 | 6.06k | normalizedInput += LEFT_CURLY_BRACE; |
2145 | 6.06k | normalizedInput += LEFT_CURLY_BRACE; |
2146 | 6.06k | dataModel.setPattern(parseSimpleMessage(status)); |
2147 | 6.06k | normalizedInput += RIGHT_CURLY_BRACE; |
2148 | 6.06k | normalizedInput += RIGHT_CURLY_BRACE; |
2149 | 6.06k | } |
2150 | | |
2151 | 7.12k | CHECK_ERROR(status); |
2152 | | |
2153 | | // There are no errors; finally, check that the entire input was consumed |
2154 | 6.80k | if (!allConsumed()) { |
2155 | 4.71k | ERROR(status); |
2156 | 4.71k | } |
2157 | | |
2158 | | // Finally, copy the relevant fields of the internal `MessageParseError` |
2159 | | // into the `UParseError` argument |
2160 | 6.80k | translateParseError(parseError, parseErrorResult); |
2161 | 6.80k | } |
2162 | | |
2163 | 7.12k | Parser::~Parser() {} |
2164 | | |
2165 | | } // namespace message2 |
2166 | | U_NAMESPACE_END |
2167 | | |
2168 | | #endif /* #if !UCONFIG_NO_MF2 */ |
2169 | | |
2170 | | #endif /* #if !UCONFIG_NO_FORMATTING */ |
2171 | | |
2172 | | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |