1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 #if !UCONFIG_NO_MF2
9
10 #include "messageformat2_errors.h"
11 #include "messageformat2_macros.h"
12 #include "messageformat2_parser.h"
13 #include "uvector.h" // U_ASSERT
14
15 U_NAMESPACE_BEGIN
16
17 namespace message2 {
18
19 using namespace pluralimpl;
20
21 using namespace data_model;
22
/*
    The `ERROR()` macro records a syntax error: unless `errors` already holds
    a syntax error, it sets the offset in `parseError` from `index` and then
    calls `errors.addSyntaxError(errorCode)`. It does not alter control flow.

    The body is wrapped in `do { ... } while (false)` so the macro expands to
    exactly one statement and is safe inside an unbraced `if`/`else`.
    Invoke it with a trailing semicolon.
*/
#define ERROR(errorCode)                          \
    do {                                          \
        if (!errors.hasSyntaxError()) {           \
            setParseError(parseError, index);     \
            errors.addSyntaxError(errorCode);     \
        }                                         \
    } while (false)
32
// Like `ERROR()`, but the offset in `parseError` is computed from the
// explicit position `i` rather than from the current `index`.
// Wrapped in `do { ... } while (false)` so it behaves as a single statement;
// invoke it with a trailing semicolon.
#define ERROR_AT(errorCode, i)                    \
    do {                                          \
        if (!errors.hasSyntaxError()) {           \
            setParseError(parseError, i);         \
            errors.addSyntaxError(errorCode);     \
        }                                         \
    } while (false)
38
39 // Increments the line number and updates the "characters seen before
40 // current line" count in `parseError`, iff `peek()` is a newline
maybeAdvanceLine()41 void Parser::maybeAdvanceLine() {
42 if (peek() == LF) {
43 parseError.line++;
44 // add 1 to index to get the number of characters seen so far
45 // (including the newline)
46 parseError.lengthBeforeCurrentLine = index + 1;
47 }
48 }
49
/*
    Signals a syntax error (via `ERROR()`) and returns from the enclosing
    function if `inBounds()` is false, i.e. there is no input left at `index`.
    Because it expands to a bare `return;`, it can only be used in functions
    returning `void`.

    Wrapped in `do { ... } while (false)` so it behaves as a single statement;
    invoke it with a trailing semicolon.
*/
#define CHECK_BOUNDS(errorCode)                   \
    do {                                          \
        if (!inBounds()) {                        \
            ERROR(errorCode);                     \
            return;                               \
        }                                         \
    } while (false)
// Variant of `CHECK_BOUNDS()` that tests `inBounds(1)` and, on failure,
// reports the error at position `index + 1` before returning from the
// enclosing `void` function.
// Wrapped in `do { ... } while (false)` so it behaves as a single statement;
// invoke it with a trailing semicolon.
#define CHECK_BOUNDS_1(errorCode)                 \
    do {                                          \
        if (!inBounds(1)) {                       \
            ERROR_AT(errorCode, index + 1);       \
            return;                               \
        }                                         \
    } while (false)
64
65 // -------------------------------------
66 // Helper functions
67
copyContext(const UChar in[U_PARSE_CONTEXT_LEN],UChar out[U_PARSE_CONTEXT_LEN])68 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
69 for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) {
70 out[i] = in[i];
71 if (in[i] == '\0') {
72 break;
73 }
74 }
75 }
76
translateParseError(const MessageParseError & messageParseError,UParseError & parseError)77 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
78 parseError.line = messageParseError.line;
79 parseError.offset = messageParseError.offset;
80 copyContext(messageParseError.preContext, parseError.preContext);
81 copyContext(messageParseError.postContext, parseError.postContext);
82 }
83
setParseError(MessageParseError & parseError,uint32_t index)84 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
85 // Translate absolute to relative offset
86 parseError.offset = index // Start with total number of characters seen
87 - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line
88 // TODO: Fill this in with actual pre and post-context
89 parseError.preContext[0] = 0;
90 parseError.postContext[0] = 0;
91 }
92
93 // -------------------------------------
94 // Predicates
95
96 // Returns true if `c` is in the interval [`first`, `last`]
inRange(UChar32 c,UChar32 first,UChar32 last)97 static bool inRange(UChar32 c, UChar32 first, UChar32 last) {
98 U_ASSERT(first < last);
99 return c >= first && c <= last;
100 }
101
102 /*
103 The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
104
105 `isContentChar()` : `content-char`
106 `isTextChar()` : `text-char`
107 `isAlpha()` : `ALPHA`
108 `isDigit()` : `DIGIT`
109 `isNameStart()` : `name-start`
110 `isNameChar()` : `name-char`
111 `isUnquotedStart()` : `unquoted-start`
112 `isQuotedChar()` : `quoted-char`
113 `isWhitespace()` : `s`
114 */
115
isContentChar(UChar32 c)116 static bool isContentChar(UChar32 c) {
117 return inRange(c, 0x0001, 0x0008) // Omit NULL, HTAB and LF
118 || inRange(c, 0x000B, 0x000C) // Omit CR
119 || inRange(c, 0x000E, 0x001F) // Omit SP
120 || inRange(c, 0x0021, 0x002D) // Omit '.'
121 || inRange(c, 0x002F, 0x003F) // Omit '@'
122 || inRange(c, 0x0041, 0x005B) // Omit '\'
123 || inRange(c, 0x005D, 0x007A) // Omit { | }
124 || inRange(c, 0x007E, 0xD7FF) // Omit surrogates
125 || inRange(c, 0xE000, 0x10FFFF);
126 }
127
128 // See `s` in the MessageFormat 2 grammar
isWhitespace(UChar32 c)129 inline bool isWhitespace(UChar32 c) {
130 switch (c) {
131 case SPACE:
132 case HTAB:
133 case CR:
134 case LF:
135 case IDEOGRAPHIC_SPACE:
136 return true;
137 default:
138 return false;
139 }
140 }
141
isTextChar(UChar32 c)142 static bool isTextChar(UChar32 c) {
143 return isContentChar(c)
144 || isWhitespace(c)
145 || c == PERIOD
146 || c == AT
147 || c == PIPE;
148 }
149
isAlpha(UChar32 c)150 static bool isAlpha(UChar32 c) { return inRange(c, 0x0041, 0x005A) || inRange(c, 0x0061, 0x007A); }
151
isDigit(UChar32 c)152 static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }
153
isNameStart(UChar32 c)154 static bool isNameStart(UChar32 c) {
155 return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
156 inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
157 inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
158 inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
159 inRange(c, 0x10000, 0xEFFFF);
160 }
161
isNameChar(UChar32 c)162 static bool isNameChar(UChar32 c) {
163 return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
164 inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
165 }
166
isUnquotedStart(UChar32 c)167 static bool isUnquotedStart(UChar32 c) {
168 return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
169 inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
170 }
171
isQuotedChar(UChar32 c)172 static bool isQuotedChar(UChar32 c) {
173 return isContentChar(c)
174 || isWhitespace(c)
175 || c == PERIOD
176 || c == AT
177 || c == LEFT_CURLY_BRACE
178 || c == RIGHT_CURLY_BRACE;
179 }
180
isEscapableChar(UChar32 c)181 static bool isEscapableChar(UChar32 c) {
182 return c == PIPE
183 || c == BACKSLASH
184 || c == LEFT_CURLY_BRACE
185 || c == RIGHT_CURLY_BRACE;
186 }
187
188 // Returns true iff `c` can begin a `function` nonterminal
isFunctionStart(UChar32 c)189 static bool isFunctionStart(UChar32 c) {
190 switch (c) {
191 case COLON: {
192 return true;
193 }
194 default: {
195 return false;
196 }
197 }
198 }
199
200 // Returns true iff `c` can begin an `annotation` nonterminal
isAnnotationStart(UChar32 c)201 static bool isAnnotationStart(UChar32 c) {
202 return isFunctionStart(c);
203 }
204
205 // Returns true iff `c` can begin a `literal` nonterminal
isLiteralStart(UChar32 c)206 static bool isLiteralStart(UChar32 c) {
207 return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c));
208 }
209
210 // Returns true iff `c` can begin a `key` nonterminal
isKeyStart(UChar32 c)211 static bool isKeyStart(UChar32 c) {
212 return (c == ASTERISK || isLiteralStart(c));
213 }
214
isDeclarationStart()215 bool Parser::isDeclarationStart() {
216 return (peek() == ID_LOCAL[0]
217 && inBounds(1)
218 && peek(1) == ID_LOCAL[1])
219 || (peek() == ID_INPUT[0]
220 && inBounds(1)
221 && peek(1) == ID_INPUT[1]);
222 }
223
224 // -------------------------------------
225 // Parsing functions
226
227
228 /*
229 TODO: Since handling the whitespace ambiguities needs to be repeated
230 in several different places and is hard to factor out,
231 it probably would be better to replace the parser with a lexer + parser
232 to separate tokenizing from parsing, which would simplify the code significantly.
233 This has the disadvantage that there is no token grammar for MessageFormat,
234 so one would have to be invented that isn't a component of the spec.
235 */
236
237 /*
238 This is a recursive-descent scannerless parser that,
239 with a few exceptions, uses 1 character of lookahead.
240
241 This may not be an exhaustive list, as the additions of attributes and reserved
242 statements introduced several new ambiguities.
243
244 All but three of the exceptions involve ambiguities about the meaning of whitespace.
245 One ambiguity not involving whitespace is:
246 identifier -> namespace ":" name
247 vs.
248 identifier -> name
249
250 `namespace` and `name` can't be distinguished without arbitrary lookahead.
251 (For how this is handled, see parseIdentifier())
252
253 The second ambiguity not involving whitespace is:
254 complex-message -> *(declaration[s]) complex-body
255 -> declaration *(declaration[s]) complex-body
256 -> declaration complex-body
257 -> reserved-statement complex-body
258 -> .foo {$x} .match // ...
259 When processing the '.', arbitrary lookahead is required to distinguish the
260 arbitrary-length unsupported keyword from `.match`.
261 (For how this is handled, see parseDeclarations()).
262
263 The third ambiguity not involving whitespace is:
264 complex-message -> *(declaration [s]) complex-body
265 -> reserved-statement *(declaration [s]) complex-body
266 -> reserved-statement complex-body
267 -> reserved-statement quotedPattern
268 -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
269 -> reserved-keyword expression quoted-pattern
270 Example: .foo {1} {{1}}
271
272 Without lookahead, the opening '{' of the quoted pattern can't be distinguished
273 from the opening '{' of another expression in the unsupported statement.
274 (Though this only requires 1 character of lookahead.)
275
276 Otherwise:
277
278 There are at least seven ambiguities in the grammar that can't be resolved with finite
279 lookahead (since whitespace sequences can be arbitrarily long). They are resolved
280 with a form of backtracking (early exit). No state needs to be saved/restored
281 since whitespace doesn't affect the shape of the resulting parse tree, so it's
282 not true backtracking.
283
284 In addition, the grammar has been refactored
285 in a semantics-preserving way in some cases to make the code easier to structure.
286
287 First: variant = when 1*(s key) [s] pattern
288 Example: when k {a}
289 When reading the first space after 'k', it's ambiguous whether it's the
290 required space before another key, or the optional space before `pattern`.
291 (See comments in parseNonEmptyKeys())
292
293 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
294 annotation = (function *(s option)) / reserved
295 Example: {:f }
296 When reading the first space after 'f', it's ambiguous whether it's the
297 required space before an option, or the optional trailing space after an options list
298 (in this case, the options list is empty).
299 (See comments in parseOptions() -- handling this case also meant it was easier to base
300 the code on a slightly refactored grammar, which should be semantically equivalent.)
301
302 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
303 annotation = (function *(s option)) / reserved
304 Example: {@a }
305 Similar to the previous case; see comments in parseReserved()
306
307 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
308 Example: {|foo| }
309 When reading the first space after the '|', it's ambiguous whether it's the required
310 space before an annotation, or the optional trailing space before the '}'.
311 (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
312 the same grammar refactoring as the second exception.)
313
314 Most functions match a non-terminal in the grammar, except as explained
315 in comments.
316
317 Fifth: matcher = match-statement 1*([s] variant)
318 -> match 1 *([s] selector) 1*([s] variant)
319 Example: match {42} * {{_}}
320 When reading the space after the first '}', it's unclear whether
321 it's the optional space before another selector, or the optional space
322 before a variant.
323
324 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
325 -> "{" [s] function *(s attribute) [s] "}"
326 -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
327 -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
328
329 Example: {:func @foo}
330 (Note: the same ambiguity is present with variable-expression and literal-expression)
331
332 Seventh:
333
334
335 When parsing the space, it's unclear whether it's the optional space before an
336 option, or the optional space before an attribute.
337
338 Unless otherwise noted in a comment, all helper functions that take
339 a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
340 have the precondition:
341 `index` < `len()`
342 and the postcondition:
`U_FAILURE(errorCode)` || `index` < `len()`
344 */
345
346 /*
347 No pre, no post.
348 A message may end with whitespace, so `index` may equal `len()` on exit.
349 */
parseWhitespaceMaybeRequired(bool required,UErrorCode & errorCode)350 void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
351 bool sawWhitespace = false;
352
353 // The loop exits either when we consume all the input,
354 // or when we see a non-whitespace character.
355 while (true) {
356 // Check if all input has been consumed
357 if (!inBounds()) {
358 // If whitespace isn't required -- or if we saw it already --
359 // then the caller is responsible for checking this case and
360 // setting an error if necessary.
361 if (!required || sawWhitespace) {
362 // Not an error.
363 return;
364 }
365 // Otherwise, whitespace is required; the end of the input has
366 // been reached without whitespace. This is an error.
367 ERROR(errorCode);
368 return;
369 }
370
371 // Input remains; process the next character if it's whitespace,
372 // exit the loop otherwise
373 if (isWhitespace(peek())) {
374 sawWhitespace = true;
375 // Increment line number in parse error if we consume a newline
376 maybeAdvanceLine();
377 next();
378 } else {
379 break;
380 }
381 }
382
383 if (!sawWhitespace && required) {
384 ERROR(errorCode);
385 }
386 }
387
388 /*
389 No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
390 */
parseRequiredWhitespace(UErrorCode & errorCode)391 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
392 parseWhitespaceMaybeRequired(true, errorCode);
393 normalizedInput += SPACE;
394 }
395
396 /*
397 No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
398 */
parseOptionalWhitespace(UErrorCode & errorCode)399 void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
400 parseWhitespaceMaybeRequired(false, errorCode);
401 }
402
403 // Consumes a single character, signaling an error if `peek()` != `c`
404 // No postcondition -- a message can end with a '}' token
parseToken(UChar32 c,UErrorCode & errorCode)405 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
406 CHECK_BOUNDS(errorCode);
407
408 if (peek() == c) {
409 next();
410 normalizedInput += c;
411 return;
412 }
413 // Next character didn't match -- error out
414 ERROR(errorCode);
415 }
416
417 /*
418 Consumes a fixed-length token, signaling an error if the token isn't a prefix of
419 the string beginning at `peek()`
420 No postcondition -- a message can end with a '}' token
421 */
parseToken(const std::u16string_view & token,UErrorCode & errorCode)422 void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) {
423 U_ASSERT(inBounds());
424
425 int32_t tokenPos = 0;
426 while (tokenPos < static_cast<int32_t>(token.length())) {
427 if (peek() != token[tokenPos]) {
428 ERROR(errorCode);
429 return;
430 }
431 normalizedInput += token[tokenPos];
432 next();
433 tokenPos++;
434 }
435 }
436
437 /*
438 Consumes optional whitespace, possibly advancing `index` to `index'`,
439 then consumes a fixed-length token (signaling an error if the token isn't a prefix of
440 the string beginning at `source[index']`),
441 then consumes optional whitespace again
442 */
parseTokenWithWhitespace(const std::u16string_view & token,UErrorCode & errorCode)443 void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
444 // No need for error check or bounds check before parseOptionalWhitespace
445 parseOptionalWhitespace(errorCode);
446 // Establish precondition
447 CHECK_BOUNDS(errorCode);
448 parseToken(token, errorCode);
449 parseOptionalWhitespace(errorCode);
450 // Guarantee postcondition
451 CHECK_BOUNDS(errorCode);
452 }
453
454 /*
455 Consumes optional whitespace, possibly advancing `index` to `index'`,
456 then consumes a single character (signaling an error if it doesn't match
457 `source[index']`),
458 then consumes optional whitespace again
459 */
parseTokenWithWhitespace(UChar32 c,UErrorCode & errorCode)460 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
461 // No need for error check or bounds check before parseOptionalWhitespace(errorCode)
462 parseOptionalWhitespace(errorCode);
463 // Establish precondition
464 CHECK_BOUNDS(errorCode);
465 parseToken(c, errorCode);
466 parseOptionalWhitespace(errorCode);
467 // Guarantee postcondition
468 CHECK_BOUNDS(errorCode);
469 }
470
471 /*
472 Consumes a non-empty sequence of `name-char`s, the first of which is
473 also a `name-start`.
474 that begins with a character `start` such that `isNameStart(start)`.
475
476 Returns this sequence.
477
478 (Matches the `name` nonterminal in the grammar.)
479 */
parseName(UErrorCode & errorCode)480 UnicodeString Parser::parseName(UErrorCode& errorCode) {
481 UnicodeString name;
482
483 U_ASSERT(inBounds());
484
485 if (!isNameStart(peek())) {
486 ERROR(errorCode);
487 return name;
488 }
489
490 while (isNameChar(peek())) {
491 UChar32 c = peek();
492 name += c;
493 normalizedInput += c;
494 next();
495 if (!inBounds()) {
496 ERROR(errorCode);
497 break;
498 }
499 }
500 return name;
501 }
502
503 /*
504 Consumes a '$' followed by a `name`, returning a VariableName
505 with `name` as its name
506
507 (Matches the `variable` nonterminal in the grammar.)
508 */
parseVariableName(UErrorCode & errorCode)509 VariableName Parser::parseVariableName(UErrorCode& errorCode) {
510 VariableName result;
511
512 U_ASSERT(inBounds());
513 // If the '$' is missing, we don't want a binding
514 // for this variable to be created.
515 bool valid = peek() == DOLLAR;
516 parseToken(DOLLAR, errorCode);
517 if (!inBounds()) {
518 ERROR(errorCode);
519 return result;
520 }
521 UnicodeString varName = parseName(errorCode);
522 // Set the name to "" if the variable wasn't
523 // declared correctly
524 if (!valid) {
525 varName.remove();
526 }
527 return VariableName(varName);
528 }
529
530 /*
531 Corresponds to the `identifier` nonterminal in the grammar
532 */
parseIdentifier(UErrorCode & errorCode)533 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
534 U_ASSERT(inBounds());
535
536 UnicodeString result;
537 // The following is a hack to get around ambiguity in the grammar:
538 // identifier -> namespace ":" name
539 // vs.
540 // identifier -> name
541 // can't be distinguished without arbitrary lookahead.
542 // Instead, we treat the production as:
543 // identifier -> namespace *(":"name)
544 // and then check for multiple colons.
545
546 // Parse namespace
547 result += parseName(errorCode);
548 int32_t firstColon = -1;
549 while (inBounds() && peek() == COLON) {
550 // Parse ':' separator
551 if (firstColon == -1) {
552 firstColon = index;
553 }
554 parseToken(COLON, errorCode);
555 result += COLON;
556 // Check for message ending with something like "foo:"
557 if (!inBounds()) {
558 ERROR(errorCode);
559 } else {
560 // Parse name part
561 result += parseName(errorCode);
562 }
563 }
564
565 // If there's at least one ':', scan from the first ':'
566 // to the end of the name to check for multiple ':'s
567 if (firstColon != -1) {
568 for (int32_t i = firstColon + 1; i < result.length(); i++) {
569 if (result[i] == COLON) {
570 ERROR_AT(errorCode, i);
571 return {};
572 }
573 }
574 }
575
576 return result;
577 }
578
579 /*
580 Consumes a reference to a function, matching the ": identifier"
581 in the `function` nonterminal in the grammar.
582
583 Returns the function name.
584 */
parseFunction(UErrorCode & errorCode)585 FunctionName Parser::parseFunction(UErrorCode& errorCode) {
586 U_ASSERT(inBounds());
587 if (!isFunctionStart(peek())) {
588 ERROR(errorCode);
589 return FunctionName();
590 }
591
592 normalizedInput += peek();
593 next(); // Consume the function start character
594 if (!inBounds()) {
595 ERROR(errorCode);
596 return FunctionName();
597 }
598 return parseIdentifier(errorCode);
599 }
600
601
602 /*
603 Precondition: peek() == BACKSLASH
604
605 Consume an escaped character.
606 Corresponds to `escaped-char` in the grammar.
607
608 No postcondition (a message can end with an escaped char)
609 */
parseEscapeSequence(UErrorCode & errorCode)610 UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) {
611 U_ASSERT(inBounds());
612 U_ASSERT(peek() == BACKSLASH);
613 normalizedInput += BACKSLASH;
614 next(); // Skip the initial backslash
615 UnicodeString str;
616 if (inBounds()) {
617 // Expect a '{', '|' or '}'
618 switch (peek()) {
619 case LEFT_CURLY_BRACE:
620 case RIGHT_CURLY_BRACE:
621 case PIPE:
622 case BACKSLASH: {
623 /* Append to the output string */
624 str += peek();
625 /* Update normalizedInput */
626 normalizedInput += peek();
627 /* Consume the character */
628 next();
629 return str;
630 }
631 default: {
632 // No other characters are allowed here
633 break;
634 }
635 }
636 }
637 // If control reaches here, there was an error
638 ERROR(errorCode);
639 return str;
640 }
641
642
643 /*
644 Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
645 */
parseQuotedLiteral(UErrorCode & errorCode)646 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
647 bool error = false;
648
649 UnicodeString contents;
650 if (U_SUCCESS(errorCode)) {
651 // Parse the opening '|'
652 parseToken(PIPE, errorCode);
653 if (!inBounds()) {
654 ERROR(errorCode);
655 error = true;
656 } else {
657 // Parse the contents
658 bool done = false;
659 while (!done) {
660 if (peek() == BACKSLASH) {
661 contents += parseEscapeSequence(errorCode);
662 } else if (isQuotedChar(peek())) {
663 contents += peek();
664 // Handle cases like:
665 // |}{| -- we want to escape everywhere that
666 // can be escaped, to make round-trip checking
667 // easier -- so this case normalizes to
668 // |\}\{|
669 if (isEscapableChar(peek())) {
670 normalizedInput += BACKSLASH;
671 }
672 normalizedInput += peek();
673 next(); // Consume this character
674 maybeAdvanceLine();
675 } else {
676 // Assume the sequence of literal characters ends here
677 done = true;
678 }
679 if (!inBounds()) {
680 ERROR(errorCode);
681 error = true;
682 break;
683 }
684 }
685 }
686 }
687
688 if (error) {
689 return {};
690 }
691
692 // Parse the closing '|'
693 parseToken(PIPE, errorCode);
694
695 return Literal(true, contents);
696 }
697
698 // Parse (1*DIGIT)
parseDigits(UErrorCode & errorCode)699 UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
700 if (U_FAILURE(errorCode)) {
701 return {};
702 }
703
704 U_ASSERT(isDigit(peek()));
705
706 UnicodeString contents;
707 do {
708 contents += peek();
709 normalizedInput += peek();
710 next();
711 if (!inBounds()) {
712 ERROR(errorCode);
713 return {};
714 }
715 } while (isDigit(peek()));
716
717 return contents;
718 }
719 /*
720 Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
721 */
parseUnquotedLiteral(UErrorCode & errorCode)722 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
723 if (U_FAILURE(errorCode)) {
724 return {};
725 }
726
727 // unquoted -> name
728 if (isNameStart(peek())) {
729 return Literal(false, parseName(errorCode));
730 }
731
732 // unquoted -> number
733 // Parse the contents
734 UnicodeString contents;
735
736 // Parse the sign
737 if (peek() == HYPHEN) {
738 contents += peek();
739 normalizedInput += peek();
740 next();
741 }
742 if (!inBounds()) {
743 ERROR(errorCode);
744 return {};
745 }
746
747 // Parse the integer part
748 if (peek() == ((UChar32)0x0030) /* 0 */) {
749 contents += peek();
750 normalizedInput += peek();
751 next();
752 } else if (isDigit(peek())) {
753 contents += parseDigits(errorCode);
754 } else {
755 // Error -- nothing else can start a number literal
756 ERROR(errorCode);
757 return {};
758 }
759
760 // Parse the decimal point if present
761 if (peek() == PERIOD) {
762 contents += peek();
763 normalizedInput += peek();
764 next();
765 if (!inBounds()) {
766 ERROR(errorCode);
767 return {};
768 }
769 // Parse the fraction part
770 if (isDigit(peek())) {
771 contents += parseDigits(errorCode);
772 } else {
773 // '.' not followed by digit is a parse error
774 ERROR(errorCode);
775 return {};
776 }
777 }
778
779 if (!inBounds()) {
780 ERROR(errorCode);
781 return {};
782 }
783
784 // Parse the exponent part if present
785 if (peek() == UPPERCASE_E || peek() == LOWERCASE_E) {
786 contents += peek();
787 normalizedInput += peek();
788 next();
789 if (!inBounds()) {
790 ERROR(errorCode);
791 return {};
792 }
793 // Parse sign if present
794 if (peek() == PLUS || peek() == HYPHEN) {
795 contents += peek();
796 normalizedInput += peek();
797 next();
798 if (!inBounds()) {
799 ERROR(errorCode);
800 return {};
801 }
802 }
803 // Parse exponent digits
804 if (!isDigit(peek())) {
805 ERROR(errorCode);
806 return {};
807 }
808 contents += parseDigits(errorCode);
809 }
810
811 return Literal(false, contents);
812 }
813
814 /*
815 Consume and return a literal, matching the `literal` nonterminal in the grammar.
816 */
parseLiteral(UErrorCode & errorCode)817 Literal Parser::parseLiteral(UErrorCode& errorCode) {
818 Literal result;
819 if (!inBounds()) {
820 ERROR(errorCode);
821 } else {
822 if (peek() == PIPE) {
823 result = parseQuotedLiteral(errorCode);
824 } else {
825 result = parseUnquotedLiteral(errorCode);
826 }
827 // Guarantee postcondition
828 if (!inBounds()) {
829 ERROR(errorCode);
830 }
831 }
832
833 return result;
834 }
835
836 /*
837 Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
838
839 Adds the option to `options`
840 */
841 template<class T>
parseAttribute(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)842 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
843 U_ASSERT(inBounds());
844
845 U_ASSERT(peek() == AT);
846 // Consume the '@'
847 parseToken(AT, errorCode);
848
849 // Parse LHS
850 UnicodeString lhs = parseIdentifier(errorCode);
851
852 // Prepare to "backtrack" to resolve ambiguity
853 // about whether whitespace precedes another
854 // attribute, or the '=' sign
855 int32_t savedIndex = index;
856 parseOptionalWhitespace(errorCode);
857
858 Operand rand;
859 if (peek() == EQUALS) {
860 // Parse '='
861 parseTokenWithWhitespace(EQUALS, errorCode);
862
863 UnicodeString rhsStr;
864 // Parse RHS, which is either a literal or variable
865 switch (peek()) {
866 case DOLLAR: {
867 rand = Operand(parseVariableName(errorCode));
868 break;
869 }
870 default: {
871 // Must be a literal
872 rand = Operand(parseLiteral(errorCode));
873 break;
874 }
875 }
876 U_ASSERT(!rand.isNull());
877 } else {
878 // attribute -> "@" identifier [[s] "=" [s]]
879 // Use null operand, which `rand` is already set to
880 // "Backtrack" by restoring the whitespace (if there was any)
881 index = savedIndex;
882 }
883
884 attrAdder.addAttribute(lhs, std::move(rand), errorCode);
885 }
886
887 /*
888 Consume a name-value pair, matching the `option` nonterminal in the grammar.
889
890 Adds the option to `optionList`
891 */
892 template<class T>
parseOption(OptionAdder<T> & addOption,UErrorCode & errorCode)893 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
894 U_ASSERT(inBounds());
895
896 // Parse LHS
897 UnicodeString lhs = parseIdentifier(errorCode);
898
899 // Parse '='
900 parseTokenWithWhitespace(EQUALS, errorCode);
901
902 UnicodeString rhsStr;
903 Operand rand;
904 // Parse RHS, which is either a literal or variable
905 switch (peek()) {
906 case DOLLAR: {
907 rand = Operand(parseVariableName(errorCode));
908 break;
909 }
910 default: {
911 // Must be a literal
912 rand = Operand(parseLiteral(errorCode));
913 break;
914 }
915 }
916 U_ASSERT(!rand.isNull());
917
918 // Finally, add the key=value mapping
919 // Use a local error code, check for duplicate option error and
920 // record it as with other errors
921 UErrorCode status = U_ZERO_ERROR;
922 addOption.addOption(lhs, std::move(rand), status);
923 if (U_FAILURE(status)) {
924 U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
925 errors.setDuplicateOptionName(errorCode);
926 }
927 }
928
929 /*
930 Note: there are multiple overloads of parseOptions() for parsing
931 options within markup, vs. within an expression, vs. parsing
932 attributes. This should be refactored. TODO
933 */
934
935 /*
936 Consume optional whitespace followed by a sequence of options
937 (possibly empty), separated by whitespace
938 */
939 template <class T>
parseOptions(OptionAdder<T> & addOption,UErrorCode & errorCode)940 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
941 // Early exit if out of bounds -- no more work is possible
942 CHECK_BOUNDS(errorCode);
943
944 /*
945 Arbitrary lookahead is required to parse option lists. To see why, consider
946 these rules from the grammar:
947
948 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
949 annotation = (function *(s option)) / reserved
950
951 And this example:
952 {:foo }
953
954 Derivation:
955 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
956 -> "{" [s] annotation [s] "}"
957 -> "{" [s] ((function *(s option)) / reserved) [s] "}"
958 -> "{" [s] function *(s option) [s] "}"
959
960 In this example, knowing whether to expect a '}' or the start of another option
961 after the whitespace would require arbitrary lookahead -- in other words, which
962 rule should we apply?
963 *(s option) -> s option *(s option)
964 or
965 *(s option) ->
966
967 The same would apply to the example {:foo k=v } (note the trailing space after "v").
968
969 This is addressed using a form of backtracking and (to make the backtracking easier
970 to apply) a slight refactoring to the grammar.
971
972 This code is written as if the grammar is:
973 expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
974 annotation = (function *(s option) [s]) / (reserved [s])
975
976 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
977 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
978
979 Note that when "backtracking" really just means early exit, since only whitespace
980 is involved and there's no state to save.
981
982 There is a separate but similar ambiguity as to whether the space precedes
983 an option or an attribute.
984 */
985
986 while(true) {
987 // If the next character is not whitespace, that means we've already
988 // parsed the entire options list (which may have been empty) and there's
989 // no trailing whitespace. In that case, exit.
990 if (!isWhitespace(peek())) {
991 break;
992 }
993 int32_t firstWhitespace = index;
994
995 // In any case other than an empty options list, there must be at least
996 // one whitespace character.
997 parseRequiredWhitespace(errorCode);
998 // Restore precondition
999 CHECK_BOUNDS(errorCode);
1000
1001 // If a name character follows, then at least one more option remains
1002 // in the list.
1003 // Otherwise, we've consumed all the options and any trailing whitespace,
1004 // and can exit.
1005 // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1006 // so we back out to [s].
1007 if (!isNameStart(peek())) {
1008 // We've consumed all the options (meaning that either we consumed non-empty
1009 // whitespace, or consumed at least one option.)
1010 // Done.
1011 // Remove the required whitespace from normalizedInput
1012 normalizedInput.truncate(normalizedInput.length() - 1);
1013 // "Backtrack" so as to leave the optional whitespace there
1014 // when parsing attributes
1015 index = firstWhitespace;
1016 break;
1017 }
1018 parseOption(addOption, errorCode);
1019 }
1020 }
1021
1022 /*
1023 Consume optional whitespace followed by a sequence of attributes
1024 (possibly empty), separated by whitespace
1025 */
1026 template<class T>
parseAttributes(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)1027 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1028
1029 // Early exit if out of bounds -- no more work is possible
1030 if (!inBounds()) {
1031 ERROR(errorCode);
1032 return;
1033 }
1034
1035 /*
1036 Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1037 (See comment in parseOptions()).
1038 */
1039
1040 while(true) {
1041 // If the next character is not whitespace, that means we've already
1042 // parsed the entire attributes list (which may have been empty) and there's
1043 // no trailing whitespace. In that case, exit.
1044 if (!isWhitespace(peek())) {
1045 break;
1046 }
1047
1048 // In any case other than an empty attributes list, there must be at least
1049 // one whitespace character.
1050 parseRequiredWhitespace(errorCode);
1051 // Restore precondition
1052 if (!inBounds()) {
1053 ERROR(errorCode);
1054 break;
1055 }
1056
1057 // If an '@' follows, then at least one more attribute remains
1058 // in the list.
1059 // Otherwise, we've consumed all the attributes and any trailing whitespace,
1060 // and can exit.
1061 // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1062 // so we back out to [s].
1063 if (peek() != AT) {
1064 // We've consumed all the attributes (meaning that either we consumed non-empty
1065 // whitespace, or consumed at least one attribute.)
1066 // Done.
1067 // Remove the whitespace from normalizedInput
1068 normalizedInput.truncate(normalizedInput.length() - 1);
1069 break;
1070 }
1071 parseAttribute(attrAdder, errorCode);
1072 }
1073 }
1074
1075 /*
1076 Consume a function call, matching the `annotation`
1077 nonterminal in the grammar
1078
1079 Returns an `Operator` representing this (a reserved is a parse error)
1080 */
parseAnnotation(UErrorCode & status)1081 Operator Parser::parseAnnotation(UErrorCode& status) {
1082 U_ASSERT(inBounds());
1083 Operator::Builder ratorBuilder(status);
1084 if (U_FAILURE(status)) {
1085 return {};
1086 }
1087 if (isFunctionStart(peek())) {
1088 // Consume the function name
1089 FunctionName func = parseFunction(status);
1090 ratorBuilder.setFunctionName(std::move(func));
1091
1092 OptionAdder<Operator::Builder> addOptions(ratorBuilder);
1093 // Consume the options (which may be empty)
1094 parseOptions(addOptions, status);
1095 } else {
1096 ERROR(status);
1097 }
1098 return ratorBuilder.build(status);
1099 }
1100
1101 /*
1102 Consume a literal or variable (depending on `isVariable`),
1103 followed by either required whitespace followed by an annotation,
1104 or optional whitespace.
1105 */
parseLiteralOrVariableWithAnnotation(bool isVariable,Expression::Builder & builder,UErrorCode & status)1106 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
1107 Expression::Builder& builder,
1108 UErrorCode& status) {
1109 CHECK_ERROR(status);
1110
1111 U_ASSERT(inBounds());
1112
1113 Operand rand;
1114 if (isVariable) {
1115 rand = Operand(parseVariableName(status));
1116 } else {
1117 rand = Operand(parseLiteral(status));
1118 }
1119
1120 builder.setOperand(std::move(rand));
1121
1122 /*
1123 Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
1124 To see why, consider this rule from the grammar:
1125
1126 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1127
1128 And this example:
1129
1130 {|foo| }
1131
1132 Derivation:
1133 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1134 -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
1135 -> "{" [s] (literal [s annotation]) [s] "}"
1136
1137 When reading the ' ' after the second '|', it's ambiguous whether that's the required
1138 space before an annotation, or the optional space before the '}'.
1139
1140 To make this ambiguity easier to handle, this code is based on the same grammar
1141 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
1142 the comment in `parseOptions()` for details.
1143 */
1144
1145 if (isWhitespace(peek())) {
1146 int32_t firstWhitespace = index;
1147
1148 // If the next character is whitespace, either [s annotation] or [s] applies
1149 // (the character is either the required space before an annotation, or optional
1150 // trailing space after the literal or variable). It's still ambiguous which
1151 // one does apply.
1152 parseOptionalWhitespace(status);
1153 // Restore precondition
1154 CHECK_BOUNDS(status);
1155
1156 // This next check resolves the ambiguity between [s annotation] and [s]
1157 bool isSAnnotation = isAnnotationStart(peek());
1158
1159 if (isSAnnotation) {
1160 normalizedInput += SPACE;
1161 }
1162
1163 if (isSAnnotation) {
1164 // The previously consumed whitespace precedes an annotation
1165 builder.setOperator(parseAnnotation(status));
1166 } else {
1167 // Either there's a right curly brace (will be consumed by the caller),
1168 // or there's an error and the trailing whitespace should be
1169 // handled by the caller. However, this is not an error
1170 // here because we're just parsing `literal [s annotation]`.
1171 index = firstWhitespace;
1172 }
1173 } else {
1174 // Either there was never whitespace, or
1175 // the previously consumed whitespace is the optional trailing whitespace;
1176 // either the next character is '}' or the error will be handled by parseExpression.
1177 // Do nothing, since the operand was already set
1178 }
1179
1180 // At the end of this code, the next character should either be '}',
1181 // whitespace followed by a '}',
1182 // or end-of-input
1183 }
1184
/*
  parseExpression() (defined after the exprFallback() helpers) consumes an
  expression, matching the `expression` nonterminal in the grammar
*/
1188
exprFallback(Expression::Builder & exprBuilder)1189 static void exprFallback(Expression::Builder& exprBuilder) {
1190 // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER
1191 // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1192 exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1193 }
1194
exprFallback(UErrorCode & status)1195 static Expression exprFallback(UErrorCode& status) {
1196 Expression result;
1197 if (U_SUCCESS(status)) {
1198 Expression::Builder exprBuilder(status);
1199 if (U_SUCCESS(status)) {
1200 // Construct a literal consisting just of The U+FFFD REPLACEMENT CHARACTER
1201 // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1202 exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1203 UErrorCode status = U_ZERO_ERROR;
1204 result = exprBuilder.build(status);
1205 // An operand was set, so there can't be an error
1206 U_ASSERT(U_SUCCESS(status));
1207 }
1208 }
1209 return result;
1210 }
1211
parseExpression(UErrorCode & status)1212 Expression Parser::parseExpression(UErrorCode& status) {
1213 if (U_FAILURE(status)) {
1214 return {};
1215 }
1216
1217 // Early return if out of input -- no more work is possible
1218 U_ASSERT(inBounds());
1219
1220 // Parse opening brace
1221 parseToken(LEFT_CURLY_BRACE, status);
1222 // Optional whitespace after opening brace
1223 parseOptionalWhitespace(status);
1224
1225 Expression::Builder exprBuilder(status);
1226 // Restore precondition
1227 if (!inBounds()) {
1228 exprFallback(exprBuilder);
1229 } else {
1230 // literal '|', variable '$' or annotation
1231 switch (peek()) {
1232 case PIPE: {
1233 // Quoted literal
1234 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1235 break;
1236 }
1237 case DOLLAR: {
1238 // Variable
1239 parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
1240 break;
1241 }
1242 default: {
1243 if (isAnnotationStart(peek())) {
1244 Operator rator = parseAnnotation(status);
1245 exprBuilder.setOperator(std::move(rator));
1246 } else if (isUnquotedStart(peek())) {
1247 // Unquoted literal
1248 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1249 } else {
1250 // Not a literal, variable or annotation -- error out
1251 ERROR(status);
1252 exprFallback(exprBuilder);
1253 break;
1254 }
1255 break;
1256 }
1257 }
1258 }
1259
1260 // Parse attributes
1261 AttributeAdder<Expression::Builder> attrAdder(exprBuilder);
1262 parseAttributes(attrAdder, status);
1263
1264 // Parse optional space
1265 // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
1266 parseOptionalWhitespace(status);
1267
1268 // Either an operand or operator (or both) must have been set already,
1269 // so there can't be an error
1270 UErrorCode localStatus = U_ZERO_ERROR;
1271 Expression result = exprBuilder.build(localStatus);
1272 U_ASSERT(U_SUCCESS(localStatus));
1273
1274 // Check for end-of-input and missing '}'
1275 if (!inBounds()) {
1276 ERROR(status);
1277 } else {
1278 // Otherwise, it's safe to check for the '}'
1279 parseToken(RIGHT_CURLY_BRACE, status);
1280 }
1281 return result;
1282 }
1283
1284 /*
1285 Parse a .local declaration, matching the `local-declaration`
1286 production in the grammar
1287 */
parseLocalDeclaration(UErrorCode & status)1288 void Parser::parseLocalDeclaration(UErrorCode& status) {
1289 // End-of-input here would be an error; even empty
1290 // declarations must be followed by a body
1291 CHECK_BOUNDS(status);
1292
1293 parseToken(ID_LOCAL, status);
1294 parseRequiredWhitespace(status);
1295
1296 // Restore precondition
1297 CHECK_BOUNDS(status);
1298 VariableName lhs = parseVariableName(status);
1299 parseTokenWithWhitespace(EQUALS, status);
1300 // Restore precondition before calling parseExpression()
1301 CHECK_BOUNDS(status);
1302
1303 Expression rhs = parseExpression(status);
1304
1305 // Add binding from lhs to rhs, unless there was an error
1306 // (This ensures that if there was a correct lhs but a
1307 // parse error in rhs, the fallback for uses of the
1308 // lhs will be its own name rather than the rhs)
1309 /* This affects the behavior of this test case, which the spec
1310 is ambiguous about:
1311
1312 .local $bar {|foo|} {{{$bar}}}
1313
1314 Should `$bar` still be bound to a value although
1315 its declaration is syntactically incorrect (missing the '=')?
1316 This code says no, but it needs to change if
1317 https://github.com/unicode-org/message-format-wg/issues/703
1318 is resolved differently.
1319 */
1320 CHECK_ERROR(status);
1321 if (!errors.hasSyntaxError()) {
1322 dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
1323 // Check if status is U_DUPLICATE_DECLARATION_ERROR
1324 // and add that as an internal error if so
1325 if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1326 status = U_ZERO_ERROR;
1327 errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1328 }
1329 }
1330 }
1331
1332 /*
1333 Parse an .input declaration, matching the `local-declaration`
1334 production in the grammar
1335 */
parseInputDeclaration(UErrorCode & status)1336 void Parser::parseInputDeclaration(UErrorCode& status) {
1337 // End-of-input here would be an error; even empty
1338 // declarations must be followed by a body
1339 CHECK_BOUNDS(status);
1340
1341 parseToken(ID_INPUT, status);
1342 parseOptionalWhitespace(status);
1343
1344 // Restore precondition before calling parseExpression()
1345 CHECK_BOUNDS(status);
1346
1347 // Save the index for error diagnostics
1348 int32_t exprIndex = index;
1349 Expression rhs = parseExpression(status);
1350
1351 // Here we have to check that the rhs is a variable-expression
1352 if (!rhs.getOperand().isVariable()) {
1353 // This case is a syntax error; report it at the beginning
1354 // of the expression
1355 ERROR_AT(status, exprIndex);
1356 return;
1357 }
1358
1359 VariableName lhs = rhs.getOperand().asVariable();
1360
1361 // Add binding from lhs to rhs
1362 // This just adds a new local variable that shadows the message
1363 // argument referred to, which is harmless.
1364 // When evaluating the RHS, the new local is not in scope
1365 // and the message argument will be correctly referred to.
1366 CHECK_ERROR(status);
1367 if (!errors.hasSyntaxError()) {
1368 dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
1369 // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
1370 // and add that as an internal error if so
1371 if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1372 status = U_ZERO_ERROR;
1373 errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1374 }
1375 }
1376 }
1377
1378 /*
1379 Consume a possibly-empty sequence of declarations separated by whitespace;
1380 each declaration matches the `declaration` nonterminal in the grammar
1381
1382 Builds up an environment representing those declarations
1383 */
parseDeclarations(UErrorCode & status)1384 void Parser::parseDeclarations(UErrorCode& status) {
1385 // End-of-input here would be an error; even empty
1386 // declarations must be followed by a body
1387 CHECK_BOUNDS(status);
1388
1389 while (peek() == PERIOD) {
1390 CHECK_BOUNDS_1(status);
1391 if (peek(1) == ID_LOCAL[1]) {
1392 parseLocalDeclaration(status);
1393 } else if (peek(1) == ID_INPUT[1]) {
1394 parseInputDeclaration(status);
1395 } else {
1396 // Done parsing declarations
1397 break;
1398 }
1399
1400 // Avoid looping infinitely
1401 CHECK_ERROR(status);
1402
1403 parseOptionalWhitespace(status);
1404 // Restore precondition
1405 CHECK_BOUNDS(status);
1406 }
1407 }
1408
1409 /*
1410 Consume a text character
1411 matching the `text-char` nonterminal in the grammar
1412
1413 No postcondition (a message can end with a text-char)
1414 */
parseTextChar(UErrorCode & status)1415 UnicodeString Parser::parseTextChar(UErrorCode& status) {
1416 UnicodeString str;
1417 if (!inBounds() || !(isTextChar(peek()))) {
1418 // Error -- text-char is expected here
1419 ERROR(status);
1420 } else {
1421 // See comment in parseQuotedLiteral()
1422 if (isEscapableChar(peek())) {
1423 normalizedInput += BACKSLASH;
1424 }
1425 normalizedInput += peek();
1426 str += peek();
1427 next();
1428 maybeAdvanceLine();
1429 }
1430 return str;
1431 }
1432
1433 /*
1434 Consume an `nmtoken`, `literal`, or the string "*", matching
1435 the `key` nonterminal in the grammar
1436 */
parseKey(UErrorCode & status)1437 Key Parser::parseKey(UErrorCode& status) {
1438 U_ASSERT(inBounds());
1439
1440 Key k; // wildcard by default
1441 // Literal | '*'
1442 switch (peek()) {
1443 case ASTERISK: {
1444 next();
1445 normalizedInput += ASTERISK;
1446 // Guarantee postcondition
1447 if (!inBounds()) {
1448 ERROR(status);
1449 return k;
1450 }
1451 break;
1452 }
1453 default: {
1454 // Literal
1455 k = Key(parseLiteral(status));
1456 break;
1457 }
1458 }
1459 return k;
1460 }
1461
1462 /*
1463 Consume a non-empty sequence of `key`s separated by whitespace
1464
1465 Takes ownership of `keys`
1466 */
parseNonEmptyKeys(UErrorCode & status)1467 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
1468 SelectorKeys result;
1469
1470 if (U_FAILURE(status)) {
1471 return result;
1472 }
1473
1474 U_ASSERT(inBounds());
1475
1476 /*
1477 Arbitrary lookahead is required to parse key lists. To see why, consider
1478 this rule from the grammar:
1479
1480 variant = key *(s key) [s] quoted-pattern
1481
1482 And this example:
1483 when k1 k2 {a}
1484
1485 Derivation:
1486 variant -> key *(s key) [s] quoted-pattern
1487 -> key s key *(s key) quoted-pattern
1488
1489 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
1490 to know whether to expect the start of a pattern or the start of another key.
1491 In other words: is the second whitespace sequence the required space in *(s key),
1492 or the optional space in [s] quoted-pattern?
1493
1494 This is addressed using "backtracking" (similarly to `parseOptions()`).
1495 */
1496
1497 SelectorKeys::Builder keysBuilder(status);
1498 if (U_FAILURE(status)) {
1499 return result;
1500 }
1501
1502 // Since the first key is required, it's simplest to parse it separately.
1503 keysBuilder.add(parseKey(status), status);
1504
1505 // Restore precondition
1506 if (!inBounds()) {
1507 ERROR(status);
1508 return result;
1509 }
1510
1511 // We've seen at least one whitespace-key pair, so now we can parse
1512 // *(s key) [s]
1513 while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors
1514 bool wasWhitespace = isWhitespace(peek());
1515 parseRequiredWhitespace(status);
1516 if (!wasWhitespace) {
1517 // Avoid infinite loop when parsing something like:
1518 // when * @{!...
1519 next();
1520 }
1521
1522 // Restore precondition
1523 if (!inBounds()) {
1524 ERROR(status);
1525 return result;
1526 }
1527
1528 // At this point, it's ambiguous whether we are inside (s key) or [s].
1529 // This check resolves that ambiguity.
1530 if (peek() == LEFT_CURLY_BRACE) {
1531 // A pattern follows, so what we just parsed was the optional
1532 // trailing whitespace. All the keys have been parsed.
1533
1534 // Unpush the whitespace from `normalizedInput`
1535 normalizedInput.truncate(normalizedInput.length() - 1);
1536 break;
1537 }
1538 keysBuilder.add(parseKey(status), status);
1539 }
1540
1541 return keysBuilder.build(status);
1542 }
1543
parseQuotedPattern(UErrorCode & status)1544 Pattern Parser::parseQuotedPattern(UErrorCode& status) {
1545 U_ASSERT(inBounds());
1546
1547 parseToken(LEFT_CURLY_BRACE, status);
1548 parseToken(LEFT_CURLY_BRACE, status);
1549 Pattern p = parseSimpleMessage(status);
1550 parseToken(RIGHT_CURLY_BRACE, status);
1551 parseToken(RIGHT_CURLY_BRACE, status);
1552 return p;
1553 }
1554
1555 /*
1556 Consume a `placeholder`, matching the nonterminal in the grammar
1557 No postcondition (a markup can end a message)
1558 */
parseMarkup(UErrorCode & status)1559 Markup Parser::parseMarkup(UErrorCode& status) {
1560 U_ASSERT(inBounds(1));
1561
1562 U_ASSERT(peek() == LEFT_CURLY_BRACE);
1563
1564 Markup::Builder builder(status);
1565 if (U_FAILURE(status)) {
1566 return {};
1567 }
1568
1569 // Consume the '{'
1570 next();
1571 normalizedInput += LEFT_CURLY_BRACE;
1572 parseOptionalWhitespace(status);
1573 bool closing = false;
1574 switch (peek()) {
1575 case NUMBER_SIGN: {
1576 // Open or standalone; consume the '#'
1577 normalizedInput += peek();
1578 next();
1579 break;
1580 }
1581 case SLASH: {
1582 // Closing
1583 normalizedInput += peek();
1584 closing = true;
1585 next();
1586 break;
1587 }
1588 default: {
1589 ERROR(status);
1590 return {};
1591 }
1592 }
1593
1594 // Parse the markup identifier
1595 builder.setName(parseIdentifier(status));
1596
1597 // Parse the options, which must begin with a ' '
1598 // if present
1599 if (inBounds() && isWhitespace(peek())) {
1600 OptionAdder<Markup::Builder> optionAdder(builder);
1601 parseOptions(optionAdder, status);
1602 }
1603
1604 // Parse the attributes, which also must begin
1605 // with a ' '
1606 if (inBounds() && isWhitespace(peek())) {
1607 AttributeAdder<Markup::Builder> attrAdder(builder);
1608 parseAttributes(attrAdder, status);
1609 }
1610
1611 parseOptionalWhitespace(status);
1612
1613 bool standalone = false;
1614 // Check if this is a standalone or not
1615 if (!closing) {
1616 if (inBounds() && peek() == SLASH) {
1617 standalone = true;
1618 normalizedInput += SLASH;
1619 next();
1620 }
1621 }
1622
1623 parseToken(RIGHT_CURLY_BRACE, status);
1624
1625 if (standalone) {
1626 builder.setStandalone();
1627 } else if (closing) {
1628 builder.setClose();
1629 } else {
1630 builder.setOpen();
1631 }
1632
1633 return builder.build(status);
1634 }
1635
1636 /*
1637 Consume a `placeholder`, matching the nonterminal in the grammar
1638 No postcondition (a placeholder can end a message)
1639 */
parsePlaceholder(UErrorCode & status)1640 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
1641 U_ASSERT(peek() == LEFT_CURLY_BRACE);
1642
1643 if (!inBounds()) {
1644 ERROR(status);
1645 return exprFallback(status);
1646 }
1647
1648 // Need to look ahead arbitrarily since whitespace
1649 // can appear before the '{' and '#'
1650 // in markup
1651 int32_t tempIndex = 1;
1652 bool isMarkup = false;
1653 while (inBounds(1)) {
1654 UChar32 c = peek(tempIndex);
1655 if (c == NUMBER_SIGN || c == SLASH) {
1656 isMarkup = true;
1657 break;
1658 }
1659 if (!isWhitespace(c)){
1660 break;
1661 }
1662 tempIndex++;
1663 }
1664
1665 if (isMarkup) {
1666 return parseMarkup(status);
1667 }
1668 return parseExpression(status);
1669 }
1670
1671 /*
1672 Consume a `simple-message`, matching the nonterminal in the grammar
1673 Postcondition: `index == len()` or U_FAILURE(status);
1674 for a syntactically correct message, this will consume the entire input
1675 */
parseSimpleMessage(UErrorCode & status)1676 Pattern Parser::parseSimpleMessage(UErrorCode& status) {
1677 Pattern::Builder result(status);
1678
1679 if (U_SUCCESS(status)) {
1680 Expression expression;
1681 while (inBounds()) {
1682 switch (peek()) {
1683 case LEFT_CURLY_BRACE: {
1684 // Must be placeholder
1685 std::variant<Expression, Markup> piece = parsePlaceholder(status);
1686 if (std::holds_alternative<Expression>(piece)) {
1687 Expression expr = *std::get_if<Expression>(&piece);
1688 result.add(std::move(expr), status);
1689 } else {
1690 Markup markup = *std::get_if<Markup>(&piece);
1691 result.add(std::move(markup), status);
1692 }
1693 break;
1694 }
1695 case BACKSLASH: {
1696 // Must be escaped-char
1697 result.add(parseEscapeSequence(status), status);
1698 break;
1699 }
1700 case RIGHT_CURLY_BRACE: {
1701 // Distinguish unescaped '}' from end of quoted pattern
1702 break;
1703 }
1704 default: {
1705 // Must be text-char
1706 result.add(parseTextChar(status), status);
1707 break;
1708 }
1709 }
1710 if (peek() == RIGHT_CURLY_BRACE) {
1711 // End of quoted pattern
1712 break;
1713 }
1714 // Don't loop infinitely
1715 if (errors.hasSyntaxError()) {
1716 break;
1717 }
1718 }
1719 }
1720 return result.build(status);
1721 }
1722
1723
1724 /*
1725 Consume a `selectors` (matching the nonterminal in the grammar),
1726 followed by a non-empty sequence of `variant`s (matching the nonterminal
1727 in the grammar) preceded by whitespace
1728 No postcondition (on return, `index` might equal `len()` with no syntax error
1729 because a message can end with a variant)
1730 */
parseSelectors(UErrorCode & status)1731 void Parser::parseSelectors(UErrorCode& status) {
1732 CHECK_ERROR(status);
1733
1734 U_ASSERT(inBounds());
1735
1736 parseToken(ID_MATCH, status);
1737
1738 bool empty = true;
1739 // Parse selectors
1740 // "Backtracking" is required here. It's not clear if whitespace is
1741 // (`[s]` selector) or (`[s]` variant)
1742 while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) {
1743 parseOptionalWhitespace(status);
1744 // Restore precondition
1745 CHECK_BOUNDS(status);
1746 if (peek() != LEFT_CURLY_BRACE) {
1747 // This is not necessarily an error, but rather,
1748 // means the whitespace we parsed was the optional
1749 // whitespace preceding the first variant, not the
1750 // optional whitespace preceding a subsequent expression.
1751 break;
1752 }
1753 Expression expression;
1754 expression = parseExpression(status);
1755 empty = false;
1756
1757 dataModel.addSelector(std::move(expression), status);
1758 CHECK_ERROR(status);
1759 }
1760
1761 // At least one selector is required
1762 if (empty) {
1763 ERROR(status);
1764 return;
1765 }
1766
1767 #define CHECK_END_OF_INPUT \
1768 if (!inBounds()) { \
1769 break; \
1770 } \
1771
1772 // Parse variants
1773 while (isWhitespace(peek()) || isKeyStart(peek())) {
1774 // Trailing whitespace is allowed
1775 parseOptionalWhitespace(status);
1776 if (!inBounds()) {
1777 return;
1778 }
1779
1780 // At least one key is required
1781 SelectorKeys keyList(parseNonEmptyKeys(status));
1782
1783 CHECK_ERROR(status);
1784
1785 // parseNonEmptyKeys() consumes any trailing whitespace,
1786 // so the pattern can be consumed next.
1787
1788 // Restore precondition before calling parsePattern()
1789 // (which must return a non-null value)
1790 CHECK_BOUNDS(status);
1791 Pattern rhs = parseQuotedPattern(status);
1792
1793 dataModel.addVariant(std::move(keyList), std::move(rhs), status);
1794
1795 // Restore the precondition, *without* erroring out if we've
1796 // reached the end of input. That's because it's valid for the
1797 // message to end with a variant that has no trailing whitespace.
1798 // Why do we need to check this condition twice inside the loop?
1799 // Because if we don't check it here, the `isWhitespace()` call in
1800 // the loop head will read off the end of the input string.
1801 CHECK_END_OF_INPUT
1802 }
1803 }
1804
/*
  parseBody() (defined after the errorPattern() helper) consumes a `body`,
  matching the nonterminal in the grammar.
  No postcondition (on return, `index` might equal `len()` with no syntax error,
  because a message can end with a body; trailing whitespace is optional)
*/
1810
errorPattern(UErrorCode & status)1811 void Parser::errorPattern(UErrorCode& status) {
1812 errors.addSyntaxError(status);
1813 // Set to empty pattern
1814 Pattern::Builder result = Pattern::Builder(status);
1815 CHECK_ERROR(status);
1816
1817 // If still in bounds, then add the remaining input as a single text part
1818 // to the pattern
1819 /*
1820 TODO: this behavior isn't documented in the spec, but it comes from
1821 https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
1822 and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
1823 whether this is the intent behind the spec
1824 */
1825 UnicodeString partStr(LEFT_CURLY_BRACE);
1826 while (inBounds()) {
1827 partStr += peek();
1828 next();
1829 }
1830 // Add curly braces around the entire output (same comment as above)
1831 partStr += RIGHT_CURLY_BRACE;
1832 result.add(std::move(partStr), status);
1833 dataModel.setPattern(result.build(status));
1834 }
1835
parseBody(UErrorCode & status)1836 void Parser::parseBody(UErrorCode& status) {
1837 CHECK_ERROR(status);
1838
1839 // Out-of-input is a syntax warning
1840 if (!inBounds()) {
1841 errorPattern(status);
1842 return;
1843 }
1844
1845 // Body must be either a pattern or selectors
1846 switch (peek()) {
1847 case LEFT_CURLY_BRACE: {
1848 // Pattern
1849 dataModel.setPattern(parseQuotedPattern(status));
1850 break;
1851 }
1852 case ID_MATCH[0]: {
1853 // Selectors
1854 parseSelectors(status);
1855 return;
1856 }
1857 default: {
1858 ERROR(status);
1859 errorPattern(status);
1860 return;
1861 }
1862 }
1863 }
1864
1865 // -------------------------------------
1866 // Parses the source pattern.
1867
parse(UParseError & parseErrorResult,UErrorCode & status)1868 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
1869 CHECK_ERROR(status);
1870
1871 bool complex = false;
1872 // First, "look ahead" to determine if this is a simple or complex
1873 // message. To do that, check the first non-whitespace character.
1874 while (inBounds(index) && isWhitespace(peek())) {
1875 next();
1876 }
1877
1878 // Message can be empty, so we need to only look ahead
1879 // if we know it's non-empty
1880 if (inBounds()) {
1881 if (peek() == PERIOD
1882 || (inBounds(1)
1883 && peek() == LEFT_CURLY_BRACE
1884 && peek(1) == LEFT_CURLY_BRACE)) {
1885 complex = true;
1886 }
1887 }
1888 // Reset index
1889 index = 0;
1890
1891 // Message can be empty, so we need to only look ahead
1892 // if we know it's non-empty
1893 if (complex) {
1894 parseOptionalWhitespace(status);
1895 parseDeclarations(status);
1896 parseBody(status);
1897 parseOptionalWhitespace(status);
1898 } else {
1899 // Simple message
1900 // For normalization, quote the pattern
1901 normalizedInput += LEFT_CURLY_BRACE;
1902 normalizedInput += LEFT_CURLY_BRACE;
1903 dataModel.setPattern(parseSimpleMessage(status));
1904 normalizedInput += RIGHT_CURLY_BRACE;
1905 normalizedInput += RIGHT_CURLY_BRACE;
1906 }
1907
1908 CHECK_ERROR(status);
1909
1910 // There are no errors; finally, check that the entire input was consumed
1911 if (!allConsumed()) {
1912 ERROR(status);
1913 }
1914
1915 // Finally, copy the relevant fields of the internal `MessageParseError`
1916 // into the `UParseError` argument
1917 translateParseError(parseError, parseErrorResult);
1918 }
1919
~Parser()1920 Parser::~Parser() {}
1921
1922 } // namespace message2
1923 U_NAMESPACE_END
1924
1925 #endif /* #if !UCONFIG_NO_MF2 */
1926
1927 #endif /* #if !UCONFIG_NO_FORMATTING */
1928
1929