• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2024 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #if !UCONFIG_NO_MF2
9 
10 #include "messageformat2_errors.h"
11 #include "messageformat2_macros.h"
12 #include "messageformat2_parser.h"
13 #include "uvector.h" // U_ASSERT
14 
15 U_NAMESPACE_BEGIN
16 
17 namespace message2 {
18 
19 using namespace pluralimpl;
20 
21 using namespace data_model;
22 
/*
    The `ERROR()` macro records a syntax error in `errors` and sets the
    offset in `parseError` from the current `index`. If `errors` already
    holds a syntax error, the macro does nothing, so only the first
    syntax error is reported. It does not alter control flow.

    The body is wrapped in `do { ... } while (false)` so that an invocation
    followed by a semicolon forms exactly one statement.
*/
#define ERROR(errorCode)                                                                                \
    do {                                                                                                \
        if (!errors.hasSyntaxError()) {                                                                 \
            setParseError(parseError, index);                                                           \
            errors.addSyntaxError(errorCode);                                                           \
        }                                                                                               \
    } while (false)
32 
/*
    Like `ERROR()`, but the offset in `parseError` is computed from the
    explicit position `i` rather than from the current `index`.
    Does nothing if `errors` already holds a syntax error.
    Wrapped in `do { ... } while (false)` so that an invocation followed by
    a semicolon forms exactly one statement.
*/
#define ERROR_AT(errorCode, i)                                                                          \
    do {                                                                                                \
        if (!errors.hasSyntaxError()) {                                                                 \
            setParseError(parseError, i);                                                               \
            errors.addSyntaxError(errorCode);                                                           \
        }                                                                                               \
    } while (false)
38 
39 // Increments the line number and updates the "characters seen before
40 // current line" count in `parseError`, iff `peek()` is a newline
maybeAdvanceLine()41 void Parser::maybeAdvanceLine() {
42     if (peek() == LF) {
43         parseError.line++;
44         // add 1 to index to get the number of characters seen so far
45         // (including the newline)
46         parseError.lengthBeforeCurrentLine = index + 1;
47     }
48 }
49 
/*
    Signals a syntax error (via `ERROR()`) and returns from the enclosing
    function if `inBounds()` is false, i.e. `index` is out of bounds for
    the input. Because it contains a bare `return;`, it can only be used
    inside functions returning `void`.

    Wrapped in `do { ... } while (false)` so that an invocation followed by
    a semicolon forms exactly one statement.
*/
#define CHECK_BOUNDS(errorCode)                                                            \
    do {                                                                                   \
        if (!inBounds()) {                                                                 \
            ERROR(errorCode);                                                              \
            return;                                                                        \
        }                                                                                  \
    } while (false)
/*
    Like `CHECK_BOUNDS()`, but tests `inBounds(1)` and reports the error at
    position `index + 1`. Contains a bare `return;`, so it can only be used
    inside functions returning `void`.
*/
#define CHECK_BOUNDS_1(errorCode)                                                          \
    do {                                                                                   \
        if (!inBounds(1)) {                                                                \
            ERROR_AT(errorCode, index + 1);                                                \
            return;                                                                        \
        }                                                                                  \
    } while (false)
64 
65 // -------------------------------------
66 // Helper functions
67 
copyContext(const UChar in[U_PARSE_CONTEXT_LEN],UChar out[U_PARSE_CONTEXT_LEN])68 static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_CONTEXT_LEN]) {
69     for (int32_t i = 0; i < U_PARSE_CONTEXT_LEN; i++) {
70         out[i] = in[i];
71         if (in[i] == '\0') {
72             break;
73         }
74     }
75 }
76 
translateParseError(const MessageParseError & messageParseError,UParseError & parseError)77 /* static */ void Parser::translateParseError(const MessageParseError &messageParseError, UParseError &parseError) {
78     parseError.line = messageParseError.line;
79     parseError.offset = messageParseError.offset;
80     copyContext(messageParseError.preContext, parseError.preContext);
81     copyContext(messageParseError.postContext, parseError.postContext);
82 }
83 
setParseError(MessageParseError & parseError,uint32_t index)84 /* static */ void Parser::setParseError(MessageParseError &parseError, uint32_t index) {
85     // Translate absolute to relative offset
86     parseError.offset = index                               // Start with total number of characters seen
87                       - parseError.lengthBeforeCurrentLine; // Subtract all characters before the current line
88     // TODO: Fill this in with actual pre and post-context
89     parseError.preContext[0] = 0;
90     parseError.postContext[0] = 0;
91 }
92 
93 // -------------------------------------
94 // Predicates
95 
96 // Returns true if `c` is in the interval [`first`, `last`]
inRange(UChar32 c,UChar32 first,UChar32 last)97 static bool inRange(UChar32 c, UChar32 first, UChar32 last) {
98     U_ASSERT(first < last);
99     return c >= first && c <= last;
100 }
101 
102 /*
103   The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar:
104 
105   `isContentChar()`   : `content-char`
106   `isTextChar()`      : `text-char`
107   `isAlpha()`         : `ALPHA`
108   `isDigit()`         : `DIGIT`
109   `isNameStart()`     : `name-start`
110   `isNameChar()`      : `name-char`
111   `isUnquotedStart()` : `unquoted-start`
112   `isQuotedChar()`    : `quoted-char`
113   `isWhitespace()`    : `s`
114 */
115 
isContentChar(UChar32 c)116 static bool isContentChar(UChar32 c) {
117     return inRange(c, 0x0001, 0x0008)    // Omit NULL, HTAB and LF
118            || inRange(c, 0x000B, 0x000C) // Omit CR
119            || inRange(c, 0x000E, 0x001F) // Omit SP
120            || inRange(c, 0x0021, 0x002D) // Omit '.'
121            || inRange(c, 0x002F, 0x003F) // Omit '@'
122            || inRange(c, 0x0041, 0x005B) // Omit '\'
123            || inRange(c, 0x005D, 0x007A) // Omit { | }
124            || inRange(c, 0x007E, 0xD7FF) // Omit surrogates
125            || inRange(c, 0xE000, 0x10FFFF);
126 }
127 
128 // See `s` in the MessageFormat 2 grammar
isWhitespace(UChar32 c)129 inline bool isWhitespace(UChar32 c) {
130     switch (c) {
131     case SPACE:
132     case HTAB:
133     case CR:
134     case LF:
135     case IDEOGRAPHIC_SPACE:
136         return true;
137     default:
138         return false;
139     }
140 }
141 
isTextChar(UChar32 c)142 static bool isTextChar(UChar32 c) {
143     return isContentChar(c)
144         || isWhitespace(c)
145         || c == PERIOD
146         || c == AT
147         || c == PIPE;
148 }
149 
isAlpha(UChar32 c)150 static bool isAlpha(UChar32 c) { return inRange(c, 0x0041, 0x005A) || inRange(c, 0x0061, 0x007A); }
151 
isDigit(UChar32 c)152 static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }
153 
isNameStart(UChar32 c)154 static bool isNameStart(UChar32 c) {
155     return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) ||
156            inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) ||
157            inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) ||
158            inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) ||
159            inRange(c, 0x10000, 0xEFFFF);
160 }
161 
isNameChar(UChar32 c)162 static bool isNameChar(UChar32 c) {
163     return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
164            inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
165 }
166 
isUnquotedStart(UChar32 c)167 static bool isUnquotedStart(UChar32 c) {
168     return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 ||
169            inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040);
170 }
171 
isQuotedChar(UChar32 c)172 static bool isQuotedChar(UChar32 c) {
173     return isContentChar(c)
174         || isWhitespace(c)
175         || c == PERIOD
176         || c == AT
177         || c == LEFT_CURLY_BRACE
178         || c == RIGHT_CURLY_BRACE;
179 }
180 
isEscapableChar(UChar32 c)181 static bool isEscapableChar(UChar32 c) {
182     return c == PIPE
183         || c == BACKSLASH
184         || c == LEFT_CURLY_BRACE
185         || c == RIGHT_CURLY_BRACE;
186 }
187 
188 // Returns true iff `c` can begin a `function` nonterminal
isFunctionStart(UChar32 c)189 static bool isFunctionStart(UChar32 c) {
190     switch (c) {
191     case COLON: {
192         return true;
193     }
194     default: {
195         return false;
196     }
197     }
198 }
199 
200 // Returns true iff `c` can begin an `annotation` nonterminal
isAnnotationStart(UChar32 c)201 static bool isAnnotationStart(UChar32 c) {
202     return isFunctionStart(c);
203 }
204 
205 // Returns true iff `c` can begin a `literal` nonterminal
isLiteralStart(UChar32 c)206 static bool isLiteralStart(UChar32 c) {
207     return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c));
208 }
209 
210 // Returns true iff `c` can begin a `key` nonterminal
isKeyStart(UChar32 c)211 static bool isKeyStart(UChar32 c) {
212     return (c == ASTERISK || isLiteralStart(c));
213 }
214 
isDeclarationStart()215 bool Parser::isDeclarationStart() {
216     return (peek() == ID_LOCAL[0]
217             && inBounds(1)
218             && peek(1) == ID_LOCAL[1])
219         || (peek() == ID_INPUT[0]
220             && inBounds(1)
221             && peek(1) == ID_INPUT[1]);
222 }
223 
224 // -------------------------------------
225 // Parsing functions
226 
227 
228 /*
229   TODO: Since handling the whitespace ambiguities needs to be repeated
230   in several different places and is hard to factor out,
231   it probably would be better to replace the parser with a lexer + parser
232   to separate tokenizing from parsing, which would simplify the code significantly.
233   This has the disadvantage that there is no token grammar for MessageFormat,
234   so one would have to be invented that isn't a component of the spec.
235  */
236 
237 /*
238     This is a recursive-descent scannerless parser that,
239     with a few exceptions, uses 1 character of lookahead.
240 
241     This may not be an exhaustive list, as the additions of attributes and reserved
242     statements introduced several new ambiguities.
243 
244 All but three of the exceptions involve ambiguities about the meaning of whitespace.
245 One ambiguity not involving whitespace is:
246 identifier -> namespace ":" name
247 vs.
248 identifier -> name
249 
250 `namespace` and `name` can't be distinguished without arbitrary lookahead.
251 (For how this is handled, see parseIdentifier())
252 
253 The second ambiguity not involving whitespace is:
254 complex-message -> *(declaration[s]) complex-body
255                 -> declaration *(declaration[s]) complex-body
256                 -> declaration complex-body
257                 -> reserved-statement complex-body
258                 -> .foo {$x} .match // ...
259 When processing the '.', arbitrary lookahead is required to distinguish the
260 arbitrary-length unsupported keyword from `.match`.
261 (For how this is handled, see parseDeclarations()).
262 
263 The third ambiguity not involving whitespace is:
264 complex-message -> *(declaration [s]) complex-body
265                 -> reserved-statement *(declaration [s]) complex-body
266                 -> reserved-statement complex-body
267                 -> reserved-statement quotedPattern
268                 -> reserved-keyword [s reserved-body] 1*([s] expression) quoted-pattern
269                 -> reserved-keyword expression quoted-pattern
270  Example: .foo {1} {{1}}
271 
272  Without lookahead, the opening '{' of the quoted pattern can't be distinguished
273  from the opening '{' of another expression in the unsupported statement.
274  (Though this only requires 1 character of lookahead.)
275 
276  Otherwise:
277 
278 There are at least seven ambiguities in the grammar that can't be resolved with finite
279 lookahead (since whitespace sequences can be arbitrarily long). They are resolved
280 with a form of backtracking (early exit). No state needs to be saved/restored
281 since whitespace doesn't affect the shape of the resulting parse tree, so it's
282 not true backtracking.
283 
284 In addition, the grammar has been refactored
285 in a semantics-preserving way in some cases to make the code easier to structure.
286 
287 First: variant = when 1*(s key) [s] pattern
288    Example: when k     {a}
289    When reading the first space after 'k', it's ambiguous whether it's the
290    required space before another key, or the optional space before `pattern`.
291  (See comments in parseNonEmptyKeys())
292 
293 Second: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
294         annotation = (function *(s option)) / reserved
295    Example: {:f    }
296    When reading the first space after 'f', it's ambiguous whether it's the
297    required space before an option, or the optional trailing space after an options list
298    (in this case, the options list is empty).
299  (See comments in parseOptions() -- handling this case also meant it was easier to base
300   the code on a slightly refactored grammar, which should be semantically equivalent.)
301 
302 Third: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
303         annotation = (function *(s option)) / reserved
304    Example: {@a }
305    Similar to the previous case; see comments in parseReserved()
306 
307 Fourth: expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
308    Example: {|foo|   }
309    When reading the first space after the '|', it's ambiguous whether it's the required
310    space before an annotation, or the optional trailing space before the '}'.
311   (See comments in parseLiteralOrVariableWithAnnotation(); handling this case relies on
312   the same grammar refactoring as the second exception.)
313 
314     Most functions match a non-terminal in the grammar, except as explained
315     in comments.
316 
317 Fifth: matcher = match-statement 1*([s] variant)
318                -> match 1 *([s] selector) 1*([s] variant)
319     Example: match {42} * {{_}}
320  When reading the space after the first '}', it's unclear whether
321  it's the optional space before another selector, or the optional space
322  before a variant.
323 
324 Sixth: annotation-expression = "{" [s] annotation *(s attribute) [s] "}"
325        -> "{" [s] function *(s attribute) [s] "}"
326        -> "{" [s] ":" identifier *(s option) *(s attribute) [s] "}"
327        -> "{" [s] ":" identifier s attribute *(s attribute) [s] "}"
328 
329      Example: {:func @foo}
330 (Note: the same ambiguity is present with variable-expression and literal-expression)
331 
332 Seventh:
333 
334 
335 When parsing the space, it's unclear whether it's the optional space before an
336 option, or the optional space before an attribute.
337 
 Unless otherwise noted in a comment, all helper functions that take
    a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode`
    have the precondition:
      `index` < `len()`
    and the postcondition:
      `U_FAILURE(errorCode)` || `index` < `len()`
344 */
345 
346 /*
347   No pre, no post.
348   A message may end with whitespace, so `index` may equal `len()` on exit.
349 */
parseWhitespaceMaybeRequired(bool required,UErrorCode & errorCode)350 void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) {
351     bool sawWhitespace = false;
352 
353     // The loop exits either when we consume all the input,
354     // or when we see a non-whitespace character.
355     while (true) {
356         // Check if all input has been consumed
357         if (!inBounds()) {
358             // If whitespace isn't required -- or if we saw it already --
359             // then the caller is responsible for checking this case and
360             // setting an error if necessary.
361             if (!required || sawWhitespace) {
362                 // Not an error.
363                 return;
364             }
365             // Otherwise, whitespace is required; the end of the input has
366             // been reached without whitespace. This is an error.
367             ERROR(errorCode);
368             return;
369         }
370 
371         // Input remains; process the next character if it's whitespace,
372         // exit the loop otherwise
373         if (isWhitespace(peek())) {
374             sawWhitespace = true;
375             // Increment line number in parse error if we consume a newline
376             maybeAdvanceLine();
377             next();
378         } else {
379             break;
380         }
381     }
382 
383     if (!sawWhitespace && required) {
384         ERROR(errorCode);
385     }
386 }
387 
388 /*
389   No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
390 */
parseRequiredWhitespace(UErrorCode & errorCode)391 void Parser::parseRequiredWhitespace(UErrorCode& errorCode) {
392     parseWhitespaceMaybeRequired(true, errorCode);
393     normalizedInput += SPACE;
394 }
395 
396 /*
397   No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`.
398 */
parseOptionalWhitespace(UErrorCode & errorCode)399 void Parser::parseOptionalWhitespace(UErrorCode& errorCode) {
400     parseWhitespaceMaybeRequired(false, errorCode);
401 }
402 
403 // Consumes a single character, signaling an error if `peek()` != `c`
404 // No postcondition -- a message can end with a '}' token
parseToken(UChar32 c,UErrorCode & errorCode)405 void Parser::parseToken(UChar32 c, UErrorCode& errorCode) {
406     CHECK_BOUNDS(errorCode);
407 
408     if (peek() == c) {
409         next();
410         normalizedInput += c;
411         return;
412     }
413     // Next character didn't match -- error out
414     ERROR(errorCode);
415 }
416 
417 /*
418    Consumes a fixed-length token, signaling an error if the token isn't a prefix of
419    the string beginning at `peek()`
420    No postcondition -- a message can end with a '}' token
421 */
parseToken(const std::u16string_view & token,UErrorCode & errorCode)422 void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) {
423     U_ASSERT(inBounds());
424 
425     int32_t tokenPos = 0;
426     while (tokenPos < static_cast<int32_t>(token.length())) {
427         if (peek() != token[tokenPos]) {
428             ERROR(errorCode);
429             return;
430         }
431         normalizedInput += token[tokenPos];
432         next();
433         tokenPos++;
434     }
435 }
436 
437 /*
438    Consumes optional whitespace, possibly advancing `index` to `index'`,
439    then consumes a fixed-length token (signaling an error if the token isn't a prefix of
440    the string beginning at `source[index']`),
441    then consumes optional whitespace again
442 */
parseTokenWithWhitespace(const std::u16string_view & token,UErrorCode & errorCode)443 void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) {
444     // No need for error check or bounds check before parseOptionalWhitespace
445     parseOptionalWhitespace(errorCode);
446     // Establish precondition
447     CHECK_BOUNDS(errorCode);
448     parseToken(token, errorCode);
449     parseOptionalWhitespace(errorCode);
450     // Guarantee postcondition
451     CHECK_BOUNDS(errorCode);
452 }
453 
454 /*
455    Consumes optional whitespace, possibly advancing `index` to `index'`,
456    then consumes a single character (signaling an error if it doesn't match
457    `source[index']`),
458    then consumes optional whitespace again
459 */
parseTokenWithWhitespace(UChar32 c,UErrorCode & errorCode)460 void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) {
461     // No need for error check or bounds check before parseOptionalWhitespace(errorCode)
462     parseOptionalWhitespace(errorCode);
463     // Establish precondition
464     CHECK_BOUNDS(errorCode);
465     parseToken(c, errorCode);
466     parseOptionalWhitespace(errorCode);
467     // Guarantee postcondition
468     CHECK_BOUNDS(errorCode);
469 }
470 
471 /*
472   Consumes a non-empty sequence of `name-char`s, the first of which is
473   also a `name-start`.
474   that begins with a character `start` such that `isNameStart(start)`.
475 
476   Returns this sequence.
477 
478   (Matches the `name` nonterminal in the grammar.)
479 */
parseName(UErrorCode & errorCode)480 UnicodeString Parser::parseName(UErrorCode& errorCode) {
481     UnicodeString name;
482 
483     U_ASSERT(inBounds());
484 
485     if (!isNameStart(peek())) {
486         ERROR(errorCode);
487         return name;
488     }
489 
490     while (isNameChar(peek())) {
491         UChar32 c = peek();
492         name += c;
493         normalizedInput += c;
494         next();
495         if (!inBounds()) {
496             ERROR(errorCode);
497             break;
498         }
499     }
500     return name;
501 }
502 
503 /*
504   Consumes a '$' followed by a `name`, returning a VariableName
505   with `name` as its name
506 
507   (Matches the `variable` nonterminal in the grammar.)
508 */
parseVariableName(UErrorCode & errorCode)509 VariableName Parser::parseVariableName(UErrorCode& errorCode) {
510     VariableName result;
511 
512     U_ASSERT(inBounds());
513     // If the '$' is missing, we don't want a binding
514     // for this variable to be created.
515     bool valid = peek() == DOLLAR;
516     parseToken(DOLLAR, errorCode);
517     if (!inBounds()) {
518         ERROR(errorCode);
519         return result;
520     }
521     UnicodeString varName = parseName(errorCode);
522     // Set the name to "" if the variable wasn't
523     // declared correctly
524     if (!valid) {
525         varName.remove();
526     }
527     return VariableName(varName);
528 }
529 
530 /*
531   Corresponds to the `identifier` nonterminal in the grammar
532 */
parseIdentifier(UErrorCode & errorCode)533 UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) {
534     U_ASSERT(inBounds());
535 
536     UnicodeString result;
537     // The following is a hack to get around ambiguity in the grammar:
538     // identifier -> namespace ":" name
539     // vs.
540     // identifier -> name
541     // can't be distinguished without arbitrary lookahead.
542     // Instead, we treat the production as:
543     // identifier -> namespace *(":"name)
544     // and then check for multiple colons.
545 
546     // Parse namespace
547     result += parseName(errorCode);
548     int32_t firstColon = -1;
549     while (inBounds() && peek() == COLON) {
550         // Parse ':' separator
551         if (firstColon == -1) {
552             firstColon = index;
553         }
554         parseToken(COLON, errorCode);
555         result += COLON;
556         // Check for message ending with something like "foo:"
557         if (!inBounds()) {
558             ERROR(errorCode);
559         } else {
560             // Parse name part
561             result += parseName(errorCode);
562         }
563     }
564 
565     // If there's at least one ':', scan from the first ':'
566     // to the end of the name to check for multiple ':'s
567     if (firstColon != -1) {
568         for (int32_t i = firstColon + 1; i < result.length(); i++) {
569             if (result[i] == COLON) {
570                 ERROR_AT(errorCode, i);
571                 return {};
572             }
573         }
574     }
575 
576     return result;
577 }
578 
579 /*
580   Consumes a reference to a function, matching the ": identifier"
581   in the `function` nonterminal in the grammar.
582 
583   Returns the function name.
584 */
parseFunction(UErrorCode & errorCode)585 FunctionName Parser::parseFunction(UErrorCode& errorCode) {
586     U_ASSERT(inBounds());
587     if (!isFunctionStart(peek())) {
588         ERROR(errorCode);
589         return FunctionName();
590     }
591 
592     normalizedInput += peek();
593     next(); // Consume the function start character
594     if (!inBounds()) {
595         ERROR(errorCode);
596         return FunctionName();
597     }
598     return parseIdentifier(errorCode);
599 }
600 
601 
602 /*
603   Precondition: peek() == BACKSLASH
604 
605   Consume an escaped character.
606   Corresponds to `escaped-char` in the grammar.
607 
608   No postcondition (a message can end with an escaped char)
609 */
parseEscapeSequence(UErrorCode & errorCode)610 UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) {
611     U_ASSERT(inBounds());
612     U_ASSERT(peek() == BACKSLASH);
613     normalizedInput += BACKSLASH;
614     next(); // Skip the initial backslash
615     UnicodeString str;
616     if (inBounds()) {
617         // Expect a '{', '|' or '}'
618         switch (peek()) {
619         case LEFT_CURLY_BRACE:
620         case RIGHT_CURLY_BRACE:
621         case PIPE:
622         case BACKSLASH: {
623             /* Append to the output string */
624             str += peek();
625             /* Update normalizedInput */
626             normalizedInput += peek();
627             /* Consume the character */
628             next();
629             return str;
630         }
631         default: {
632             // No other characters are allowed here
633             break;
634         }
635         }
636     }
637    // If control reaches here, there was an error
638    ERROR(errorCode);
639    return str;
640 }
641 
642 
643 /*
644   Consume and return a quoted literal, matching the `literal` nonterminal in the grammar.
645 */
parseQuotedLiteral(UErrorCode & errorCode)646 Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) {
647     bool error = false;
648 
649     UnicodeString contents;
650     if (U_SUCCESS(errorCode)) {
651         // Parse the opening '|'
652         parseToken(PIPE, errorCode);
653         if (!inBounds()) {
654             ERROR(errorCode);
655             error = true;
656         } else {
657             // Parse the contents
658             bool done = false;
659             while (!done) {
660                 if (peek() == BACKSLASH) {
661                     contents += parseEscapeSequence(errorCode);
662                 } else if (isQuotedChar(peek())) {
663                     contents += peek();
664                     // Handle cases like:
665                     // |}{| -- we want to escape everywhere that
666                     // can be escaped, to make round-trip checking
667                     // easier -- so this case normalizes to
668                     // |\}\{|
669                     if (isEscapableChar(peek())) {
670                         normalizedInput += BACKSLASH;
671                     }
672                     normalizedInput += peek();
673                     next(); // Consume this character
674                     maybeAdvanceLine();
675                 } else {
676                     // Assume the sequence of literal characters ends here
677                     done = true;
678                 }
679                 if (!inBounds()) {
680                     ERROR(errorCode);
681                     error = true;
682                     break;
683                 }
684             }
685         }
686     }
687 
688     if (error) {
689         return {};
690     }
691 
692     // Parse the closing '|'
693     parseToken(PIPE, errorCode);
694 
695     return Literal(true, contents);
696 }
697 
698 // Parse (1*DIGIT)
parseDigits(UErrorCode & errorCode)699 UnicodeString Parser::parseDigits(UErrorCode& errorCode) {
700     if (U_FAILURE(errorCode)) {
701         return {};
702     }
703 
704     U_ASSERT(isDigit(peek()));
705 
706     UnicodeString contents;
707     do {
708         contents += peek();
709         normalizedInput += peek();
710         next();
711         if (!inBounds()) {
712             ERROR(errorCode);
713             return {};
714         }
715     } while (isDigit(peek()));
716 
717     return contents;
718 }
719 /*
720   Consume and return an unquoted literal, matching the `unquoted` nonterminal in the grammar.
721 */
parseUnquotedLiteral(UErrorCode & errorCode)722 Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) {
723     if (U_FAILURE(errorCode)) {
724         return {};
725     }
726 
727     // unquoted -> name
728     if (isNameStart(peek())) {
729         return Literal(false, parseName(errorCode));
730     }
731 
732     // unquoted -> number
733     // Parse the contents
734     UnicodeString contents;
735 
736     // Parse the sign
737     if (peek() == HYPHEN) {
738         contents += peek();
739         normalizedInput += peek();
740         next();
741     }
742     if (!inBounds()) {
743         ERROR(errorCode);
744         return {};
745     }
746 
747     // Parse the integer part
748     if (peek() == ((UChar32)0x0030) /* 0 */) {
749         contents += peek();
750         normalizedInput += peek();
751         next();
752     } else if (isDigit(peek())) {
753         contents += parseDigits(errorCode);
754     } else {
755         // Error -- nothing else can start a number literal
756         ERROR(errorCode);
757         return {};
758     }
759 
760     // Parse the decimal point if present
761     if (peek() == PERIOD) {
762         contents += peek();
763         normalizedInput += peek();
764         next();
765         if (!inBounds()) {
766             ERROR(errorCode);
767             return {};
768         }
769         // Parse the fraction part
770         if (isDigit(peek())) {
771             contents += parseDigits(errorCode);
772         } else {
773             // '.' not followed by digit is a parse error
774             ERROR(errorCode);
775             return {};
776         }
777     }
778 
779     if (!inBounds()) {
780         ERROR(errorCode);
781         return {};
782     }
783 
784     // Parse the exponent part if present
785     if (peek() == UPPERCASE_E || peek() == LOWERCASE_E) {
786         contents += peek();
787         normalizedInput += peek();
788         next();
789         if (!inBounds()) {
790             ERROR(errorCode);
791             return {};
792         }
793         // Parse sign if present
794         if (peek() == PLUS || peek() == HYPHEN) {
795             contents += peek();
796             normalizedInput += peek();
797             next();
798             if (!inBounds()) {
799                 ERROR(errorCode);
800                 return {};
801             }
802         }
803         // Parse exponent digits
804         if (!isDigit(peek())) {
805             ERROR(errorCode);
806             return {};
807         }
808         contents += parseDigits(errorCode);
809     }
810 
811     return Literal(false, contents);
812 }
813 
814 /*
815   Consume and return a literal, matching the `literal` nonterminal in the grammar.
816 */
parseLiteral(UErrorCode & errorCode)817 Literal Parser::parseLiteral(UErrorCode& errorCode) {
818     Literal result;
819     if (!inBounds()) {
820         ERROR(errorCode);
821     } else {
822         if (peek() == PIPE) {
823             result = parseQuotedLiteral(errorCode);
824         } else {
825             result = parseUnquotedLiteral(errorCode);
826         }
827         // Guarantee postcondition
828         if (!inBounds()) {
829             ERROR(errorCode);
830         }
831     }
832 
833     return result;
834 }
835 
836 /*
837   Consume a @name-value pair, matching the `attribute` nonterminal in the grammar.
838 
839   Adds the option to `options`
840 */
841 template<class T>
parseAttribute(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)842 void Parser::parseAttribute(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
843     U_ASSERT(inBounds());
844 
845     U_ASSERT(peek() == AT);
846     // Consume the '@'
847     parseToken(AT, errorCode);
848 
849     // Parse LHS
850     UnicodeString lhs = parseIdentifier(errorCode);
851 
852     // Prepare to "backtrack" to resolve ambiguity
853     // about whether whitespace precedes another
854     // attribute, or the '=' sign
855     int32_t savedIndex = index;
856     parseOptionalWhitespace(errorCode);
857 
858     Operand rand;
859     if (peek() == EQUALS) {
860         // Parse '='
861         parseTokenWithWhitespace(EQUALS, errorCode);
862 
863         UnicodeString rhsStr;
864         // Parse RHS, which is either a literal or variable
865         switch (peek()) {
866         case DOLLAR: {
867             rand = Operand(parseVariableName(errorCode));
868             break;
869         }
870         default: {
871             // Must be a literal
872             rand = Operand(parseLiteral(errorCode));
873             break;
874         }
875         }
876         U_ASSERT(!rand.isNull());
877     } else {
878         // attribute -> "@" identifier [[s] "=" [s]]
879         // Use null operand, which `rand` is already set to
880         // "Backtrack" by restoring the whitespace (if there was any)
881         index = savedIndex;
882     }
883 
884     attrAdder.addAttribute(lhs, std::move(rand), errorCode);
885 }
886 
887 /*
888   Consume a name-value pair, matching the `option` nonterminal in the grammar.
889 
890   Adds the option to `optionList`
891 */
892 template<class T>
parseOption(OptionAdder<T> & addOption,UErrorCode & errorCode)893 void Parser::parseOption(OptionAdder<T>& addOption, UErrorCode& errorCode) {
894     U_ASSERT(inBounds());
895 
896     // Parse LHS
897     UnicodeString lhs = parseIdentifier(errorCode);
898 
899     // Parse '='
900     parseTokenWithWhitespace(EQUALS, errorCode);
901 
902     UnicodeString rhsStr;
903     Operand rand;
904     // Parse RHS, which is either a literal or variable
905     switch (peek()) {
906     case DOLLAR: {
907         rand = Operand(parseVariableName(errorCode));
908         break;
909     }
910     default: {
911         // Must be a literal
912         rand = Operand(parseLiteral(errorCode));
913         break;
914     }
915     }
916     U_ASSERT(!rand.isNull());
917 
918     // Finally, add the key=value mapping
919     // Use a local error code, check for duplicate option error and
920     // record it as with other errors
921     UErrorCode status = U_ZERO_ERROR;
922     addOption.addOption(lhs, std::move(rand), status);
923     if (U_FAILURE(status)) {
924       U_ASSERT(status == U_MF_DUPLICATE_OPTION_NAME_ERROR);
925       errors.setDuplicateOptionName(errorCode);
926     }
927 }
928 
929 /*
930   Note: there are multiple overloads of parseOptions() for parsing
931   options within markup, vs. within an expression, vs. parsing
932   attributes. This should be refactored. TODO
933  */
934 
935 /*
936   Consume optional whitespace followed by a sequence of options
937   (possibly empty), separated by whitespace
938 */
939 template <class T>
parseOptions(OptionAdder<T> & addOption,UErrorCode & errorCode)940 void Parser::parseOptions(OptionAdder<T>& addOption, UErrorCode& errorCode) {
941     // Early exit if out of bounds -- no more work is possible
942     CHECK_BOUNDS(errorCode);
943 
944 /*
945 Arbitrary lookahead is required to parse option lists. To see why, consider
946 these rules from the grammar:
947 
948 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
949 annotation = (function *(s option)) / reserved
950 
951 And this example:
952 {:foo  }
953 
954 Derivation:
955 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
956            -> "{" [s] annotation [s] "}"
957            -> "{" [s] ((function *(s option)) / reserved) [s] "}"
958            -> "{" [s] function *(s option) [s] "}"
959 
960 In this example, knowing whether to expect a '}' or the start of another option
961 after the whitespace would require arbitrary lookahead -- in other words, which
962 rule should we apply?
963     *(s option) -> s option *(s option)
964   or
965     *(s option) ->
966 
967 The same would apply to the example {:foo k=v } (note the trailing space after "v").
968 
969 This is addressed using a form of backtracking and (to make the backtracking easier
970 to apply) a slight refactoring to the grammar.
971 
972 This code is written as if the grammar is:
973   expression = "{" [s] (((literal / variable) ([s] / [s annotation])) / annotation) "}"
974   annotation = (function *(s option) [s]) / (reserved [s])
975 
976 Parsing the `*(s option) [s]` sequence can be done within `parseOptions()`, meaning
977 that `parseExpression()` can safely require a '}' after `parseOptions()` finishes.
978 
979 Note that when "backtracking" really just means early exit, since only whitespace
980 is involved and there's no state to save.
981 
982 There is a separate but similar ambiguity as to whether the space precedes
983 an option or an attribute.
984 */
985 
986     while(true) {
987         // If the next character is not whitespace, that means we've already
988         // parsed the entire options list (which may have been empty) and there's
989         // no trailing whitespace. In that case, exit.
990         if (!isWhitespace(peek())) {
991             break;
992         }
993         int32_t firstWhitespace = index;
994 
995         // In any case other than an empty options list, there must be at least
996         // one whitespace character.
997         parseRequiredWhitespace(errorCode);
998         // Restore precondition
999         CHECK_BOUNDS(errorCode);
1000 
1001         // If a name character follows, then at least one more option remains
1002         // in the list.
1003         // Otherwise, we've consumed all the options and any trailing whitespace,
1004         // and can exit.
1005         // Note that exiting is sort of like backtracking: "(s option)" doesn't apply,
1006         // so we back out to [s].
1007         if (!isNameStart(peek())) {
1008             // We've consumed all the options (meaning that either we consumed non-empty
1009             // whitespace, or consumed at least one option.)
1010             // Done.
1011             // Remove the required whitespace from normalizedInput
1012             normalizedInput.truncate(normalizedInput.length() - 1);
1013             // "Backtrack" so as to leave the optional whitespace there
1014             // when parsing attributes
1015             index = firstWhitespace;
1016             break;
1017         }
1018         parseOption(addOption, errorCode);
1019     }
1020 }
1021 
1022 /*
1023   Consume optional whitespace followed by a sequence of attributes
1024   (possibly empty), separated by whitespace
1025 */
1026 template<class T>
parseAttributes(AttributeAdder<T> & attrAdder,UErrorCode & errorCode)1027 void Parser::parseAttributes(AttributeAdder<T>& attrAdder, UErrorCode& errorCode) {
1028 
1029     // Early exit if out of bounds -- no more work is possible
1030     if (!inBounds()) {
1031         ERROR(errorCode);
1032         return;
1033     }
1034 
1035 /*
1036 Arbitrary lookahead is required to parse attribute lists, similarly to option lists.
1037 (See comment in parseOptions()).
1038 */
1039 
1040     while(true) {
1041         // If the next character is not whitespace, that means we've already
1042         // parsed the entire attributes list (which may have been empty) and there's
1043         // no trailing whitespace. In that case, exit.
1044         if (!isWhitespace(peek())) {
1045             break;
1046         }
1047 
1048         // In any case other than an empty attributes list, there must be at least
1049         // one whitespace character.
1050         parseRequiredWhitespace(errorCode);
1051         // Restore precondition
1052         if (!inBounds()) {
1053             ERROR(errorCode);
1054             break;
1055         }
1056 
1057         // If an '@' follows, then at least one more attribute remains
1058         // in the list.
1059         // Otherwise, we've consumed all the attributes and any trailing whitespace,
1060         // and can exit.
1061         // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply,
1062         // so we back out to [s].
1063         if (peek() != AT) {
1064             // We've consumed all the attributes (meaning that either we consumed non-empty
1065             // whitespace, or consumed at least one attribute.)
1066             // Done.
1067             // Remove the whitespace from normalizedInput
1068             normalizedInput.truncate(normalizedInput.length() - 1);
1069             break;
1070         }
1071         parseAttribute(attrAdder, errorCode);
1072     }
1073 }
1074 
1075 /*
1076   Consume a function call, matching the `annotation`
1077   nonterminal in the grammar
1078 
1079   Returns an `Operator` representing this (a reserved is a parse error)
1080 */
parseAnnotation(UErrorCode & status)1081 Operator Parser::parseAnnotation(UErrorCode& status) {
1082     U_ASSERT(inBounds());
1083     Operator::Builder ratorBuilder(status);
1084     if (U_FAILURE(status)) {
1085         return {};
1086     }
1087     if (isFunctionStart(peek())) {
1088         // Consume the function name
1089         FunctionName func = parseFunction(status);
1090         ratorBuilder.setFunctionName(std::move(func));
1091 
1092         OptionAdder<Operator::Builder> addOptions(ratorBuilder);
1093         // Consume the options (which may be empty)
1094         parseOptions(addOptions, status);
1095     } else {
1096         ERROR(status);
1097     }
1098     return ratorBuilder.build(status);
1099 }
1100 
1101 /*
1102   Consume a literal or variable (depending on `isVariable`),
1103   followed by either required whitespace followed by an annotation,
1104   or optional whitespace.
1105 */
parseLiteralOrVariableWithAnnotation(bool isVariable,Expression::Builder & builder,UErrorCode & status)1106 void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable,
1107                                                   Expression::Builder& builder,
1108                                                   UErrorCode& status) {
1109     CHECK_ERROR(status);
1110 
1111     U_ASSERT(inBounds());
1112 
1113     Operand rand;
1114     if (isVariable) {
1115         rand = Operand(parseVariableName(status));
1116     } else {
1117         rand = Operand(parseLiteral(status));
1118     }
1119 
1120     builder.setOperand(std::move(rand));
1121 
1122 /*
1123 Parsing a literal or variable with an optional annotation requires arbitrary lookahead.
1124 To see why, consider this rule from the grammar:
1125 
1126 expression = "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1127 
1128 And this example:
1129 
1130 {|foo|   }
1131 
1132 Derivation:
1133 expression -> "{" [s] (((literal / variable) [s annotation]) / annotation) [s] "}"
1134            -> "{" [s] ((literal / variable) [s annotation]) [s] "}"
1135            -> "{" [s] (literal [s annotation]) [s] "}"
1136 
1137 When reading the ' ' after the second '|', it's ambiguous whether that's the required
1138 space before an annotation, or the optional space before the '}'.
1139 
1140 To make this ambiguity easier to handle, this code is based on the same grammar
1141 refactoring for the `expression` nonterminal that `parseOptions()` relies on. See
1142 the comment in `parseOptions()` for details.
1143 */
1144 
1145     if (isWhitespace(peek())) {
1146       int32_t firstWhitespace = index;
1147 
1148       // If the next character is whitespace, either [s annotation] or [s] applies
1149       // (the character is either the required space before an annotation, or optional
1150       // trailing space after the literal or variable). It's still ambiguous which
1151       // one does apply.
1152       parseOptionalWhitespace(status);
1153       // Restore precondition
1154       CHECK_BOUNDS(status);
1155 
1156       // This next check resolves the ambiguity between [s annotation] and [s]
1157       bool isSAnnotation = isAnnotationStart(peek());
1158 
1159       if (isSAnnotation) {
1160         normalizedInput += SPACE;
1161       }
1162 
1163       if (isSAnnotation) {
1164         // The previously consumed whitespace precedes an annotation
1165         builder.setOperator(parseAnnotation(status));
1166       } else {
1167           // Either there's a right curly brace (will be consumed by the caller),
1168           // or there's an error and the trailing whitespace should be
1169           // handled by the caller. However, this is not an error
1170           // here because we're just parsing `literal [s annotation]`.
1171           index = firstWhitespace;
1172       }
1173     } else {
1174       // Either there was never whitespace, or
1175       // the previously consumed whitespace is the optional trailing whitespace;
1176       // either the next character is '}' or the error will be handled by parseExpression.
1177       // Do nothing, since the operand was already set
1178     }
1179 
1180     // At the end of this code, the next character should either be '}',
1181     // whitespace followed by a '}',
1182     // or end-of-input
1183 }
1184 
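/*
  Expression parsing. The two `exprFallback()` helpers below build the
  U+FFFD fallback operand; `parseExpression()`, which follows them,
  consumes an expression, matching the `expression` nonterminal in the grammar
*/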
1188 
exprFallback(Expression::Builder & exprBuilder)1189 static void exprFallback(Expression::Builder& exprBuilder) {
1190     // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
1191     // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1192     exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1193 }
1194 
exprFallback(UErrorCode & status)1195 static Expression exprFallback(UErrorCode& status) {
1196     Expression result;
1197     if (U_SUCCESS(status)) {
1198         Expression::Builder exprBuilder(status);
1199         if (U_SUCCESS(status)) {
1200             // Construct a literal consisting just of  The U+FFFD REPLACEMENT CHARACTER
1201             // per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution
1202             exprBuilder.setOperand(Operand(Literal(false, UnicodeString(REPLACEMENT))));
1203             UErrorCode status = U_ZERO_ERROR;
1204             result = exprBuilder.build(status);
1205             // An operand was set, so there can't be an error
1206             U_ASSERT(U_SUCCESS(status));
1207         }
1208     }
1209     return result;
1210 }
1211 
parseExpression(UErrorCode & status)1212 Expression Parser::parseExpression(UErrorCode& status) {
1213     if (U_FAILURE(status)) {
1214         return {};
1215     }
1216 
1217     // Early return if out of input -- no more work is possible
1218     U_ASSERT(inBounds());
1219 
1220     // Parse opening brace
1221     parseToken(LEFT_CURLY_BRACE, status);
1222     // Optional whitespace after opening brace
1223     parseOptionalWhitespace(status);
1224 
1225     Expression::Builder exprBuilder(status);
1226     // Restore precondition
1227     if (!inBounds()) {
1228         exprFallback(exprBuilder);
1229     } else {
1230         // literal '|', variable '$' or annotation
1231         switch (peek()) {
1232         case PIPE: {
1233             // Quoted literal
1234             parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1235             break;
1236         }
1237         case DOLLAR: {
1238             // Variable
1239             parseLiteralOrVariableWithAnnotation(true, exprBuilder, status);
1240             break;
1241         }
1242         default: {
1243             if (isAnnotationStart(peek())) {
1244                 Operator rator = parseAnnotation(status);
1245                 exprBuilder.setOperator(std::move(rator));
1246             } else if (isUnquotedStart(peek())) {
1247                 // Unquoted literal
1248                 parseLiteralOrVariableWithAnnotation(false, exprBuilder, status);
1249             } else {
1250                 // Not a literal, variable or annotation -- error out
1251                 ERROR(status);
1252                 exprFallback(exprBuilder);
1253                 break;
1254             }
1255             break;
1256         }
1257         }
1258     }
1259 
1260     // Parse attributes
1261     AttributeAdder<Expression::Builder> attrAdder(exprBuilder);
1262     parseAttributes(attrAdder, status);
1263 
1264     // Parse optional space
1265     // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}")
1266     parseOptionalWhitespace(status);
1267 
1268     // Either an operand or operator (or both) must have been set already,
1269     // so there can't be an error
1270     UErrorCode localStatus = U_ZERO_ERROR;
1271     Expression result = exprBuilder.build(localStatus);
1272     U_ASSERT(U_SUCCESS(localStatus));
1273 
1274     // Check for end-of-input and missing '}'
1275     if (!inBounds()) {
1276         ERROR(status);
1277     } else {
1278         // Otherwise, it's safe to check for the '}'
1279         parseToken(RIGHT_CURLY_BRACE, status);
1280     }
1281     return result;
1282 }
1283 
1284 /*
1285   Parse a .local declaration, matching the `local-declaration`
1286   production in the grammar
1287 */
parseLocalDeclaration(UErrorCode & status)1288 void Parser::parseLocalDeclaration(UErrorCode& status) {
1289     // End-of-input here would be an error; even empty
1290     // declarations must be followed by a body
1291     CHECK_BOUNDS(status);
1292 
1293     parseToken(ID_LOCAL, status);
1294     parseRequiredWhitespace(status);
1295 
1296     // Restore precondition
1297     CHECK_BOUNDS(status);
1298     VariableName lhs = parseVariableName(status);
1299     parseTokenWithWhitespace(EQUALS, status);
1300     // Restore precondition before calling parseExpression()
1301     CHECK_BOUNDS(status);
1302 
1303     Expression rhs = parseExpression(status);
1304 
1305     // Add binding from lhs to rhs, unless there was an error
1306     // (This ensures that if there was a correct lhs but a
1307     // parse error in rhs, the fallback for uses of the
1308     // lhs will be its own name rather than the rhs)
1309     /* This affects the behavior of this test case, which the spec
1310        is ambiguous about:
1311 
1312        .local $bar {|foo|} {{{$bar}}}
1313 
1314        Should `$bar` still be bound to a value although
1315        its declaration is syntactically incorrect (missing the '=')?
1316        This code says no, but it needs to change if
1317        https://github.com/unicode-org/message-format-wg/issues/703
1318        is resolved differently.
1319     */
1320     CHECK_ERROR(status);
1321     if (!errors.hasSyntaxError()) {
1322         dataModel.addBinding(Binding(std::move(lhs), std::move(rhs)), status);
1323         // Check if status is U_DUPLICATE_DECLARATION_ERROR
1324         // and add that as an internal error if so
1325         if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1326             status = U_ZERO_ERROR;
1327             errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1328         }
1329     }
1330 }
1331 
1332 /*
1333   Parse an .input declaration, matching the `local-declaration`
1334   production in the grammar
1335 */
parseInputDeclaration(UErrorCode & status)1336 void Parser::parseInputDeclaration(UErrorCode& status) {
1337     // End-of-input here would be an error; even empty
1338     // declarations must be followed by a body
1339     CHECK_BOUNDS(status);
1340 
1341     parseToken(ID_INPUT, status);
1342     parseOptionalWhitespace(status);
1343 
1344     // Restore precondition before calling parseExpression()
1345     CHECK_BOUNDS(status);
1346 
1347     // Save the index for error diagnostics
1348     int32_t exprIndex = index;
1349     Expression rhs = parseExpression(status);
1350 
1351     // Here we have to check that the rhs is a variable-expression
1352     if (!rhs.getOperand().isVariable()) {
1353         // This case is a syntax error; report it at the beginning
1354         // of the expression
1355         ERROR_AT(status, exprIndex);
1356         return;
1357     }
1358 
1359     VariableName lhs = rhs.getOperand().asVariable();
1360 
1361     // Add binding from lhs to rhs
1362     // This just adds a new local variable that shadows the message
1363     // argument referred to, which is harmless.
1364     // When evaluating the RHS, the new local is not in scope
1365     // and the message argument will be correctly referred to.
1366     CHECK_ERROR(status);
1367     if (!errors.hasSyntaxError()) {
1368         dataModel.addBinding(Binding::input(std::move(lhs), std::move(rhs), status), status);
1369         // Check if status is U_MF_DUPLICATE_DECLARATION_ERROR
1370         // and add that as an internal error if so
1371         if (status == U_MF_DUPLICATE_DECLARATION_ERROR) {
1372             status = U_ZERO_ERROR;
1373             errors.addError(StaticErrorType::DuplicateDeclarationError, status);
1374         }
1375     }
1376 }
1377 
1378 /*
1379   Consume a possibly-empty sequence of declarations separated by whitespace;
1380   each declaration matches the `declaration` nonterminal in the grammar
1381 
1382   Builds up an environment representing those declarations
1383 */
parseDeclarations(UErrorCode & status)1384 void Parser::parseDeclarations(UErrorCode& status) {
1385     // End-of-input here would be an error; even empty
1386     // declarations must be followed by a body
1387     CHECK_BOUNDS(status);
1388 
1389     while (peek() == PERIOD) {
1390         CHECK_BOUNDS_1(status);
1391         if (peek(1) == ID_LOCAL[1]) {
1392             parseLocalDeclaration(status);
1393         } else if (peek(1) == ID_INPUT[1]) {
1394             parseInputDeclaration(status);
1395         } else {
1396             // Done parsing declarations
1397             break;
1398         }
1399 
1400         // Avoid looping infinitely
1401         CHECK_ERROR(status);
1402 
1403         parseOptionalWhitespace(status);
1404         // Restore precondition
1405         CHECK_BOUNDS(status);
1406     }
1407 }
1408 
1409 /*
1410   Consume a text character
1411   matching the `text-char` nonterminal in the grammar
1412 
1413   No postcondition (a message can end with a text-char)
1414 */
parseTextChar(UErrorCode & status)1415 UnicodeString Parser::parseTextChar(UErrorCode& status) {
1416     UnicodeString str;
1417     if (!inBounds() || !(isTextChar(peek()))) {
1418         // Error -- text-char is expected here
1419         ERROR(status);
1420     } else {
1421         // See comment in parseQuotedLiteral()
1422         if (isEscapableChar(peek())) {
1423             normalizedInput += BACKSLASH;
1424         }
1425         normalizedInput += peek();
1426         str += peek();
1427         next();
1428         maybeAdvanceLine();
1429     }
1430     return str;
1431 }
1432 
1433 /*
1434   Consume an `nmtoken`, `literal`, or the string "*", matching
1435   the `key` nonterminal in the grammar
1436 */
parseKey(UErrorCode & status)1437 Key Parser::parseKey(UErrorCode& status) {
1438     U_ASSERT(inBounds());
1439 
1440     Key k; // wildcard by default
1441     // Literal | '*'
1442     switch (peek()) {
1443     case ASTERISK: {
1444         next();
1445         normalizedInput += ASTERISK;
1446         // Guarantee postcondition
1447         if (!inBounds()) {
1448             ERROR(status);
1449             return k;
1450         }
1451         break;
1452     }
1453     default: {
1454         // Literal
1455         k = Key(parseLiteral(status));
1456         break;
1457     }
1458     }
1459     return k;
1460 }
1461 
1462 /*
1463   Consume a non-empty sequence of `key`s separated by whitespace
1464 
1465   Takes ownership of `keys`
1466 */
parseNonEmptyKeys(UErrorCode & status)1467 SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) {
1468     SelectorKeys result;
1469 
1470     if (U_FAILURE(status)) {
1471         return result;
1472     }
1473 
1474     U_ASSERT(inBounds());
1475 
1476 /*
1477 Arbitrary lookahead is required to parse key lists. To see why, consider
1478 this rule from the grammar:
1479 
1480 variant = key *(s key) [s] quoted-pattern
1481 
1482 And this example:
1483 when k1 k2   {a}
1484 
1485 Derivation:
1486    variant -> key *(s key) [s] quoted-pattern
1487            -> key s key *(s key) quoted-pattern
1488 
1489 After matching ' ' to `s` and 'k2' to `key`, it would require arbitrary lookahead
1490 to know whether to expect the start of a pattern or the start of another key.
1491 In other words: is the second whitespace sequence the required space in *(s key),
1492 or the optional space in [s] quoted-pattern?
1493 
1494 This is addressed using "backtracking" (similarly to `parseOptions()`).
1495 */
1496 
1497     SelectorKeys::Builder keysBuilder(status);
1498     if (U_FAILURE(status)) {
1499         return result;
1500     }
1501 
1502     // Since the first key is required, it's simplest to parse it separately.
1503     keysBuilder.add(parseKey(status), status);
1504 
1505     // Restore precondition
1506     if (!inBounds()) {
1507         ERROR(status);
1508         return result;
1509     }
1510 
1511     // We've seen at least one whitespace-key pair, so now we can parse
1512     // *(s key) [s]
1513     while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors
1514         bool wasWhitespace = isWhitespace(peek());
1515         parseRequiredWhitespace(status);
1516         if (!wasWhitespace) {
1517             // Avoid infinite loop when parsing something like:
1518             // when * @{!...
1519             next();
1520         }
1521 
1522         // Restore precondition
1523         if (!inBounds()) {
1524             ERROR(status);
1525             return result;
1526         }
1527 
1528         // At this point, it's ambiguous whether we are inside (s key) or [s].
1529         // This check resolves that ambiguity.
1530         if (peek() == LEFT_CURLY_BRACE) {
1531             // A pattern follows, so what we just parsed was the optional
1532             // trailing whitespace. All the keys have been parsed.
1533 
1534             // Unpush the whitespace from `normalizedInput`
1535             normalizedInput.truncate(normalizedInput.length() - 1);
1536             break;
1537         }
1538         keysBuilder.add(parseKey(status), status);
1539     }
1540 
1541     return keysBuilder.build(status);
1542 }
1543 
parseQuotedPattern(UErrorCode & status)1544 Pattern Parser::parseQuotedPattern(UErrorCode& status) {
1545     U_ASSERT(inBounds());
1546 
1547     parseToken(LEFT_CURLY_BRACE, status);
1548     parseToken(LEFT_CURLY_BRACE, status);
1549     Pattern p = parseSimpleMessage(status);
1550     parseToken(RIGHT_CURLY_BRACE, status);
1551     parseToken(RIGHT_CURLY_BRACE, status);
1552     return p;
1553 }
1554 
1555 /*
1556   Consume a `placeholder`, matching the nonterminal in the grammar
1557   No postcondition (a markup can end a message)
1558 */
parseMarkup(UErrorCode & status)1559 Markup Parser::parseMarkup(UErrorCode& status) {
1560     U_ASSERT(inBounds(1));
1561 
1562     U_ASSERT(peek() == LEFT_CURLY_BRACE);
1563 
1564     Markup::Builder builder(status);
1565     if (U_FAILURE(status)) {
1566         return {};
1567     }
1568 
1569     // Consume the '{'
1570     next();
1571     normalizedInput += LEFT_CURLY_BRACE;
1572     parseOptionalWhitespace(status);
1573     bool closing = false;
1574     switch (peek()) {
1575     case NUMBER_SIGN: {
1576         // Open or standalone; consume the '#'
1577         normalizedInput += peek();
1578         next();
1579         break;
1580     }
1581     case SLASH: {
1582         // Closing
1583         normalizedInput += peek();
1584         closing = true;
1585         next();
1586         break;
1587     }
1588     default: {
1589         ERROR(status);
1590         return {};
1591     }
1592     }
1593 
1594     // Parse the markup identifier
1595     builder.setName(parseIdentifier(status));
1596 
1597     // Parse the options, which must begin with a ' '
1598     // if present
1599     if (inBounds() && isWhitespace(peek())) {
1600         OptionAdder<Markup::Builder> optionAdder(builder);
1601         parseOptions(optionAdder, status);
1602     }
1603 
1604     // Parse the attributes, which also must begin
1605     // with a ' '
1606     if (inBounds() && isWhitespace(peek())) {
1607         AttributeAdder<Markup::Builder> attrAdder(builder);
1608         parseAttributes(attrAdder, status);
1609     }
1610 
1611     parseOptionalWhitespace(status);
1612 
1613     bool standalone = false;
1614     // Check if this is a standalone or not
1615     if (!closing) {
1616         if (inBounds() && peek() == SLASH) {
1617             standalone = true;
1618             normalizedInput += SLASH;
1619             next();
1620         }
1621     }
1622 
1623     parseToken(RIGHT_CURLY_BRACE, status);
1624 
1625     if (standalone) {
1626         builder.setStandalone();
1627     } else if (closing) {
1628         builder.setClose();
1629     } else {
1630         builder.setOpen();
1631     }
1632 
1633     return builder.build(status);
1634 }
1635 
1636 /*
1637   Consume a `placeholder`, matching the nonterminal in the grammar
1638   No postcondition (a placeholder can end a message)
1639 */
parsePlaceholder(UErrorCode & status)1640 std::variant<Expression, Markup> Parser::parsePlaceholder(UErrorCode& status) {
1641     U_ASSERT(peek() == LEFT_CURLY_BRACE);
1642 
1643     if (!inBounds()) {
1644         ERROR(status);
1645         return exprFallback(status);
1646     }
1647 
1648     // Need to look ahead arbitrarily since whitespace
1649     // can appear before the '{' and '#'
1650     // in markup
1651     int32_t tempIndex = 1;
1652     bool isMarkup = false;
1653     while (inBounds(1)) {
1654         UChar32 c = peek(tempIndex);
1655         if (c == NUMBER_SIGN || c == SLASH) {
1656             isMarkup = true;
1657             break;
1658         }
1659         if (!isWhitespace(c)){
1660             break;
1661         }
1662         tempIndex++;
1663     }
1664 
1665     if (isMarkup) {
1666         return parseMarkup(status);
1667     }
1668     return parseExpression(status);
1669 }
1670 
1671 /*
1672   Consume a `simple-message`, matching the nonterminal in the grammar
1673   Postcondition: `index == len()` or U_FAILURE(status);
1674   for a syntactically correct message, this will consume the entire input
1675 */
parseSimpleMessage(UErrorCode & status)1676 Pattern Parser::parseSimpleMessage(UErrorCode& status) {
1677     Pattern::Builder result(status);
1678 
1679     if (U_SUCCESS(status)) {
1680         Expression expression;
1681         while (inBounds()) {
1682             switch (peek()) {
1683             case LEFT_CURLY_BRACE: {
1684                 // Must be placeholder
1685                 std::variant<Expression, Markup> piece = parsePlaceholder(status);
1686                 if (std::holds_alternative<Expression>(piece)) {
1687                     Expression expr = *std::get_if<Expression>(&piece);
1688                     result.add(std::move(expr), status);
1689                 } else {
1690                     Markup markup = *std::get_if<Markup>(&piece);
1691                     result.add(std::move(markup), status);
1692                 }
1693                 break;
1694             }
1695             case BACKSLASH: {
1696                 // Must be escaped-char
1697                 result.add(parseEscapeSequence(status), status);
1698                 break;
1699             }
1700             case RIGHT_CURLY_BRACE: {
1701                 // Distinguish unescaped '}' from end of quoted pattern
1702                 break;
1703             }
1704             default: {
1705                 // Must be text-char
1706                 result.add(parseTextChar(status), status);
1707                 break;
1708             }
1709             }
1710             if (peek() == RIGHT_CURLY_BRACE) {
1711                 // End of quoted pattern
1712                 break;
1713             }
1714             // Don't loop infinitely
1715             if (errors.hasSyntaxError()) {
1716                 break;
1717             }
1718         }
1719     }
1720     return result.build(status);
1721 }
1722 
1723 
1724 /*
1725   Consume a `selectors` (matching the nonterminal in the grammar),
1726   followed by a non-empty sequence of `variant`s (matching the nonterminal
1727   in the grammar) preceded by whitespace
1728   No postcondition (on return, `index` might equal `len()` with no syntax error
1729   because a message can end with a variant)
1730 */
parseSelectors(UErrorCode & status)1731 void Parser::parseSelectors(UErrorCode& status) {
1732     CHECK_ERROR(status);
1733 
1734     U_ASSERT(inBounds());
1735 
1736     parseToken(ID_MATCH, status);
1737 
1738     bool empty = true;
1739     // Parse selectors
1740     // "Backtracking" is required here. It's not clear if whitespace is
1741     // (`[s]` selector) or (`[s]` variant)
1742     while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) {
1743         parseOptionalWhitespace(status);
1744         // Restore precondition
1745         CHECK_BOUNDS(status);
1746         if (peek() != LEFT_CURLY_BRACE) {
1747             // This is not necessarily an error, but rather,
1748             // means the whitespace we parsed was the optional
1749             // whitespace preceding the first variant, not the
1750             // optional whitespace preceding a subsequent expression.
1751             break;
1752         }
1753         Expression expression;
1754         expression = parseExpression(status);
1755         empty = false;
1756 
1757         dataModel.addSelector(std::move(expression), status);
1758         CHECK_ERROR(status);
1759     }
1760 
1761     // At least one selector is required
1762     if (empty) {
1763         ERROR(status);
1764         return;
1765     }
1766 
1767     #define CHECK_END_OF_INPUT                     \
1768         if (!inBounds()) {                         \
1769             break;                                 \
1770         }                                          \
1771 
1772     // Parse variants
1773     while (isWhitespace(peek()) || isKeyStart(peek())) {
1774         // Trailing whitespace is allowed
1775         parseOptionalWhitespace(status);
1776         if (!inBounds()) {
1777             return;
1778         }
1779 
1780         // At least one key is required
1781         SelectorKeys keyList(parseNonEmptyKeys(status));
1782 
1783         CHECK_ERROR(status);
1784 
1785         // parseNonEmptyKeys() consumes any trailing whitespace,
1786         // so the pattern can be consumed next.
1787 
1788         // Restore precondition before calling parsePattern()
1789         // (which must return a non-null value)
1790         CHECK_BOUNDS(status);
1791         Pattern rhs = parseQuotedPattern(status);
1792 
1793         dataModel.addVariant(std::move(keyList), std::move(rhs), status);
1794 
1795         // Restore the precondition, *without* erroring out if we've
1796         // reached the end of input. That's because it's valid for the
1797         // message to end with a variant that has no trailing whitespace.
1798         // Why do we need to check this condition twice inside the loop?
1799         // Because if we don't check it here, the `isWhitespace()` call in
1800         // the loop head will read off the end of the input string.
1801         CHECK_END_OF_INPUT
1802     }
1803 }
1804 
/*
  Documents `parseBody()`, which is defined further below:
  Consume a `body` (matching the nonterminal in the grammar).
  No postcondition (on return, `index` might equal `len()` with no syntax error,
  because a message can end with a body; trailing whitespace is optional)
*/
1810 
errorPattern(UErrorCode & status)1811 void Parser::errorPattern(UErrorCode& status) {
1812     errors.addSyntaxError(status);
1813     // Set to empty pattern
1814     Pattern::Builder result = Pattern::Builder(status);
1815     CHECK_ERROR(status);
1816 
1817     // If still in bounds, then add the remaining input as a single text part
1818     // to the pattern
1819     /*
1820       TODO: this behavior isn't documented in the spec, but it comes from
1821       https://github.com/messageformat/messageformat/blob/e0087bff312d759b67a9129eac135d318a1f0ce7/packages/mf2-messageformat/src/__fixtures/test-messages.json#L236
1822       and a pending pull request https://github.com/unicode-org/message-format-wg/pull/462 will clarify
1823       whether this is the intent behind the spec
1824      */
1825     UnicodeString partStr(LEFT_CURLY_BRACE);
1826     while (inBounds()) {
1827         partStr += peek();
1828         next();
1829     }
1830     // Add curly braces around the entire output (same comment as above)
1831     partStr += RIGHT_CURLY_BRACE;
1832     result.add(std::move(partStr), status);
1833     dataModel.setPattern(result.build(status));
1834 }
1835 
parseBody(UErrorCode & status)1836 void Parser::parseBody(UErrorCode& status) {
1837     CHECK_ERROR(status);
1838 
1839     // Out-of-input is a syntax warning
1840     if (!inBounds()) {
1841         errorPattern(status);
1842         return;
1843     }
1844 
1845     // Body must be either a pattern or selectors
1846     switch (peek()) {
1847     case LEFT_CURLY_BRACE: {
1848         // Pattern
1849         dataModel.setPattern(parseQuotedPattern(status));
1850         break;
1851     }
1852     case ID_MATCH[0]: {
1853         // Selectors
1854         parseSelectors(status);
1855         return;
1856     }
1857     default: {
1858         ERROR(status);
1859         errorPattern(status);
1860         return;
1861     }
1862     }
1863 }
1864 
1865 // -------------------------------------
1866 // Parses the source pattern.
1867 
parse(UParseError & parseErrorResult,UErrorCode & status)1868 void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) {
1869     CHECK_ERROR(status);
1870 
1871     bool complex = false;
1872     // First, "look ahead" to determine if this is a simple or complex
1873     // message. To do that, check the first non-whitespace character.
1874     while (inBounds(index) && isWhitespace(peek())) {
1875         next();
1876     }
1877 
1878     // Message can be empty, so we need to only look ahead
1879     // if we know it's non-empty
1880     if (inBounds()) {
1881         if (peek() == PERIOD
1882             || (inBounds(1)
1883                 && peek() == LEFT_CURLY_BRACE
1884                 && peek(1) == LEFT_CURLY_BRACE)) {
1885             complex = true;
1886         }
1887     }
1888     // Reset index
1889     index = 0;
1890 
1891     // Message can be empty, so we need to only look ahead
1892     // if we know it's non-empty
1893     if (complex) {
1894         parseOptionalWhitespace(status);
1895         parseDeclarations(status);
1896         parseBody(status);
1897         parseOptionalWhitespace(status);
1898     } else {
1899         // Simple message
1900         // For normalization, quote the pattern
1901         normalizedInput += LEFT_CURLY_BRACE;
1902         normalizedInput += LEFT_CURLY_BRACE;
1903         dataModel.setPattern(parseSimpleMessage(status));
1904         normalizedInput += RIGHT_CURLY_BRACE;
1905         normalizedInput += RIGHT_CURLY_BRACE;
1906     }
1907 
1908     CHECK_ERROR(status);
1909 
1910     // There are no errors; finally, check that the entire input was consumed
1911     if (!allConsumed()) {
1912         ERROR(status);
1913     }
1914 
1915     // Finally, copy the relevant fields of the internal `MessageParseError`
1916     // into the `UParseError` argument
1917     translateParseError(parseError, parseErrorResult);
1918 }
1919 
~Parser()1920 Parser::~Parser() {}
1921 
1922 } // namespace message2
1923 U_NAMESPACE_END
1924 
1925 #endif /* #if !UCONFIG_NO_MF2 */
1926 
1927 #endif /* #if !UCONFIG_NO_FORMATTING */
1928 
1929