1 /**
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "regexp.h"
17
18 #include "lexer/token/letters.h"
19 #include "unicode/uchar.h"
20 #include "generated/diagnostic.h"
21
22 #include <iostream>
23
24 namespace ark::es2panda::lexer {
RegExp(util::StringView p,util::StringView f,RegExpFlags reFlags)25 RegExp::RegExp(util::StringView p, util::StringView f, RegExpFlags reFlags) : patternStr(p), flagsStr(f), flags(reFlags)
26 {
27 }
28
RegExpParser(const RegExp & re,ArenaAllocator * allocator,parser::ParserImpl * parser)29 RegExpParser::RegExpParser(const RegExp &re, ArenaAllocator *allocator, parser::ParserImpl *parser)
30 : re_(re), allocator_ {allocator}, iter_(re_.patternStr), parser_(parser)
31 {
32 }
33
Unicode() const34 bool RegExpParser::Unicode() const
35 {
36 return (re_.flags & RegExpFlags::UNICODE) != 0;
37 }
38
Peek() const39 char32_t RegExpParser::Peek() const
40 {
41 return iter_.Peek();
42 }
43
Next()44 char32_t RegExpParser::Next()
45 {
46 return iter_.Next();
47 }
48
IsDecimalDigit(char32_t cp)49 static bool IsDecimalDigit(char32_t cp)
50 {
51 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
52 }
53
IsOctalDigit(char32_t cp)54 static bool IsOctalDigit(char32_t cp)
55 {
56 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_7);
57 }
58
IsHexDigit(char32_t cp)59 static bool IsHexDigit(char32_t cp)
60 {
61 return IsDecimalDigit(cp) || (cp >= LEX_CHAR_LOWERCASE_A && cp <= LEX_CHAR_LOWERCASE_F) ||
62 (cp >= LEX_CHAR_UPPERCASE_A && cp <= LEX_CHAR_UPPERCASE_F);
63 }
64
DigitValue(char32_t cp)65 static uint32_t DigitValue(char32_t cp)
66 {
67 return (cp - LEX_CHAR_0);
68 }
69
HexValue(char32_t cp)70 static uint32_t HexValue(char32_t cp)
71 {
72 if (IsDecimalDigit(cp)) {
73 return DigitValue(cp);
74 }
75
76 constexpr auto OFFSET = 10;
77
78 if (cp < LEX_CHAR_LOWERCASE_A) {
79 return cp - LEX_CHAR_UPPERCASE_A + OFFSET;
80 }
81
82 return (cp - LEX_CHAR_LOWERCASE_A + OFFSET);
83 }
84
ParsePattern()85 void RegExpParser::ParsePattern()
86 {
87 ParseDisjunction();
88
89 if (iter_.HasNext()) {
90 parser_->LogError(diagnostic::INVALID_CLOSING_PARENTHESIS);
91 }
92
93 if (!backReferences_.empty() && !groupNames_.empty()) {
94 for (const auto it : backReferences_) {
95 auto result = groupNames_.find(it);
96 if (result == groupNames_.end()) {
97 parser_->LogError(diagnostic::INVALID_CAPTURING_GROUP);
98 }
99 }
100 }
101 }
102
ParseDisjunction()103 void RegExpParser::ParseDisjunction()
104 {
105 while (true) {
106 ParseAlternatives();
107
108 if (Peek() != LEX_CHAR_VLINE) {
109 break;
110 }
111
112 Next();
113 }
114 }
115
ParseAlternative()116 void RegExpParser::ParseAlternative()
117 {
118 switch (Peek()) {
119 case LEX_CHAR_BACKSLASH: {
120 Next();
121 char32_t cp = Peek();
122 if (cp == LEX_CHAR_LOWERCASE_B || cp == LEX_CHAR_UPPERCASE_B) {
123 /* assertion */
124 Next();
125 return;
126 }
127
128 ParseAtomEscape();
129 break;
130 }
131 case LEX_CHAR_CIRCUMFLEX:
132 case LEX_CHAR_DOLLAR_SIGN: {
133 /* assertion */
134 Next();
135 return;
136 }
137 case LEX_CHAR_LEFT_PAREN: {
138 if (ParseAlternativeCharLeftParen()) {
139 return;
140 }
141 break;
142 }
143 case LEX_CHAR_LEFT_SQUARE: {
144 Next();
145 ParseCharacterClass();
146 break;
147 }
148 case LEX_CHAR_DOT: {
149 Next();
150 break;
151 }
152 default: {
153 if (ParseBracedQuantifier()) {
154 parser_->LogError(diagnostic::INVALID_QUANTIFIER);
155 }
156
157 if (!ParsePatternCharacter()) {
158 parser_->LogError(diagnostic::INVALID_CHAR);
159 }
160
161 break;
162 }
163 }
164
165 ParseQuantifier();
166 }
167
ParseAlternativeCharLeftParen()168 bool RegExpParser::ParseAlternativeCharLeftParen()
169 {
170 Next();
171
172 if (Peek() != LEX_CHAR_QUESTION) {
173 ParseCapturingGroup();
174 return false;
175 }
176
177 Next(); // eat '?'
178
179 char32_t cp = Next();
180 if (cp == LEX_CHAR_COLON) {
181 ParseNonCapturingGroup();
182 return false;
183 }
184
185 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
186 ParseAssertion();
187
188 return Unicode();
189 }
190
191 if (cp != LEX_CHAR_LESS_THAN) {
192 parser_->LogError(diagnostic::INVALID_GROUP);
193 }
194
195 cp = Peek();
196 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
197 Next();
198 ParseAssertion();
199 return true;
200 }
201
202 ParseNamedCapturingGroup();
203 return false;
204 }
205
ParseAlternatives()206 void RegExpParser::ParseAlternatives()
207 {
208 while (iter_.HasNext()) {
209 auto saved = iter_.Save();
210 switch (Peek()) {
211 case util::StringView::Iterator::INVALID_CP:
212 case LEX_CHAR_RIGHT_PAREN:
213 case LEX_CHAR_VLINE: {
214 return;
215 }
216 default: {
217 ParseAlternative();
218 }
219 }
220
221 if (saved == iter_.Save()) {
222 break; // Avoid infinite loop in error processing.
223 }
224 }
225 }
226
ParseNonCapturingGroup()227 void RegExpParser::ParseNonCapturingGroup()
228 {
229 ParseDisjunction();
230
231 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
232 parser_->LogError(diagnostic::INVALID_NON_CAPTURING_GROUP);
233 }
234
235 Next();
236 }
237
ParseNamedCapturingGroup()238 void RegExpParser::ParseNamedCapturingGroup()
239 {
240 util::StringView name = ParseIdent();
241
242 auto result = groupNames_.insert(name);
243 if (!result.second) {
244 parser_->LogError(diagnostic::DUPLICATE_GROUP_NAME);
245 }
246
247 ParseCapturingGroup();
248 }
249
ParseCapturingGroup()250 void RegExpParser::ParseCapturingGroup()
251 {
252 capturingGroupCount_++;
253
254 ParseDisjunction();
255
256 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
257 parser_->LogError(diagnostic::INVALID_CAPTURING_GROUP);
258 }
259
260 Next();
261 }
262
ParseAssertion()263 void RegExpParser::ParseAssertion()
264 {
265 ParseDisjunction();
266
267 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
268 parser_->LogError(diagnostic::INVALID_ASSERT);
269 }
270
271 Next();
272 }
273
ParseControlEscape()274 uint32_t RegExpParser::ParseControlEscape()
275 {
276 char32_t cp = Peek();
277 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
278 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
279 if (Unicode()) {
280 parser_->LogError(diagnostic::INVALID_CONTROL_ESCAPE);
281 }
282
283 if (cp < LEX_CHAR_0 || cp > LEX_CHAR_9) {
284 return LEX_CHAR_LOWERCASE_C;
285 }
286 }
287
288 Next();
289 constexpr auto MODULO = 32;
290 return cp % MODULO;
291 }
292
ParseClassAtomHelper(char32_t cp)293 char32_t RegExpParser::ParseClassAtomHelper(char32_t cp)
294 {
295 switch (cp) {
296 case LEX_CHAR_LOWERCASE_C:
297 return ParseControlEscape();
298 case LEX_CHAR_LOWERCASE_X:
299 return ParseHexEscape();
300 case LEX_CHAR_LOWERCASE_U:
301 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
302 return cp;
303 }
304
305 return ParseUnicodeEscape();
306 case LEX_CHAR_LOWERCASE_P:
307 case LEX_CHAR_UPPERCASE_P: {
308 if (!Unicode()) {
309 return cp;
310 }
311
312 ParseUnicodePropertyEscape();
313 [[fallthrough]];
314 }
315 case LEX_CHAR_LOWERCASE_D:
316 case LEX_CHAR_UPPERCASE_D:
317 case LEX_CHAR_LOWERCASE_S:
318 case LEX_CHAR_UPPERCASE_S:
319 case LEX_CHAR_LOWERCASE_W:
320 case LEX_CHAR_UPPERCASE_W:
321 return std::numeric_limits<uint32_t>::max();
322 case LEX_CHAR_LOWERCASE_B:
323 return LEX_CHAR_BS;
324 case LEX_CHAR_LOWERCASE_F:
325 return LEX_CHAR_FF;
326 case LEX_CHAR_LOWERCASE_N:
327 return LEX_CHAR_LF;
328 case LEX_CHAR_LOWERCASE_R:
329 return LEX_CHAR_CR;
330 case LEX_CHAR_LOWERCASE_T:
331 return LEX_CHAR_TAB;
332 case LEX_CHAR_LOWERCASE_V:
333 return LEX_CHAR_VT;
334 case LEX_CHAR_MINUS:
335 return cp;
336 default:
337 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
338 parser_->LogError(diagnostic::INVALID_ESCAPE);
339 }
340
341 return cp;
342 }
343
344 return cp;
345 }
346
ParseClassAtom()347 char32_t RegExpParser::ParseClassAtom()
348 {
349 char32_t cp = Next();
350 if (cp != LEX_CHAR_BACKSLASH) {
351 return cp;
352 }
353
354 cp = Peek();
355 if (cp == LEX_CHAR_0) {
356 if (!Unicode()) {
357 return ParseDecimalEscape();
358 }
359
360 Next();
361
362 if (IsDecimalDigit(Peek())) {
363 parser_->LogError(diagnostic::INVALID_ESCAPE);
364 }
365
366 return LEX_CHAR_NULL;
367 }
368
369 Next();
370
371 return ParseClassAtomHelper(cp);
372 }
373
// True when `cp` is the marker value (maximum uint32_t) used for class escapes such as \d or \w.
static bool IsClassEscape(uint32_t cp)
{
    constexpr uint32_t CLASS_ESCAPE_MARKER = std::numeric_limits<uint32_t>::max();
    return cp == CLASS_ESCAPE_MARKER;
}
378
ParseCharacterClass()379 void RegExpParser::ParseCharacterClass()
380 {
381 if (Peek() == LEX_CHAR_CIRCUMFLEX) {
382 Next();
383 }
384
385 while (iter_.HasNext()) {
386 if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
387 Next();
388 break;
389 }
390
391 uint32_t left = ParseClassAtom();
392
393 if (Peek() != LEX_CHAR_MINUS) {
394 continue;
395 }
396
397 Next();
398
399 if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
400 Next();
401 break;
402 }
403
404 uint32_t right = ParseClassAtom();
405 if ((IsClassEscape(left) || IsClassEscape(right))) {
406 if (Unicode()) {
407 parser_->LogError(diagnostic::INVALIDE_CHAR_CLASS);
408 }
409
410 continue;
411 }
412
413 if (left > right) {
414 parser_->LogError(diagnostic::CLASS_OUT_OF_ORDER);
415 }
416 }
417 }
418
IsSyntaxCharacter(char32_t cp) const419 bool RegExpParser::IsSyntaxCharacter(char32_t cp) const
420 {
421 switch (cp) {
422 case LEX_CHAR_RIGHT_SQUARE:
423 case LEX_CHAR_LEFT_BRACE:
424 case LEX_CHAR_RIGHT_BRACE: {
425 if (!Unicode()) {
426 return false;
427 }
428
429 [[fallthrough]];
430 }
431 case LEX_CHAR_CIRCUMFLEX:
432 case LEX_CHAR_DOLLAR_SIGN:
433 case LEX_CHAR_BACKSLASH:
434 case LEX_CHAR_DOT:
435 case LEX_CHAR_ASTERISK:
436 case LEX_CHAR_PLUS:
437 case LEX_CHAR_QUESTION:
438 case LEX_CHAR_LEFT_PAREN:
439 case LEX_CHAR_RIGHT_PAREN:
440 case LEX_CHAR_LEFT_SQUARE:
441 case LEX_CHAR_VLINE: {
442 return true;
443 }
444 default: {
445 return false;
446 }
447 }
448 }
449
ParseAtomEscape()450 void RegExpParser::ParseAtomEscape()
451 {
452 char32_t cp = Peek();
453 if (IsDecimalDigit(cp)) {
454 ParseDecimalEscape();
455 return;
456 }
457
458 Next();
459
460 ParseAtomEscapeSwitch(cp);
461 }
462
ParseAtomEscapeSwitch(char32_t cp)463 void RegExpParser::ParseAtomEscapeSwitch(char32_t cp)
464 {
465 switch (cp) {
466 case LEX_CHAR_LOWERCASE_X: {
467 ParseHexEscape();
468 break;
469 }
470 case LEX_CHAR_LOWERCASE_U: {
471 ParseUnicodeEscape();
472 break;
473 }
474 case LEX_CHAR_LOWERCASE_K: {
475 ParseNamedBackreference();
476 return;
477 }
478 /* ControlEscape */
479 case LEX_CHAR_LOWERCASE_F:
480 case LEX_CHAR_LOWERCASE_N:
481 case LEX_CHAR_LOWERCASE_R:
482 case LEX_CHAR_LOWERCASE_T:
483 case LEX_CHAR_LOWERCASE_V:
484 /* CharacterClassEscape */
485 case LEX_CHAR_LOWERCASE_D:
486 case LEX_CHAR_UPPERCASE_D:
487 case LEX_CHAR_LOWERCASE_S:
488 case LEX_CHAR_UPPERCASE_S:
489 case LEX_CHAR_LOWERCASE_W:
490 case LEX_CHAR_UPPERCASE_W: {
491 return;
492 }
493 case LEX_CHAR_LOWERCASE_P:
494 case LEX_CHAR_UPPERCASE_P: {
495 ParseUnicodePropertyEscape();
496 return;
497 }
498 case LEX_CHAR_LOWERCASE_C: {
499 cp = Peek();
500 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
501 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
502 parser_->LogError(diagnostic::INVALID_CONTROL_ESCAPE);
503 }
504
505 Next();
506 return;
507 }
508 default: {
509 /* IdentityEscape */
510 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
511 parser_->LogError(diagnostic::INVALID_ESCAPE);
512 }
513 }
514 }
515 }
516
ParseDecimalEscape()517 uint32_t RegExpParser::ParseDecimalEscape()
518 {
519 ES2PANDA_ASSERT(IsDecimalDigit(Peek()));
520
521 auto digitStart = iter_;
522 uint32_t decimalValue = DigitValue(Next());
523 if (decimalValue == 0) {
524 if (!IsDecimalDigit(Peek())) {
525 /* \0 */
526 return decimalValue;
527 }
528
529 if (Unicode()) {
530 parser_->LogError(diagnostic::INVALID_DECIMAL_ESCAPE);
531 }
532
533 iter_ = digitStart;
534 return ParseLegacyOctalEscape();
535 }
536
537 constexpr auto MULTIPLIER = 10;
538
539 while (IsDecimalDigit(Peek())) {
540 uint32_t newValue = decimalValue * MULTIPLIER + DigitValue(Next());
541 if (newValue < decimalValue) {
542 parser_->LogError(diagnostic::INVALID_DECIMAL_ESCAPE);
543 }
544
545 decimalValue = newValue;
546 }
547
548 if (decimalValue <= capturingGroupCount_) {
549 return decimalValue;
550 }
551
552 if (Unicode()) {
553 parser_->LogError(diagnostic::INVALID_DECIMAL_ESCAPE);
554 }
555
556 iter_ = digitStart;
557
558 if (!IsOctalDigit(Peek())) {
559 /* \8 or \9 */
560 return DigitValue(Next());
561 }
562
563 return ParseLegacyOctalEscape();
564 }
565
ParseLegacyOctalEscape()566 uint32_t RegExpParser::ParseLegacyOctalEscape()
567 {
568 ES2PANDA_ASSERT(IsOctalDigit(Peek()));
569 uint32_t octalValue = DigitValue(Next());
570
571 if (!IsOctalDigit(Peek())) {
572 return octalValue;
573 }
574
575 octalValue = octalValue * 8U + DigitValue(Next());
576
577 if (!IsOctalDigit(Peek())) {
578 return octalValue;
579 }
580
581 uint32_t newValue = octalValue * 8 + DigitValue(Peek());
582 constexpr uint32_t MAX_OCTAL_VALUE = 0xFF;
583
584 if (newValue <= MAX_OCTAL_VALUE) {
585 octalValue = newValue;
586 Next();
587 }
588
589 return octalValue;
590 }
591
ParseHexEscape()592 uint32_t RegExpParser::ParseHexEscape()
593 {
594 char32_t digit = Next();
595 if (!IsHexDigit(digit)) {
596 parser_->LogError(diagnostic::INVALID_HEX_ESCAPE);
597 }
598
599 constexpr auto MULTIPLIER = 16;
600 uint32_t cpValue = HexValue(digit) * MULTIPLIER;
601
602 digit = Next();
603 if (!IsHexDigit(digit)) {
604 parser_->LogError(diagnostic::INVALID_HEX_ESCAPE);
605 }
606
607 cpValue += HexValue(digit);
608 return cpValue;
609 }
610
ParseUnicodeDigits()611 uint32_t RegExpParser::ParseUnicodeDigits()
612 {
613 uint32_t value = 0;
614 uint32_t count = 4;
615
616 while ((count--) != 0U) {
617 char32_t digit = Next();
618 if (!IsHexDigit(digit)) {
619 parser_->LogError(diagnostic::INVALID_UNICODE_ESCAPE);
620 return value;
621 }
622
623 constexpr auto MULTIPLIER = 16;
624 value = value * MULTIPLIER + HexValue(digit);
625 }
626
627 return value;
628 }
629
ParseUnicodeEscape()630 uint32_t RegExpParser::ParseUnicodeEscape()
631 {
632 uint32_t value = 0;
633
634 if (Peek() == LEX_CHAR_LEFT_BRACE) {
635 Next();
636 if (!IsHexDigit(Peek())) {
637 parser_->LogError(diagnostic::INVALID_UNICODE_ESCAPE);
638 }
639
640 while (IsHexDigit(Peek())) {
641 constexpr auto MULTIPLIER = 16;
642 value = value * MULTIPLIER + HexValue(Next());
643 constexpr uint32_t CODE_POINT_MAX = 0x10FFFF;
644
645 if (value > CODE_POINT_MAX) {
646 parser_->LogError(diagnostic::INVALID_UNICODE_ESCAPE);
647 break;
648 }
649 }
650
651 if (Peek() != LEX_CHAR_RIGHT_BRACE) {
652 parser_->LogError(diagnostic::INVALID_UNICODE_ESCAPE);
653 } else { // Error processing.
654 Next();
655 }
656 } else {
657 value = ParseUnicodeDigits();
658 if (!util::StringView::IsHighSurrogate(value)) {
659 return value;
660 }
661
662 auto pos = iter_;
663 if (Next() == LEX_CHAR_BACKSLASH && Next() == LEX_CHAR_LOWERCASE_U) {
664 uint32_t next = ParseUnicodeDigits();
665 if (util::StringView::IsLowSurrogate(next)) {
666 return util::StringView::DecodeSurrogates(value, next);
667 }
668 }
669 iter_ = pos;
670 }
671
672 return value;
673 }
674
ParseUnicodePropertyEscape()675 void RegExpParser::ParseUnicodePropertyEscape()
676 {
677 if (!Unicode()) {
678 return;
679 }
680
681 if (Peek() != LEX_CHAR_LEFT_BRACE) {
682 parser_->LogError(diagnostic::INVALID_UNICODE_PROP_ESCAPE);
683 return;
684 }
685
686 Next();
687
688 while (true) {
689 if (!iter_.HasNext()) {
690 parser_->LogError(diagnostic::UNTERMINATED_UNICODE_PROP_ESCAPE);
691 break;
692 }
693
694 char32_t ch = Next();
695 if (ch == LEX_CHAR_LEFT_BRACE) {
696 break;
697 }
698
699 /* NOTE: Parse and validate Unicode property names */
700 }
701 }
702
ParseNamedBackreference()703 void RegExpParser::ParseNamedBackreference()
704 {
705 if (Next() != LEX_CHAR_LESS_THAN) {
706 if (!Unicode() && groupNames_.empty()) {
707 return;
708 }
709
710 parser_->LogError(diagnostic::INVALID_NAME_BACKREFERENCE);
711 return;
712 }
713
714 if (IsDecimalDigit(Peek())) {
715 return;
716 }
717
718 util::StringView name = ParseIdent();
719 backReferences_.insert(name);
720
721 ValidateNamedBackreference(Unicode());
722 }
723
ValidateNamedBackreference(bool isUnicode)724 void RegExpParser::ValidateNamedBackreference(bool isUnicode)
725 {
726 if (Peek() != LEX_CHAR_LEFT_PAREN || Peek() != LEX_CHAR_BACKSLASH || Peek() != UNICODE_INVALID_CP) {
727 if (!isUnicode) {
728 /* Identity escape */
729 return;
730 }
731
732 if (groupNames_.empty()) {
733 parser_->LogError(diagnostic::INVALID_NAME_BACKREFERENCE);
734 }
735 }
736 }
737
ValidateGroupNameElement(char32_t cp)738 void RegExpParser::ValidateGroupNameElement(char32_t cp)
739 {
740 if (IsDecimalDigit(cp) && !backReferences_.empty()) {
741 parser_->LogError(diagnostic::INVALID_GROUP_NAME);
742 }
743 if (cp == UNICODE_INVALID_CP && !groupNames_.empty()) {
744 parser_->LogError(diagnostic::INVALID_GROUP_NAME);
745 }
746 }
747
ParseQuantifier()748 void RegExpParser::ParseQuantifier()
749 {
750 switch (Peek()) {
751 case LEX_CHAR_ASTERISK:
752 case LEX_CHAR_PLUS:
753 case LEX_CHAR_QUESTION: {
754 Next();
755 break;
756 }
757 case LEX_CHAR_LEFT_BRACE: {
758 if (!ParseBracedQuantifier()) {
759 return;
760 }
761
762 break;
763 }
764 default: {
765 return;
766 }
767 }
768
769 if (Peek() == LEX_CHAR_QUESTION) {
770 Next();
771 }
772 }
773
ParseBracedQuantifier()774 bool RegExpParser::ParseBracedQuantifier()
775 {
776 if (Peek() != LEX_CHAR_LEFT_BRACE) {
777 return false;
778 }
779
780 auto startPos = iter_;
781 Next();
782
783 if (!IsDecimalDigit(Peek())) {
784 iter_ = startPos;
785 return false;
786 }
787
788 uint32_t leftValue = 0;
789 constexpr auto MULTIPLIER = 10;
790
791 while (IsDecimalDigit(Peek())) {
792 uint32_t newValue = leftValue * MULTIPLIER + DigitValue(Next());
793 if (newValue < leftValue) {
794 leftValue = std::numeric_limits<uint32_t>::max();
795 continue;
796 }
797
798 leftValue = newValue;
799 }
800
801 if (Peek() == LEX_CHAR_COMMA) {
802 Next();
803 }
804
805 if (Peek() == LEX_CHAR_RIGHT_BRACE) {
806 Next();
807 return true;
808 }
809
810 if (IsDecimalDigit(Peek())) {
811 uint32_t rightValue = 0;
812 while (IsDecimalDigit(Peek())) {
813 uint32_t newValue = rightValue * MULTIPLIER + DigitValue(Next());
814 if (newValue < rightValue) {
815 rightValue = std::numeric_limits<uint32_t>::max();
816 continue;
817 }
818
819 rightValue = newValue;
820 }
821
822 if (Peek() == LEX_CHAR_RIGHT_BRACE) {
823 if (rightValue < leftValue) {
824 parser_->LogError(diagnostic::QUANTIFIER_OUT_OF_ORDER);
825 }
826
827 Next();
828 return true;
829 }
830 }
831
832 iter_ = startPos;
833 return false;
834 }
835
ParsePatternCharacter()836 bool RegExpParser::ParsePatternCharacter()
837 {
838 char32_t cp = Peek();
839 if (IsSyntaxCharacter(cp)) {
840 return false;
841 }
842
843 Next();
844 return true;
845 }
846
IsIdStart(uint32_t cp)847 static bool IsIdStart(uint32_t cp)
848 {
849 auto uchar = static_cast<UChar>(cp);
850 return u_isIDStart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE ||
851 uchar == LEX_CHAR_BACKSLASH;
852 }
853
IsIdCont(uint32_t cp)854 static bool IsIdCont(uint32_t cp)
855 {
856 auto uchar = static_cast<UChar>(cp);
857 return u_isIDPart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE ||
858 uchar == LEX_CHAR_BACKSLASH || uchar == LEX_CHAR_ZWNJ || uchar == LEX_CHAR_ZWJ;
859 }
860
ParseIdent()861 util::StringView RegExpParser::ParseIdent()
862 {
863 char32_t cp = Next();
864 if (cp == LEX_CHAR_BACKSLASH) {
865 if (Next() != LEX_CHAR_LOWERCASE_U) {
866 parser_->LogError(diagnostic::INVALID_GROUP_NAME);
867 }
868
869 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
870 parser_->LogError(diagnostic::INVALID_UNICODE_ESCAPE);
871 }
872
873 cp = ParseUnicodeEscape();
874 }
875
876 if (!IsIdStart(cp) && cp != UNICODE_INVALID_CP && backReferences_.empty()) {
877 parser_->LogError(diagnostic::INVALID_GROUP_NAME);
878 }
879
880 util::UString ident(allocator_);
881 ident.Append(cp);
882
883 while (true) {
884 cp = Next();
885 if (cp == LEX_CHAR_GREATER_THAN) {
886 break;
887 }
888
889 if (cp == LEX_CHAR_BACKSLASH) {
890 if (Next() != LEX_CHAR_LOWERCASE_U) {
891 parser_->LogError(diagnostic::INVALID_GROUP_NAME);
892 }
893
894 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
895 parser_->LogError(diagnostic::INVALID_UNICODE_ESCAPE);
896 }
897
898 cp = ParseUnicodeEscape();
899 }
900
901 ValidateGroupNameElement(cp);
902
903 if (cp == UNICODE_INVALID_CP) {
904 break;
905 }
906
907 if (!IsIdCont(cp)) {
908 parser_->LogError(diagnostic::INVALID_GROUP_NAME);
909 }
910
911 ident.Append(cp);
912 }
913
914 return ident.View();
915 }
916 } // namespace ark::es2panda::lexer
917