1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "regexp.h"
17
18 #include "lexer/token/letters.h"
19 #include "unicode/uchar.h"
20
21 #include <iostream>
22
23 namespace ark::es2panda::lexer {
RegExp(util::StringView p,util::StringView f,RegExpFlags reFlags)24 RegExp::RegExp(util::StringView p, util::StringView f, RegExpFlags reFlags) : patternStr(p), flagsStr(f), flags(reFlags)
25 {
26 }
27
RegExpParser(const RegExp & re,ArenaAllocator * allocator,const parser::ParserImpl & parser)28 RegExpParser::RegExpParser(const RegExp &re, ArenaAllocator *allocator, const parser::ParserImpl &parser)
29 : re_(re), allocator_ {allocator}, iter_(re_.patternStr), parser_(parser)
30 {
31 }
32
Unicode() const33 bool RegExpParser::Unicode() const
34 {
35 return (re_.flags & RegExpFlags::UNICODE) != 0;
36 }
37
Peek() const38 char32_t RegExpParser::Peek() const
39 {
40 return iter_.Peek();
41 }
42
Next()43 char32_t RegExpParser::Next()
44 {
45 return iter_.Next();
46 }
47
IsDecimalDigit(char32_t cp)48 static bool IsDecimalDigit(char32_t cp)
49 {
50 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
51 }
52
IsOctalDigit(char32_t cp)53 static bool IsOctalDigit(char32_t cp)
54 {
55 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_7);
56 }
57
IsHexDigit(char32_t cp)58 static bool IsHexDigit(char32_t cp)
59 {
60 return IsDecimalDigit(cp) || (cp >= LEX_CHAR_LOWERCASE_A && cp <= LEX_CHAR_LOWERCASE_F) ||
61 (cp >= LEX_CHAR_UPPERCASE_A && cp <= LEX_CHAR_UPPERCASE_F);
62 }
63
DigitValue(char32_t cp)64 static uint32_t DigitValue(char32_t cp)
65 {
66 return (cp - LEX_CHAR_0);
67 }
68
HexValue(char32_t cp)69 static uint32_t HexValue(char32_t cp)
70 {
71 if (IsDecimalDigit(cp)) {
72 return DigitValue(cp);
73 }
74
75 constexpr auto OFFSET = 10;
76
77 if (cp < LEX_CHAR_LOWERCASE_A) {
78 return cp - LEX_CHAR_UPPERCASE_A + OFFSET;
79 }
80
81 return (cp - LEX_CHAR_LOWERCASE_A + OFFSET);
82 }
83
ParsePattern()84 void RegExpParser::ParsePattern()
85 {
86 ParseDisjunction();
87
88 if (iter_.HasNext()) {
89 parser_.ThrowSyntaxError("Invalid closing parenthesis");
90 }
91
92 if (!backReferences_.empty() && !groupNames_.empty()) {
93 for (const auto it : backReferences_) {
94 auto result = groupNames_.find(it);
95 if (result == groupNames_.end()) {
96 parser_.ThrowSyntaxError("Invalid capturing group");
97 }
98 }
99 }
100 }
101
ParseDisjunction()102 void RegExpParser::ParseDisjunction()
103 {
104 while (true) {
105 ParseAlternatives();
106
107 if (Peek() != LEX_CHAR_VLINE) {
108 break;
109 }
110
111 Next();
112 };
113 }
114
ParseAlternative()115 void RegExpParser::ParseAlternative()
116 {
117 switch (Peek()) {
118 case LEX_CHAR_BACKSLASH: {
119 Next();
120 char32_t cp = Peek();
121 if (cp == LEX_CHAR_LOWERCASE_B || cp == LEX_CHAR_UPPERCASE_B) {
122 /* assertion */
123 Next();
124 return;
125 }
126
127 ParseAtomEscape();
128 break;
129 }
130 case LEX_CHAR_CIRCUMFLEX:
131 case LEX_CHAR_DOLLAR_SIGN: {
132 /* assertion */
133 Next();
134 return;
135 }
136 case LEX_CHAR_LEFT_PAREN: {
137 if (ParseAlternativeCharLeftParen()) {
138 return;
139 }
140 break;
141 }
142 case LEX_CHAR_LEFT_SQUARE: {
143 Next();
144 ParseCharacterClass();
145 break;
146 }
147 case LEX_CHAR_DOT: {
148 Next();
149 break;
150 }
151 default: {
152 if (ParseBracedQuantifier()) {
153 parser_.ThrowSyntaxError("Invalid quantifier, nothing to repeat");
154 }
155
156 if (!ParsePatternCharacter()) {
157 parser_.ThrowSyntaxError("Invalid character");
158 }
159
160 break;
161 }
162 }
163
164 ParseQuantifier();
165 }
166
ParseAlternativeCharLeftParen()167 bool RegExpParser::ParseAlternativeCharLeftParen()
168 {
169 Next();
170
171 if (Peek() != LEX_CHAR_QUESTION) {
172 ParseCapturingGroup();
173 return false;
174 }
175
176 Next(); // eat '?'
177
178 char32_t cp = Next();
179 if (cp == LEX_CHAR_COLON) {
180 ParseNonCapturingGroup();
181 return false;
182 }
183
184 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
185 ParseAssertion();
186
187 return Unicode();
188 }
189
190 if (cp != LEX_CHAR_LESS_THAN) {
191 parser_.ThrowSyntaxError("Invalid group");
192 }
193
194 cp = Peek();
195 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
196 Next();
197 ParseAssertion();
198 return true;
199 }
200
201 ParseNamedCapturingGroup();
202 return false;
203 }
204
ParseAlternatives()205 void RegExpParser::ParseAlternatives()
206 {
207 while (true) {
208 switch (Peek()) {
209 case util::StringView::Iterator::INVALID_CP:
210 case LEX_CHAR_RIGHT_PAREN:
211 case LEX_CHAR_VLINE: {
212 return;
213 }
214 default: {
215 ParseAlternative();
216 }
217 }
218 }
219 }
220
ParseNonCapturingGroup()221 void RegExpParser::ParseNonCapturingGroup()
222 {
223 ParseDisjunction();
224
225 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
226 parser_.ThrowSyntaxError("Invalid non-capturing group");
227 }
228
229 Next();
230 }
231
ParseNamedCapturingGroup()232 void RegExpParser::ParseNamedCapturingGroup()
233 {
234 util::StringView name = ParseIdent();
235
236 auto result = groupNames_.insert(name);
237 if (!result.second) {
238 parser_.ThrowSyntaxError("Duplicate group name");
239 }
240
241 ParseCapturingGroup();
242 }
243
ParseCapturingGroup()244 void RegExpParser::ParseCapturingGroup()
245 {
246 capturingGroupCount_++;
247
248 ParseDisjunction();
249
250 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
251 parser_.ThrowSyntaxError("Invalid capturing group");
252 }
253
254 Next();
255 }
256
ParseAssertion()257 void RegExpParser::ParseAssertion()
258 {
259 ParseDisjunction();
260
261 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
262 parser_.ThrowSyntaxError("Invalid assertion");
263 }
264
265 Next();
266 }
267
ParseControlEscape()268 uint32_t RegExpParser::ParseControlEscape()
269 {
270 char32_t cp = Peek();
271 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
272 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
273 if (Unicode()) {
274 parser_.ThrowSyntaxError("Invalid control escape");
275 }
276
277 if (cp < LEX_CHAR_0 || cp > LEX_CHAR_9) {
278 return LEX_CHAR_LOWERCASE_C;
279 }
280 }
281
282 Next();
283 constexpr auto MODULO = 32;
284 return cp % MODULO;
285 }
286
ParseClassAtomHelper(char32_t cp)287 char32_t RegExpParser::ParseClassAtomHelper(char32_t cp)
288 {
289 switch (cp) {
290 case LEX_CHAR_LOWERCASE_C:
291 return ParseControlEscape();
292 case LEX_CHAR_LOWERCASE_X:
293 return ParseHexEscape();
294 case LEX_CHAR_LOWERCASE_U:
295 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
296 return cp;
297 }
298
299 return ParseUnicodeEscape();
300 case LEX_CHAR_LOWERCASE_P:
301 case LEX_CHAR_UPPERCASE_P: {
302 if (!Unicode()) {
303 return cp;
304 }
305
306 ParseUnicodePropertyEscape();
307 [[fallthrough]];
308 }
309 case LEX_CHAR_LOWERCASE_D:
310 case LEX_CHAR_UPPERCASE_D:
311 case LEX_CHAR_LOWERCASE_S:
312 case LEX_CHAR_UPPERCASE_S:
313 case LEX_CHAR_LOWERCASE_W:
314 case LEX_CHAR_UPPERCASE_W:
315 return std::numeric_limits<uint32_t>::max();
316 case LEX_CHAR_LOWERCASE_B:
317 return LEX_CHAR_BS;
318 case LEX_CHAR_LOWERCASE_F:
319 return LEX_CHAR_FF;
320 case LEX_CHAR_LOWERCASE_N:
321 return LEX_CHAR_LF;
322 case LEX_CHAR_LOWERCASE_R:
323 return LEX_CHAR_CR;
324 case LEX_CHAR_LOWERCASE_T:
325 return LEX_CHAR_TAB;
326 case LEX_CHAR_LOWERCASE_V:
327 return LEX_CHAR_VT;
328 case LEX_CHAR_MINUS:
329 return cp;
330 default:
331 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
332 parser_.ThrowSyntaxError("Invalid escape");
333 }
334
335 return cp;
336 }
337
338 return cp;
339 }
340
ParseClassAtom()341 char32_t RegExpParser::ParseClassAtom()
342 {
343 char32_t cp = Next();
344 if (cp != LEX_CHAR_BACKSLASH) {
345 return cp;
346 }
347
348 cp = Peek();
349 if (cp == LEX_CHAR_0) {
350 if (!Unicode()) {
351 return ParseDecimalEscape();
352 }
353
354 Next();
355
356 if (IsDecimalDigit(Peek())) {
357 parser_.ThrowSyntaxError("Invalid escape");
358 }
359
360 return LEX_CHAR_NULL;
361 }
362
363 Next();
364
365 return ParseClassAtomHelper(cp);
366 }
367
// True when `cp` equals the maximum uint32_t value, which the class-atom parsing uses as the
// marker for an escape that denotes a whole character class.
static bool IsClassEscape(uint32_t cp)
{
    constexpr uint32_t CLASS_ESCAPE_MARKER = std::numeric_limits<uint32_t>::max();
    return cp == CLASS_ESCAPE_MARKER;
}
372
ParseCharacterClass()373 void RegExpParser::ParseCharacterClass()
374 {
375 if (Peek() == LEX_CHAR_CIRCUMFLEX) {
376 Next();
377 }
378
379 while (true) {
380 if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
381 Next();
382 break;
383 }
384
385 uint32_t left = ParseClassAtom();
386
387 if (Peek() != LEX_CHAR_MINUS) {
388 continue;
389 }
390
391 Next();
392
393 if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
394 Next();
395 break;
396 }
397
398 uint32_t right = ParseClassAtom();
399 if ((IsClassEscape(left) || IsClassEscape(right))) {
400 if (Unicode()) {
401 parser_.ThrowSyntaxError("Invalid character class");
402 }
403
404 continue;
405 }
406
407 if (left > right) {
408 parser_.ThrowSyntaxError("Class range out of order");
409 }
410 }
411 }
412
IsSyntaxCharacter(char32_t cp) const413 bool RegExpParser::IsSyntaxCharacter(char32_t cp) const
414 {
415 switch (cp) {
416 case LEX_CHAR_RIGHT_SQUARE:
417 case LEX_CHAR_LEFT_BRACE:
418 case LEX_CHAR_RIGHT_BRACE: {
419 if (!Unicode()) {
420 return false;
421 }
422
423 [[fallthrough]];
424 }
425 case LEX_CHAR_CIRCUMFLEX:
426 case LEX_CHAR_DOLLAR_SIGN:
427 case LEX_CHAR_BACKSLASH:
428 case LEX_CHAR_DOT:
429 case LEX_CHAR_ASTERISK:
430 case LEX_CHAR_PLUS:
431 case LEX_CHAR_QUESTION:
432 case LEX_CHAR_LEFT_PAREN:
433 case LEX_CHAR_RIGHT_PAREN:
434 case LEX_CHAR_LEFT_SQUARE:
435 case LEX_CHAR_VLINE: {
436 return true;
437 }
438 default: {
439 return false;
440 }
441 }
442 }
443
ParseAtomEscape()444 void RegExpParser::ParseAtomEscape()
445 {
446 char32_t cp = Peek();
447 if (IsDecimalDigit(cp)) {
448 ParseDecimalEscape();
449 return;
450 }
451
452 Next();
453
454 ParseAtomEscapeSwitch(cp);
455 }
456
ParseAtomEscapeSwitch(char32_t cp)457 void RegExpParser::ParseAtomEscapeSwitch(char32_t cp)
458 {
459 switch (cp) {
460 case LEX_CHAR_LOWERCASE_X: {
461 ParseHexEscape();
462 break;
463 }
464 case LEX_CHAR_LOWERCASE_U: {
465 ParseUnicodeEscape();
466 break;
467 }
468 case LEX_CHAR_LOWERCASE_K: {
469 ParseNamedBackreference();
470 return;
471 }
472 /* ControlEscape */
473 case LEX_CHAR_LOWERCASE_F:
474 case LEX_CHAR_LOWERCASE_N:
475 case LEX_CHAR_LOWERCASE_R:
476 case LEX_CHAR_LOWERCASE_T:
477 case LEX_CHAR_LOWERCASE_V:
478 /* CharacterClassEscape */
479 case LEX_CHAR_LOWERCASE_D:
480 case LEX_CHAR_UPPERCASE_D:
481 case LEX_CHAR_LOWERCASE_S:
482 case LEX_CHAR_UPPERCASE_S:
483 case LEX_CHAR_LOWERCASE_W:
484 case LEX_CHAR_UPPERCASE_W: {
485 return;
486 }
487 case LEX_CHAR_LOWERCASE_P:
488 case LEX_CHAR_UPPERCASE_P: {
489 ParseUnicodePropertyEscape();
490 return;
491 }
492 case LEX_CHAR_LOWERCASE_C: {
493 cp = Peek();
494 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
495 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
496 parser_.ThrowSyntaxError("Invalid control escape");
497 }
498
499 Next();
500 return;
501 }
502 default: {
503 /* IdentityEscape */
504 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
505 parser_.ThrowSyntaxError("Invalid escape");
506 }
507 }
508 }
509 }
510
ParseDecimalEscape()511 uint32_t RegExpParser::ParseDecimalEscape()
512 {
513 ASSERT(IsDecimalDigit(Peek()));
514
515 auto digitStart = iter_;
516 uint32_t decimalValue = DigitValue(Next());
517 if (decimalValue == 0) {
518 if (!IsDecimalDigit(Peek())) {
519 /* \0 */
520 return decimalValue;
521 }
522
523 if (Unicode()) {
524 parser_.ThrowSyntaxError("Invalid decimal escape");
525 }
526
527 iter_ = digitStart;
528 return ParseLegacyOctalEscape();
529 }
530
531 constexpr auto MULTIPLIER = 10;
532
533 while (IsDecimalDigit(Peek())) {
534 uint32_t newValue = decimalValue * MULTIPLIER + DigitValue(Next());
535 if (newValue < decimalValue) {
536 parser_.ThrowSyntaxError("Invalid decimal escape");
537 }
538
539 decimalValue = newValue;
540 }
541
542 if (decimalValue <= capturingGroupCount_) {
543 return decimalValue;
544 }
545
546 if (Unicode()) {
547 parser_.ThrowSyntaxError("Invalid decimal escape");
548 }
549
550 iter_ = digitStart;
551
552 if (!IsOctalDigit(Peek())) {
553 /* \8 or \9 */
554 return DigitValue(Next());
555 }
556
557 return ParseLegacyOctalEscape();
558 }
559
ParseLegacyOctalEscape()560 uint32_t RegExpParser::ParseLegacyOctalEscape()
561 {
562 ASSERT(IsOctalDigit(Peek()));
563 uint32_t octalValue = DigitValue(Next());
564
565 if (!IsOctalDigit(Peek())) {
566 return octalValue;
567 }
568
569 octalValue = octalValue * 8U + DigitValue(Next());
570
571 if (!IsOctalDigit(Peek())) {
572 return octalValue;
573 }
574
575 uint32_t newValue = octalValue * 8 + DigitValue(Peek());
576 constexpr uint32_t MAX_OCTAL_VALUE = 0xFF;
577
578 if (newValue <= MAX_OCTAL_VALUE) {
579 octalValue = newValue;
580 Next();
581 }
582
583 return octalValue;
584 }
585
ParseHexEscape()586 uint32_t RegExpParser::ParseHexEscape()
587 {
588 char32_t digit = Next();
589 if (!IsHexDigit(digit)) {
590 parser_.ThrowSyntaxError("Invalid hex escape");
591 }
592
593 constexpr auto MULTIPLIER = 16;
594 uint32_t cpValue = HexValue(digit) * MULTIPLIER;
595
596 digit = Next();
597 if (!IsHexDigit(digit)) {
598 parser_.ThrowSyntaxError("Invalid hex escape");
599 }
600
601 cpValue += HexValue(digit);
602 return cpValue;
603 }
604
ParseUnicodeDigits()605 uint32_t RegExpParser::ParseUnicodeDigits()
606 {
607 uint32_t value = 0;
608 uint32_t count = 4;
609
610 while ((count--) != 0U) {
611 char32_t digit = Next();
612 if (!IsHexDigit(digit)) {
613 parser_.ThrowSyntaxError("Invalid Unicode escape");
614 }
615
616 constexpr auto MULTIPLIER = 16;
617 value = value * MULTIPLIER + HexValue(digit);
618 }
619
620 return value;
621 }
622
ParseUnicodeEscape()623 uint32_t RegExpParser::ParseUnicodeEscape()
624 {
625 uint32_t value = 0;
626
627 if (Peek() == LEX_CHAR_LEFT_BRACE) {
628 Next();
629 if (!IsHexDigit(Peek())) {
630 parser_.ThrowSyntaxError("Invalid Unicode escape");
631 }
632
633 while (IsHexDigit(Peek())) {
634 constexpr auto MULTIPLIER = 16;
635 value = value * MULTIPLIER + HexValue(Next());
636 constexpr uint32_t CODE_POINT_MAX = 0x10FFFF;
637
638 if (value > CODE_POINT_MAX) {
639 parser_.ThrowSyntaxError("Invalid Unicode escape");
640 }
641 }
642
643 if (Peek() != LEX_CHAR_RIGHT_BRACE) {
644 parser_.ThrowSyntaxError("Invalid Unicode escape");
645 }
646
647 Next();
648 } else {
649 value = ParseUnicodeDigits();
650 if (!util::StringView::IsHighSurrogate(value)) {
651 return value;
652 }
653
654 auto pos = iter_;
655 if (Next() == LEX_CHAR_BACKSLASH && Next() == LEX_CHAR_LOWERCASE_U) {
656 uint32_t next = ParseUnicodeDigits();
657 if (util::StringView::IsLowSurrogate(next)) {
658 return util::StringView::DecodeSurrogates(value, next);
659 }
660 }
661 iter_ = pos;
662 }
663
664 return value;
665 }
666
ParseUnicodePropertyEscape()667 void RegExpParser::ParseUnicodePropertyEscape()
668 {
669 if (!Unicode()) {
670 return;
671 }
672
673 if (Peek() != LEX_CHAR_LEFT_BRACE) {
674 parser_.ThrowSyntaxError("Invalid Unicode property escape");
675 }
676
677 Next();
678
679 while (true) {
680 if (!iter_.HasNext()) {
681 parser_.ThrowSyntaxError("Unterminated Unicode property escape");
682 }
683
684 char32_t ch = Next();
685 if (ch == LEX_CHAR_LEFT_BRACE) {
686 break;
687 }
688
689 /* NOTE: Parse and validate Unicode property names */
690 }
691 }
692
ParseNamedBackreference()693 void RegExpParser::ParseNamedBackreference()
694 {
695 if (Next() != LEX_CHAR_LESS_THAN) {
696 if (!Unicode() && groupNames_.empty()) {
697 return;
698 }
699
700 parser_.ThrowSyntaxError("Invalid named backreference");
701 }
702
703 if (IsDecimalDigit(Peek())) {
704 return;
705 }
706
707 util::StringView name = ParseIdent();
708 backReferences_.insert(name);
709
710 ValidateNamedBackreference(Unicode());
711 }
712
// Check performed right after the name of a `\k<name>` reference has been read.
// With `isUnicode` false the reference is accepted as is; with `isUnicode` true a syntax error
// is raised when no group name has been recorded so far.
// NOTE(review): the outer condition ORs three `!=` comparisons of the same Peek() value, so it
// holds for every code point (assuming the three constants are distinct) and the guard has no
// effect. `&&` may have been intended — confirm the desired behaviour before changing it.
void RegExpParser::ValidateNamedBackreference(bool isUnicode)
{
    if (Peek() != LEX_CHAR_LEFT_PAREN || Peek() != LEX_CHAR_BACKSLASH || Peek() != UNICODE_INVALID_CP) {
        if (!isUnicode) {
            /* Identity escape */
            return;
        }

        if (groupNames_.empty()) {
            parser_.ThrowSyntaxError("Invalid named backreference");
        }
    }
}
726
ValidateGroupNameElement(char32_t cp)727 void RegExpParser::ValidateGroupNameElement(char32_t cp)
728 {
729 if (IsDecimalDigit(cp) && !backReferences_.empty()) {
730 parser_.ThrowSyntaxError("Invalid group name");
731 }
732 if (cp == UNICODE_INVALID_CP && !groupNames_.empty()) {
733 parser_.ThrowSyntaxError("Invalid group name");
734 }
735 }
736
ParseQuantifier()737 void RegExpParser::ParseQuantifier()
738 {
739 switch (Peek()) {
740 case LEX_CHAR_ASTERISK:
741 case LEX_CHAR_PLUS:
742 case LEX_CHAR_QUESTION: {
743 Next();
744 break;
745 }
746 case LEX_CHAR_LEFT_BRACE: {
747 if (!ParseBracedQuantifier()) {
748 return;
749 }
750
751 break;
752 }
753 default: {
754 return;
755 }
756 }
757
758 if (Peek() == LEX_CHAR_QUESTION) {
759 Next();
760 }
761 }
762
ParseBracedQuantifier()763 bool RegExpParser::ParseBracedQuantifier()
764 {
765 if (Peek() != LEX_CHAR_LEFT_BRACE) {
766 return false;
767 }
768
769 auto startPos = iter_;
770 Next();
771
772 if (!IsDecimalDigit(Peek())) {
773 iter_ = startPos;
774 return false;
775 }
776
777 uint32_t leftValue = 0;
778 constexpr auto MULTIPLIER = 10;
779
780 while (IsDecimalDigit(Peek())) {
781 uint32_t newValue = leftValue * MULTIPLIER + DigitValue(Next());
782 if (newValue < leftValue) {
783 leftValue = std::numeric_limits<uint32_t>::max();
784 continue;
785 }
786
787 leftValue = newValue;
788 }
789
790 if (Peek() == LEX_CHAR_COMMA) {
791 Next();
792 }
793
794 if (Peek() == LEX_CHAR_RIGHT_BRACE) {
795 Next();
796 return true;
797 }
798
799 if (IsDecimalDigit(Peek())) {
800 uint32_t rightValue = 0;
801 while (IsDecimalDigit(Peek())) {
802 uint32_t newValue = rightValue * MULTIPLIER + DigitValue(Next());
803 if (newValue < rightValue) {
804 rightValue = std::numeric_limits<uint32_t>::max();
805 continue;
806 }
807
808 rightValue = newValue;
809 }
810
811 if (Peek() == LEX_CHAR_RIGHT_BRACE) {
812 if (rightValue < leftValue) {
813 parser_.ThrowSyntaxError("Quantifier range out of order");
814 }
815
816 Next();
817 return true;
818 }
819 }
820
821 iter_ = startPos;
822 return false;
823 }
824
ParsePatternCharacter()825 bool RegExpParser::ParsePatternCharacter()
826 {
827 char32_t cp = Peek();
828 if (IsSyntaxCharacter(cp)) {
829 return false;
830 }
831
832 Next();
833 return true;
834 }
835
IsIdStart(uint32_t cp)836 static bool IsIdStart(uint32_t cp)
837 {
838 auto uchar = static_cast<UChar>(cp);
839 return u_isIDStart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE ||
840 uchar == LEX_CHAR_BACKSLASH;
841 }
842
IsIdCont(uint32_t cp)843 static bool IsIdCont(uint32_t cp)
844 {
845 auto uchar = static_cast<UChar>(cp);
846 return u_isIDPart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE ||
847 uchar == LEX_CHAR_BACKSLASH || uchar == LEX_CHAR_ZWNJ || uchar == LEX_CHAR_ZWJ;
848 }
849
ParseIdent()850 util::StringView RegExpParser::ParseIdent()
851 {
852 char32_t cp = Next();
853 if (cp == LEX_CHAR_BACKSLASH) {
854 if (Next() != LEX_CHAR_LOWERCASE_U) {
855 parser_.ThrowSyntaxError("Invalid group name");
856 }
857
858 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
859 parser_.ThrowSyntaxError("Invalid Unicode escape");
860 }
861
862 cp = ParseUnicodeEscape();
863 }
864
865 if (!IsIdStart(cp) && cp != UNICODE_INVALID_CP && backReferences_.empty()) {
866 parser_.ThrowSyntaxError("Invalid group name");
867 }
868
869 util::UString ident(allocator_);
870 ident.Append(cp);
871
872 while (true) {
873 cp = Next();
874 if (cp == LEX_CHAR_GREATER_THAN) {
875 break;
876 }
877
878 if (cp == LEX_CHAR_BACKSLASH) {
879 if (Next() != LEX_CHAR_LOWERCASE_U) {
880 parser_.ThrowSyntaxError("Invalid group name");
881 }
882
883 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
884 parser_.ThrowSyntaxError("Invalid Unicode escape");
885 }
886
887 cp = ParseUnicodeEscape();
888 }
889
890 ValidateGroupNameElement(cp);
891
892 if (cp == UNICODE_INVALID_CP) {
893 break;
894 }
895
896 if (!IsIdCont(cp)) {
897 parser_.ThrowSyntaxError("Invalid group name");
898 }
899
900 ident.Append(cp);
901 }
902
903 return ident.View();
904 }
905 } // namespace ark::es2panda::lexer
906