1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
#include "regexp.h"

#include "lexer/token/letters.h"
#include "unicode/uchar.h"

#include <iostream>
#include <limits>
22
23 namespace ark::es2panda::lexer {
RegExpError(std::string_view m)24 RegExpError::RegExpError(std::string_view m) : message(m) {}
25
RegExp(util::StringView p,util::StringView f,RegExpFlags reFlags)26 RegExp::RegExp(util::StringView p, util::StringView f, RegExpFlags reFlags) : patternStr(p), flagsStr(f), flags(reFlags)
27 {
28 }
29
RegExpParser(const RegExp & re,ArenaAllocator * allocator)30 RegExpParser::RegExpParser(const RegExp &re, ArenaAllocator *allocator)
31 : re_(re), allocator_ {allocator}, iter_(re_.patternStr)
32 {
33 }
34
Unicode() const35 bool RegExpParser::Unicode() const
36 {
37 return (re_.flags & RegExpFlags::UNICODE) != 0;
38 }
39
Peek() const40 char32_t RegExpParser::Peek() const
41 {
42 return iter_.Peek();
43 }
44
Next()45 char32_t RegExpParser::Next()
46 {
47 return iter_.Next();
48 }
49
IsDecimalDigit(char32_t cp)50 static bool IsDecimalDigit(char32_t cp)
51 {
52 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_9);
53 }
54
IsOctalDigit(char32_t cp)55 static bool IsOctalDigit(char32_t cp)
56 {
57 return (cp >= LEX_CHAR_0 && cp <= LEX_CHAR_7);
58 }
59
IsHexDigit(char32_t cp)60 static bool IsHexDigit(char32_t cp)
61 {
62 return IsDecimalDigit(cp) || (cp >= LEX_CHAR_LOWERCASE_A && cp <= LEX_CHAR_LOWERCASE_F) ||
63 (cp >= LEX_CHAR_UPPERCASE_A && cp <= LEX_CHAR_UPPERCASE_F);
64 }
65
DigitValue(char32_t cp)66 static uint32_t DigitValue(char32_t cp)
67 {
68 return (cp - LEX_CHAR_0);
69 }
70
HexValue(char32_t cp)71 static uint32_t HexValue(char32_t cp)
72 {
73 if (IsDecimalDigit(cp)) {
74 return DigitValue(cp);
75 }
76
77 constexpr auto OFFSET = 10;
78
79 if (cp < LEX_CHAR_LOWERCASE_A) {
80 return cp - LEX_CHAR_UPPERCASE_A + OFFSET;
81 }
82
83 return (cp - LEX_CHAR_LOWERCASE_A + OFFSET);
84 }
85
ThrowError(std::string_view message)86 static void ThrowError(std::string_view message)
87 {
88 throw RegExpError(message);
89 }
90
ParsePattern()91 void RegExpParser::ParsePattern()
92 {
93 ParseDisjunction();
94
95 if (iter_.HasNext()) {
96 ThrowError("Invalid closing parenthesis");
97 }
98
99 if (!backReferences_.empty() && !groupNames_.empty()) {
100 for (const auto it : backReferences_) {
101 auto result = groupNames_.find(it);
102 if (result == groupNames_.end()) {
103 ThrowError("Invalid capturing group");
104 }
105 }
106 }
107 }
108
ParseDisjunction()109 void RegExpParser::ParseDisjunction()
110 {
111 while (true) {
112 ParseAlternatives();
113
114 if (Peek() != LEX_CHAR_VLINE) {
115 break;
116 }
117
118 Next();
119 };
120 }
121
ParseAlternative()122 void RegExpParser::ParseAlternative()
123 {
124 switch (Peek()) {
125 case LEX_CHAR_BACKSLASH: {
126 Next();
127 char32_t cp = Peek();
128 if (cp == LEX_CHAR_LOWERCASE_B || cp == LEX_CHAR_UPPERCASE_B) {
129 /* assertion */
130 Next();
131 return;
132 }
133
134 ParseAtomEscape();
135 break;
136 }
137 case LEX_CHAR_CIRCUMFLEX:
138 case LEX_CHAR_DOLLAR_SIGN: {
139 /* assertion */
140 Next();
141 return;
142 }
143 case LEX_CHAR_LEFT_PAREN: {
144 if (ParseAlternativeCharLeftParen()) {
145 return;
146 }
147 break;
148 }
149 case LEX_CHAR_LEFT_SQUARE: {
150 Next();
151 ParseCharacterClass();
152 break;
153 }
154 case LEX_CHAR_DOT: {
155 Next();
156 break;
157 }
158 default: {
159 if (ParseBracedQuantifier()) {
160 ThrowError("Invalid quantifier, nothing to repeat");
161 }
162
163 if (!ParsePatternCharacter()) {
164 ThrowError("Invalid character");
165 }
166
167 break;
168 }
169 }
170
171 ParseQuantifier();
172 }
173
ParseAlternativeCharLeftParen()174 bool RegExpParser::ParseAlternativeCharLeftParen()
175 {
176 Next();
177
178 if (Peek() != LEX_CHAR_QUESTION) {
179 ParseCapturingGroup();
180 return false;
181 }
182
183 Next(); // eat '?'
184
185 char32_t cp = Next();
186 if (cp == LEX_CHAR_COLON) {
187 ParseNonCapturingGroup();
188 return false;
189 }
190
191 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
192 ParseAssertion();
193
194 return Unicode();
195 }
196
197 if (cp != LEX_CHAR_LESS_THAN) {
198 ThrowError("Invalid group");
199 }
200
201 cp = Peek();
202 if (cp == LEX_CHAR_EQUALS || cp == LEX_CHAR_EXCLAMATION) {
203 Next();
204 ParseAssertion();
205 return true;
206 }
207
208 ParseNamedCapturingGroup();
209 return false;
210 }
211
ParseAlternatives()212 void RegExpParser::ParseAlternatives()
213 {
214 while (true) {
215 switch (Peek()) {
216 case util::StringView::Iterator::INVALID_CP:
217 case LEX_CHAR_RIGHT_PAREN:
218 case LEX_CHAR_VLINE: {
219 return;
220 }
221 default: {
222 ParseAlternative();
223 }
224 }
225 }
226 }
227
ParseNonCapturingGroup()228 void RegExpParser::ParseNonCapturingGroup()
229 {
230 ParseDisjunction();
231
232 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
233 ThrowError("Invalid non-capturing group");
234 }
235
236 Next();
237 }
238
ParseNamedCapturingGroup()239 void RegExpParser::ParseNamedCapturingGroup()
240 {
241 util::StringView name = ParseIdent();
242
243 auto result = groupNames_.insert(name);
244 if (!result.second) {
245 ThrowError("Duplicate group name");
246 }
247
248 ParseCapturingGroup();
249 }
250
ParseCapturingGroup()251 void RegExpParser::ParseCapturingGroup()
252 {
253 capturingGroupCount_++;
254
255 ParseDisjunction();
256
257 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
258 ThrowError("Invalid capturing group");
259 }
260
261 Next();
262 }
263
ParseAssertion()264 void RegExpParser::ParseAssertion()
265 {
266 ParseDisjunction();
267
268 if (Peek() != LEX_CHAR_RIGHT_PAREN) {
269 ThrowError("Invalid assertion");
270 }
271
272 Next();
273 }
274
ParseControlEscape()275 uint32_t RegExpParser::ParseControlEscape()
276 {
277 char32_t cp = Peek();
278 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
279 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
280 if (Unicode()) {
281 ThrowError("Invalid control escape");
282 }
283
284 if (cp < LEX_CHAR_0 || cp > LEX_CHAR_9) {
285 return LEX_CHAR_LOWERCASE_C;
286 }
287 }
288
289 Next();
290 constexpr auto MODULO = 32;
291 return cp % MODULO;
292 }
293
ParseClassAtom()294 char32_t RegExpParser::ParseClassAtom()
295 {
296 char32_t cp = Next();
297 if (cp != LEX_CHAR_BACKSLASH) {
298 return cp;
299 }
300
301 cp = Peek();
302 if (cp == LEX_CHAR_0) {
303 if (!Unicode()) {
304 return ParseDecimalEscape();
305 }
306
307 Next();
308
309 if (IsDecimalDigit(Peek())) {
310 ThrowError("Invalid class escape");
311 }
312
313 return LEX_CHAR_NULL;
314 }
315
316 Next();
317
318 switch (cp) {
319 case LEX_CHAR_LOWERCASE_C: {
320 return ParseControlEscape();
321 }
322 case LEX_CHAR_LOWERCASE_X: {
323 return ParseHexEscape();
324 }
325 case LEX_CHAR_LOWERCASE_U: {
326 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
327 return cp;
328 }
329
330 return ParseUnicodeEscape();
331 }
332 case LEX_CHAR_LOWERCASE_P:
333 case LEX_CHAR_UPPERCASE_P: {
334 if (!Unicode()) {
335 return cp;
336 }
337
338 ParseUnicodePropertyEscape();
339 [[fallthrough]];
340 }
341 case LEX_CHAR_LOWERCASE_D:
342 case LEX_CHAR_UPPERCASE_D:
343 case LEX_CHAR_LOWERCASE_S:
344 case LEX_CHAR_UPPERCASE_S:
345 case LEX_CHAR_LOWERCASE_W:
346 case LEX_CHAR_UPPERCASE_W: {
347 return std::numeric_limits<uint32_t>::max();
348 }
349 case LEX_CHAR_LOWERCASE_B: {
350 return LEX_CHAR_BS;
351 }
352 case LEX_CHAR_LOWERCASE_F: {
353 return LEX_CHAR_FF;
354 }
355 case LEX_CHAR_LOWERCASE_N: {
356 return LEX_CHAR_LF;
357 }
358 case LEX_CHAR_LOWERCASE_R: {
359 return LEX_CHAR_CR;
360 }
361 case LEX_CHAR_LOWERCASE_T: {
362 return LEX_CHAR_TAB;
363 }
364 case LEX_CHAR_LOWERCASE_V: {
365 return LEX_CHAR_VT;
366 }
367 case LEX_CHAR_MINUS: {
368 return cp;
369 }
370 default: {
371 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
372 ThrowError("Invalid escape");
373 }
374
375 return cp;
376 }
377 }
378
379 return cp;
380 }
381
// True when `cp` is the marker value (maximum uint32_t) that ParseClassAtom returns for class
// escapes.
static bool IsClassEscape(uint32_t cp)
{
    constexpr uint32_t CLASS_ESCAPE_MARKER = std::numeric_limits<uint32_t>::max();
    return cp == CLASS_ESCAPE_MARKER;
}
386
ParseCharacterClass()387 void RegExpParser::ParseCharacterClass()
388 {
389 if (Peek() == LEX_CHAR_CIRCUMFLEX) {
390 Next();
391 }
392
393 while (true) {
394 if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
395 Next();
396 break;
397 }
398
399 uint32_t left = ParseClassAtom();
400
401 if (Peek() != LEX_CHAR_MINUS) {
402 continue;
403 }
404
405 Next();
406
407 if (Peek() == LEX_CHAR_RIGHT_SQUARE) {
408 Next();
409 break;
410 }
411
412 uint32_t right = ParseClassAtom();
413 if ((IsClassEscape(left) || IsClassEscape(right))) {
414 if (Unicode()) {
415 ThrowError("Invalid character class");
416 }
417
418 continue;
419 }
420
421 if (left > right) {
422 ThrowError("Class range out of order");
423 }
424 }
425 }
426
IsSyntaxCharacter(char32_t cp) const427 bool RegExpParser::IsSyntaxCharacter(char32_t cp) const
428 {
429 switch (cp) {
430 case LEX_CHAR_RIGHT_SQUARE:
431 case LEX_CHAR_LEFT_BRACE:
432 case LEX_CHAR_RIGHT_BRACE: {
433 if (!Unicode()) {
434 return false;
435 }
436
437 [[fallthrough]];
438 }
439 case LEX_CHAR_CIRCUMFLEX:
440 case LEX_CHAR_DOLLAR_SIGN:
441 case LEX_CHAR_BACKSLASH:
442 case LEX_CHAR_DOT:
443 case LEX_CHAR_ASTERISK:
444 case LEX_CHAR_PLUS:
445 case LEX_CHAR_QUESTION:
446 case LEX_CHAR_LEFT_PAREN:
447 case LEX_CHAR_RIGHT_PAREN:
448 case LEX_CHAR_LEFT_SQUARE:
449 case LEX_CHAR_VLINE: {
450 return true;
451 }
452 default: {
453 return false;
454 }
455 }
456 }
457
ParseAtomEscape()458 void RegExpParser::ParseAtomEscape()
459 {
460 char32_t cp = Peek();
461 if (IsDecimalDigit(cp)) {
462 ParseDecimalEscape();
463 return;
464 }
465
466 Next();
467
468 ParseAtomEscapeSwitch(cp);
469 }
470
ParseAtomEscapeSwitch(char32_t cp)471 void RegExpParser::ParseAtomEscapeSwitch(char32_t cp)
472 {
473 switch (cp) {
474 case LEX_CHAR_LOWERCASE_X: {
475 ParseHexEscape();
476 break;
477 }
478 case LEX_CHAR_LOWERCASE_U: {
479 ParseUnicodeEscape();
480 break;
481 }
482 case LEX_CHAR_LOWERCASE_K: {
483 ParseNamedBackreference();
484 return;
485 }
486 /* ControlEscape */
487 case LEX_CHAR_LOWERCASE_F:
488 case LEX_CHAR_LOWERCASE_N:
489 case LEX_CHAR_LOWERCASE_R:
490 case LEX_CHAR_LOWERCASE_T:
491 case LEX_CHAR_LOWERCASE_V:
492 /* CharacterClassEscape */
493 case LEX_CHAR_LOWERCASE_D:
494 case LEX_CHAR_UPPERCASE_D:
495 case LEX_CHAR_LOWERCASE_S:
496 case LEX_CHAR_UPPERCASE_S:
497 case LEX_CHAR_LOWERCASE_W:
498 case LEX_CHAR_UPPERCASE_W: {
499 return;
500 }
501 case LEX_CHAR_LOWERCASE_P:
502 case LEX_CHAR_UPPERCASE_P: {
503 ParseUnicodePropertyEscape();
504 return;
505 }
506 case LEX_CHAR_LOWERCASE_C: {
507 cp = Peek();
508 if ((cp < LEX_CHAR_LOWERCASE_A || cp > LEX_CHAR_LOWERCASE_Z) &&
509 (cp < LEX_CHAR_UPPERCASE_A || cp > LEX_CHAR_UPPERCASE_Z)) {
510 ThrowError("Invalid control escape");
511 }
512
513 Next();
514 return;
515 }
516 default: {
517 /* IdentityEscape */
518 if (Unicode() && !IsSyntaxCharacter(cp) && cp != LEX_CHAR_SLASH) {
519 ThrowError("Invalid escape");
520 }
521 }
522 }
523 }
524
ParseDecimalEscape()525 uint32_t RegExpParser::ParseDecimalEscape()
526 {
527 ASSERT(IsDecimalDigit(Peek()));
528
529 auto digitStart = iter_;
530 uint32_t decimalValue = DigitValue(Next());
531 if (decimalValue == 0) {
532 if (!IsDecimalDigit(Peek())) {
533 /* \0 */
534 return decimalValue;
535 }
536
537 if (Unicode()) {
538 ThrowError("Invalid decimal escape");
539 }
540
541 iter_ = digitStart;
542 return ParseLegacyOctalEscape();
543 }
544
545 constexpr auto MULTIPLIER = 10;
546
547 while (IsDecimalDigit(Peek())) {
548 uint32_t newValue = decimalValue * MULTIPLIER + DigitValue(Next());
549 if (newValue < decimalValue) {
550 ThrowError("Invalid decimal escape");
551 }
552
553 decimalValue = newValue;
554 }
555
556 if (decimalValue <= capturingGroupCount_) {
557 return decimalValue;
558 }
559
560 if (Unicode()) {
561 ThrowError("Invalid decimal escape");
562 }
563
564 iter_ = digitStart;
565
566 if (!IsOctalDigit(Peek())) {
567 /* \8 or \9 */
568 return DigitValue(Next());
569 }
570
571 return ParseLegacyOctalEscape();
572 }
573
ParseLegacyOctalEscape()574 uint32_t RegExpParser::ParseLegacyOctalEscape()
575 {
576 ASSERT(IsOctalDigit(Peek()));
577 uint32_t octalValue = DigitValue(Next());
578
579 if (!IsOctalDigit(Peek())) {
580 return octalValue;
581 }
582
583 octalValue = octalValue * 8U + DigitValue(Next());
584
585 if (!IsOctalDigit(Peek())) {
586 return octalValue;
587 }
588
589 uint32_t newValue = octalValue * 8 + DigitValue(Peek());
590 constexpr uint32_t MAX_OCTAL_VALUE = 0xFF;
591
592 if (newValue <= MAX_OCTAL_VALUE) {
593 octalValue = newValue;
594 Next();
595 }
596
597 return octalValue;
598 }
599
ParseHexEscape()600 uint32_t RegExpParser::ParseHexEscape()
601 {
602 char32_t digit = Next();
603 if (!IsHexDigit(digit)) {
604 ThrowError("Invalid hex escape");
605 }
606
607 constexpr auto MULTIPLIER = 16;
608 uint32_t cpValue = HexValue(digit) * MULTIPLIER;
609
610 digit = Next();
611 if (!IsHexDigit(digit)) {
612 ThrowError("Invalid hex escape");
613 }
614
615 cpValue += HexValue(digit);
616 return cpValue;
617 }
618
ParseUnicodeDigits()619 uint32_t RegExpParser::ParseUnicodeDigits()
620 {
621 uint32_t value = 0;
622 uint32_t count = 4;
623
624 while ((count--) != 0U) {
625 char32_t digit = Next();
626 if (!IsHexDigit(digit)) {
627 ThrowError("Invalid Unicode escape");
628 }
629
630 constexpr auto MULTIPLIER = 16;
631 value = value * MULTIPLIER + HexValue(digit);
632 }
633
634 return value;
635 }
636
ParseUnicodeEscape()637 uint32_t RegExpParser::ParseUnicodeEscape()
638 {
639 uint32_t value = 0;
640
641 if (Peek() == LEX_CHAR_LEFT_BRACE) {
642 Next();
643 if (!IsHexDigit(Peek())) {
644 ThrowError("Invalid Unicode escape");
645 }
646
647 while (IsHexDigit(Peek())) {
648 constexpr auto MULTIPLIER = 16;
649 value = value * MULTIPLIER + HexValue(Next());
650 constexpr uint32_t CODE_POINT_MAX = 0x10FFFF;
651
652 if (value > CODE_POINT_MAX) {
653 ThrowError("Invalid Unicode escape");
654 }
655 }
656
657 if (Peek() != LEX_CHAR_RIGHT_BRACE) {
658 ThrowError("Invalid Unicode escape");
659 }
660
661 Next();
662 } else {
663 value = ParseUnicodeDigits();
664 if (!util::StringView::IsHighSurrogate(value)) {
665 return value;
666 }
667
668 auto pos = iter_;
669 if (Next() == LEX_CHAR_BACKSLASH && Next() == LEX_CHAR_LOWERCASE_U) {
670 uint32_t next = ParseUnicodeDigits();
671 if (util::StringView::IsLowSurrogate(next)) {
672 return util::StringView::DecodeSurrogates(value, next);
673 }
674 }
675 iter_ = pos;
676 }
677
678 return value;
679 }
680
ParseUnicodePropertyEscape()681 void RegExpParser::ParseUnicodePropertyEscape()
682 {
683 if (!Unicode()) {
684 return;
685 }
686
687 if (Peek() != LEX_CHAR_LEFT_BRACE) {
688 ThrowError("Invalid Unicode property escape");
689 }
690
691 Next();
692
693 while (true) {
694 if (!iter_.HasNext()) {
695 ThrowError("Unterminated Unicode property escape");
696 }
697
698 char32_t ch = Next();
699 if (ch == LEX_CHAR_LEFT_BRACE) {
700 break;
701 }
702
703 /* NOTE: Parse and validate Unicode property names */
704 }
705 }
706
ParseNamedBackreference()707 void RegExpParser::ParseNamedBackreference()
708 {
709 if (Next() != LEX_CHAR_LESS_THAN) {
710 if (!Unicode() && groupNames_.empty()) {
711 return;
712 }
713
714 ThrowError("Invalid named backreference");
715 }
716
717 if (IsDecimalDigit(Peek())) {
718 return;
719 }
720
721 util::StringView name = ParseIdent();
722 backReferences_.insert(name);
723
724 ValidateNamedBackreference(Unicode());
725 }
726
ValidateNamedBackreference(bool isUnicode)727 void RegExpParser::ValidateNamedBackreference(bool isUnicode)
728 {
729 if (Peek() != LEX_CHAR_LEFT_PAREN || Peek() != LEX_CHAR_BACKSLASH || Peek() != UNICODE_INVALID_CP) {
730 if (!isUnicode) {
731 /* Identity escape */
732 return;
733 }
734
735 if (groupNames_.empty()) {
736 ThrowError("Invalid named backreference");
737 }
738 }
739 }
740
ValidateGroupNameElement(char32_t cp)741 void RegExpParser::ValidateGroupNameElement(char32_t cp)
742 {
743 if (IsDecimalDigit(cp) && !backReferences_.empty()) {
744 ThrowError("Invalid group name");
745 }
746 if (cp == UNICODE_INVALID_CP && !groupNames_.empty()) {
747 ThrowError("Invalid group name");
748 }
749 }
750
ParseQuantifier()751 void RegExpParser::ParseQuantifier()
752 {
753 switch (Peek()) {
754 case LEX_CHAR_ASTERISK:
755 case LEX_CHAR_PLUS:
756 case LEX_CHAR_QUESTION: {
757 Next();
758 break;
759 }
760 case LEX_CHAR_LEFT_BRACE: {
761 if (!ParseBracedQuantifier()) {
762 return;
763 }
764
765 break;
766 }
767 default: {
768 return;
769 }
770 }
771
772 if (Peek() == LEX_CHAR_QUESTION) {
773 Next();
774 }
775 }
776
ParseBracedQuantifier()777 bool RegExpParser::ParseBracedQuantifier()
778 {
779 if (Peek() != LEX_CHAR_LEFT_BRACE) {
780 return false;
781 }
782
783 auto startPos = iter_;
784 Next();
785
786 if (!IsDecimalDigit(Peek())) {
787 iter_ = startPos;
788 return false;
789 }
790
791 uint32_t leftValue = 0;
792 constexpr auto MULTIPLIER = 10;
793
794 while (IsDecimalDigit(Peek())) {
795 uint32_t newValue = leftValue * MULTIPLIER + DigitValue(Next());
796 if (newValue < leftValue) {
797 leftValue = std::numeric_limits<uint32_t>::max();
798 continue;
799 }
800
801 leftValue = newValue;
802 }
803
804 if (Peek() == LEX_CHAR_COMMA) {
805 Next();
806 }
807
808 if (Peek() == LEX_CHAR_RIGHT_BRACE) {
809 Next();
810 return true;
811 }
812
813 if (IsDecimalDigit(Peek())) {
814 uint32_t rightValue = 0;
815 while (IsDecimalDigit(Peek())) {
816 uint32_t newValue = rightValue * MULTIPLIER + DigitValue(Next());
817 if (newValue < rightValue) {
818 rightValue = std::numeric_limits<uint32_t>::max();
819 continue;
820 }
821
822 rightValue = newValue;
823 }
824
825 if (Peek() == LEX_CHAR_RIGHT_BRACE) {
826 if (rightValue < leftValue) {
827 ThrowError("Quantifier range out of order");
828 }
829
830 Next();
831 return true;
832 }
833 }
834
835 iter_ = startPos;
836 return false;
837 }
838
ParsePatternCharacter()839 bool RegExpParser::ParsePatternCharacter()
840 {
841 char32_t cp = Peek();
842 if (IsSyntaxCharacter(cp)) {
843 return false;
844 }
845
846 Next();
847 return true;
848 }
849
IsIdStart(uint32_t cp)850 static bool IsIdStart(uint32_t cp)
851 {
852 auto uchar = static_cast<UChar>(cp);
853 return u_isIDStart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE ||
854 uchar == LEX_CHAR_BACKSLASH;
855 }
856
IsIdCont(uint32_t cp)857 static bool IsIdCont(uint32_t cp)
858 {
859 auto uchar = static_cast<UChar>(cp);
860 return u_isIDPart(uchar) || uchar == LEX_CHAR_DOLLAR_SIGN || uchar == LEX_CHAR_UNDERSCORE ||
861 uchar == LEX_CHAR_BACKSLASH || uchar == LEX_CHAR_ZWNJ || uchar == LEX_CHAR_ZWJ;
862 }
863
ParseIdent()864 util::StringView RegExpParser::ParseIdent()
865 {
866 char32_t cp = Next();
867 if (cp == LEX_CHAR_BACKSLASH) {
868 if (Next() != LEX_CHAR_LOWERCASE_U) {
869 ThrowError("Invalid group name");
870 }
871
872 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
873 ThrowError("Invalid Unicode escape");
874 }
875
876 cp = ParseUnicodeEscape();
877 }
878
879 if (!IsIdStart(cp) && cp != UNICODE_INVALID_CP && backReferences_.empty()) {
880 ThrowError("Invalid group name");
881 }
882
883 util::UString ident(allocator_);
884 ident.Append(cp);
885
886 while (true) {
887 cp = Next();
888 if (cp == LEX_CHAR_GREATER_THAN) {
889 break;
890 }
891
892 if (cp == LEX_CHAR_BACKSLASH) {
893 if (Next() != LEX_CHAR_LOWERCASE_U) {
894 ThrowError("Invalid group name");
895 }
896
897 if (!Unicode() && Peek() == LEX_CHAR_LEFT_BRACE) {
898 ThrowError("Invalid Unicode escape");
899 }
900
901 cp = ParseUnicodeEscape();
902 }
903
904 ValidateGroupNameElement(cp);
905
906 if (cp == UNICODE_INVALID_CP) {
907 break;
908 }
909
910 if (!IsIdCont(cp)) {
911 ThrowError("Invalid group name");
912 }
913
914 ident.Append(cp);
915 }
916
917 return ident.View();
918 }
919 } // namespace ark::es2panda::lexer
920