• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include <vector>
8 
9 #include "src/execution/isolate.h"
10 #include "src/heap/factory.h"
11 #include "src/objects/objects-inl.h"
12 #include "src/regexp/property-sequences.h"
13 #include "src/regexp/regexp-macro-assembler.h"
14 #include "src/regexp/regexp.h"
15 #include "src/strings/char-predicates-inl.h"
16 #include "src/utils/ostreams.h"
17 #include "src/utils/utils.h"
18 #include "src/zone/zone-list-inl.h"
19 
20 #ifdef V8_INTL_SUPPORT
21 #include "unicode/uniset.h"
22 #endif  // V8_INTL_SUPPORT
23 
24 namespace v8 {
25 namespace internal {
26 
RegExpParser(FlatStringReader * in,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)27 RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
28                            Isolate* isolate, Zone* zone)
29     : isolate_(isolate),
30       zone_(zone),
31       captures_(nullptr),
32       named_captures_(nullptr),
33       named_back_references_(nullptr),
34       in_(in),
35       current_(kEndMarker),
36       top_level_flags_(flags),
37       next_pos_(0),
38       captures_started_(0),
39       capture_count_(0),
40       has_more_(true),
41       simple_(false),
42       contains_anchor_(false),
43       is_scanned_for_captures_(false),
44       has_named_captures_(false),
45       failed_(false) {
46   Advance();
47 }
48 
49 template <bool update_position>
ReadNext()50 inline uc32 RegExpParser::ReadNext() {
51   int position = next_pos_;
52   uc32 c0 = in()->Get(position);
53   position++;
54   // Read the whole surrogate pair in case of unicode flag, if possible.
55   if (unicode() && position < in()->length() &&
56       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57     uc16 c1 = in()->Get(position);
58     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60       position++;
61     }
62   }
63   if (update_position) next_pos_ = position;
64   return c0;
65 }
66 
67 
Next()68 uc32 RegExpParser::Next() {
69   if (has_next()) {
70     return ReadNext<false>();
71   } else {
72     return kEndMarker;
73   }
74 }
75 
Advance()76 void RegExpParser::Advance() {
77   if (has_next()) {
78     StackLimitCheck check(isolate());
79     if (check.HasOverflowed()) {
80       if (FLAG_correctness_fuzzer_suppressions) {
81         FATAL("Aborting on stack overflow");
82       }
83       ReportError(RegExpError::kStackOverflow);
84     } else if (zone()->excess_allocation()) {
85       if (FLAG_correctness_fuzzer_suppressions) {
86         FATAL("Aborting on excess zone allocation");
87       }
88       ReportError(RegExpError::kTooLarge);
89     } else {
90       current_ = ReadNext<true>();
91     }
92   } else {
93     current_ = kEndMarker;
94     // Advance so that position() points to 1-after-the-last-character. This is
95     // important so that Reset() to this position works correctly.
96     next_pos_ = in()->length() + 1;
97     has_more_ = false;
98   }
99 }
100 
101 
Reset(int pos)102 void RegExpParser::Reset(int pos) {
103   next_pos_ = pos;
104   has_more_ = (pos < in()->length());
105   Advance();
106 }
107 
Advance(int dist)108 void RegExpParser::Advance(int dist) {
109   next_pos_ += dist - 1;
110   Advance();
111 }
112 
113 
simple()114 bool RegExpParser::simple() { return simple_; }
115 
IsSyntaxCharacterOrSlash(uc32 c)116 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
117   switch (c) {
118     case '^':
119     case '$':
120     case '\\':
121     case '.':
122     case '*':
123     case '+':
124     case '?':
125     case '(':
126     case ')':
127     case '[':
128     case ']':
129     case '{':
130     case '}':
131     case '|':
132     case '/':
133       return true;
134     default:
135       break;
136   }
137   return false;
138 }
139 
ReportError(RegExpError error)140 RegExpTree* RegExpParser::ReportError(RegExpError error) {
141   if (failed_) return nullptr;  // Do not overwrite any existing error.
142   failed_ = true;
143   error_ = error;
144   error_pos_ = position();
145   // Zip to the end to make sure no more input is read.
146   current_ = kEndMarker;
147   next_pos_ = in()->length();
148   return nullptr;
149 }
150 
151 #define CHECK_FAILED /**/);    \
152   if (failed_) return nullptr; \
153   ((void)0
154 
155 // Pattern ::
156 //   Disjunction
ParsePattern()157 RegExpTree* RegExpParser::ParsePattern() {
158   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
159   PatchNamedBackReferences(CHECK_FAILED);
160   DCHECK(!has_more());
161   // If the result of parsing is a literal string atom, and it has the
162   // same length as the input, then the atom is identical to the input.
163   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
164     simple_ = true;
165   }
166   return result;
167 }
168 
169 
170 // Disjunction ::
171 //   Alternative
172 //   Alternative | Disjunction
173 // Alternative ::
174 //   [empty]
175 //   Term Alternative
176 // Term ::
177 //   Assertion
178 //   Atom
179 //   Atom Quantifier
ParseDisjunction()180 RegExpTree* RegExpParser::ParseDisjunction() {
181   // Used to store current state while parsing subexpressions.
182   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
183                                   0, nullptr, top_level_flags_, zone());
184   RegExpParserState* state = &initial_state;
185   // Cache the builder in a local variable for quick access.
186   RegExpBuilder* builder = initial_state.builder();
187   while (true) {
188     switch (current()) {
189       case kEndMarker:
190         if (state->IsSubexpression()) {
191           // Inside a parenthesized group when hitting end of input.
192           return ReportError(RegExpError::kUnterminatedGroup);
193         }
194         DCHECK_EQ(INITIAL, state->group_type());
195         // Parsing completed successfully.
196         return builder->ToRegExp();
197       case ')': {
198         if (!state->IsSubexpression()) {
199           return ReportError(RegExpError::kUnmatchedParen);
200         }
201         DCHECK_NE(INITIAL, state->group_type());
202 
203         Advance();
204         // End disjunction parsing and convert builder content to new single
205         // regexp atom.
206         RegExpTree* body = builder->ToRegExp();
207 
208         int end_capture_index = captures_started();
209 
210         int capture_index = state->capture_index();
211         SubexpressionType group_type = state->group_type();
212 
213         // Build result of subexpression.
214         if (group_type == CAPTURE) {
215           if (state->IsNamedCapture()) {
216             CreateNamedCaptureAtIndex(state->capture_name(),
217                                       capture_index CHECK_FAILED);
218           }
219           RegExpCapture* capture = GetCapture(capture_index);
220           capture->set_body(body);
221           body = capture;
222         } else if (group_type == GROUPING) {
223           body = zone()->New<RegExpGroup>(body);
224         } else {
225           DCHECK(group_type == POSITIVE_LOOKAROUND ||
226                  group_type == NEGATIVE_LOOKAROUND);
227           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
228           body = zone()->New<RegExpLookaround>(
229               body, is_positive, end_capture_index - capture_index,
230               capture_index, state->lookaround_type());
231         }
232 
233         // Restore previous state.
234         state = state->previous_state();
235         builder = state->builder();
236 
237         builder->AddAtom(body);
238         // For compatibility with JSC and ES3, we allow quantifiers after
239         // lookaheads, and break in all cases.
240         break;
241       }
242       case '|': {
243         Advance();
244         builder->NewAlternative();
245         continue;
246       }
247       case '*':
248       case '+':
249       case '?':
250         return ReportError(RegExpError::kNothingToRepeat);
251       case '^': {
252         Advance();
253         if (builder->multiline()) {
254           builder->AddAssertion(zone()->New<RegExpAssertion>(
255               RegExpAssertion::START_OF_LINE, builder->flags()));
256         } else {
257           builder->AddAssertion(zone()->New<RegExpAssertion>(
258               RegExpAssertion::START_OF_INPUT, builder->flags()));
259           set_contains_anchor();
260         }
261         continue;
262       }
263       case '$': {
264         Advance();
265         RegExpAssertion::AssertionType assertion_type =
266             builder->multiline() ? RegExpAssertion::END_OF_LINE
267                                  : RegExpAssertion::END_OF_INPUT;
268         builder->AddAssertion(
269             zone()->New<RegExpAssertion>(assertion_type, builder->flags()));
270         continue;
271       }
272       case '.': {
273         Advance();
274         ZoneList<CharacterRange>* ranges =
275             zone()->New<ZoneList<CharacterRange>>(2, zone());
276 
277         if (builder->dotall()) {
278           // Everything.
279           CharacterRange::AddClassEscape('*', ranges, false, zone());
280         } else {
281           // Everything except \x0A, \x0D, \u2028 and \u2029
282           CharacterRange::AddClassEscape('.', ranges, false, zone());
283         }
284 
285         RegExpCharacterClass* cc =
286             zone()->New<RegExpCharacterClass>(zone(), ranges, builder->flags());
287         builder->AddCharacterClass(cc);
288         break;
289       }
290       case '(': {
291         state = ParseOpenParenthesis(state CHECK_FAILED);
292         builder = state->builder();
293         continue;
294       }
295       case '[': {
296         RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
297         builder->AddCharacterClass(cc->AsCharacterClass());
298         break;
299       }
300       // Atom ::
301       //   \ AtomEscape
302       case '\\':
303         switch (Next()) {
304           case kEndMarker:
305             return ReportError(RegExpError::kEscapeAtEndOfPattern);
306           case 'b':
307             Advance(2);
308             builder->AddAssertion(zone()->New<RegExpAssertion>(
309                 RegExpAssertion::BOUNDARY, builder->flags()));
310             continue;
311           case 'B':
312             Advance(2);
313             builder->AddAssertion(zone()->New<RegExpAssertion>(
314                 RegExpAssertion::NON_BOUNDARY, builder->flags()));
315             continue;
316           // AtomEscape ::
317           //   CharacterClassEscape
318           //
319           // CharacterClassEscape :: one of
320           //   d D s S w W
321           case 'd':
322           case 'D':
323           case 's':
324           case 'S':
325           case 'w':
326           case 'W': {
327             uc32 c = Next();
328             Advance(2);
329             ZoneList<CharacterRange>* ranges =
330                 zone()->New<ZoneList<CharacterRange>>(2, zone());
331             CharacterRange::AddClassEscape(
332                 c, ranges, unicode() && builder->ignore_case(), zone());
333             RegExpCharacterClass* cc = zone()->New<RegExpCharacterClass>(
334                 zone(), ranges, builder->flags());
335             builder->AddCharacterClass(cc);
336             break;
337           }
338           case 'p':
339           case 'P': {
340             uc32 p = Next();
341             Advance(2);
342             if (unicode()) {
343               ZoneList<CharacterRange>* ranges =
344                   zone()->New<ZoneList<CharacterRange>>(2, zone());
345               ZoneVector<char> name_1(zone());
346               ZoneVector<char> name_2(zone());
347               if (ParsePropertyClassName(&name_1, &name_2)) {
348                 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
349                   RegExpCharacterClass* cc = zone()->New<RegExpCharacterClass>(
350                       zone(), ranges, builder->flags());
351                   builder->AddCharacterClass(cc);
352                   break;
353                 }
354                 if (p == 'p' && name_2.empty()) {
355                   RegExpTree* sequence = GetPropertySequence(name_1);
356                   if (sequence != nullptr) {
357                     builder->AddAtom(sequence);
358                     break;
359                   }
360                 }
361               }
362               return ReportError(RegExpError::kInvalidPropertyName);
363             } else {
364               builder->AddCharacter(p);
365             }
366             break;
367           }
368           case '1':
369           case '2':
370           case '3':
371           case '4':
372           case '5':
373           case '6':
374           case '7':
375           case '8':
376           case '9': {
377             int index = 0;
378             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
379             if (is_backref) {
380               if (state->IsInsideCaptureGroup(index)) {
381                 // The back reference is inside the capture group it refers to.
382                 // Nothing can possibly have been captured yet, so we use empty
383                 // instead. This ensures that, when checking a back reference,
384                 // the capture registers of the referenced capture are either
385                 // both set or both cleared.
386                 builder->AddEmpty();
387               } else {
388                 RegExpCapture* capture = GetCapture(index);
389                 RegExpTree* atom =
390                     zone()->New<RegExpBackReference>(capture, builder->flags());
391                 builder->AddAtom(atom);
392               }
393               break;
394             }
395             // With /u, no identity escapes except for syntax characters
396             // are allowed. Otherwise, all identity escapes are allowed.
397             if (unicode()) {
398               return ReportError(RegExpError::kInvalidEscape);
399             }
400             uc32 first_digit = Next();
401             if (first_digit == '8' || first_digit == '9') {
402               builder->AddCharacter(first_digit);
403               Advance(2);
404               break;
405             }
406             V8_FALLTHROUGH;
407           }
408           case '0': {
409             Advance();
410             if (unicode() && Next() >= '0' && Next() <= '9') {
411               // With /u, decimal escape with leading 0 are not parsed as octal.
412               return ReportError(RegExpError::kInvalidDecimalEscape);
413             }
414             uc32 octal = ParseOctalLiteral();
415             builder->AddCharacter(octal);
416             break;
417           }
418           // ControlEscape :: one of
419           //   f n r t v
420           case 'f':
421             Advance(2);
422             builder->AddCharacter('\f');
423             break;
424           case 'n':
425             Advance(2);
426             builder->AddCharacter('\n');
427             break;
428           case 'r':
429             Advance(2);
430             builder->AddCharacter('\r');
431             break;
432           case 't':
433             Advance(2);
434             builder->AddCharacter('\t');
435             break;
436           case 'v':
437             Advance(2);
438             builder->AddCharacter('\v');
439             break;
440           case 'c': {
441             Advance();
442             uc32 controlLetter = Next();
443             // Special case if it is an ASCII letter.
444             // Convert lower case letters to uppercase.
445             uc32 letter = controlLetter & ~('a' ^ 'A');
446             if (letter < 'A' || 'Z' < letter) {
447               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
448               // Read the backslash as a literal character instead of as
449               // starting an escape.
450               // ES#prod-annexB-ExtendedPatternCharacter
451               if (unicode()) {
452                 // With /u, invalid escapes are not treated as identity escapes.
453                 return ReportError(RegExpError::kInvalidUnicodeEscape);
454               }
455               builder->AddCharacter('\\');
456             } else {
457               Advance(2);
458               builder->AddCharacter(controlLetter & 0x1F);
459             }
460             break;
461           }
462           case 'x': {
463             Advance(2);
464             uc32 value;
465             if (ParseHexEscape(2, &value)) {
466               builder->AddCharacter(value);
467             } else if (!unicode()) {
468               builder->AddCharacter('x');
469             } else {
470               // With /u, invalid escapes are not treated as identity escapes.
471               return ReportError(RegExpError::kInvalidEscape);
472             }
473             break;
474           }
475           case 'u': {
476             Advance(2);
477             uc32 value;
478             if (ParseUnicodeEscape(&value)) {
479               builder->AddEscapedUnicodeCharacter(value);
480             } else if (!unicode()) {
481               builder->AddCharacter('u');
482             } else {
483               // With /u, invalid escapes are not treated as identity escapes.
484               return ReportError(RegExpError::kInvalidUnicodeEscape);
485             }
486             break;
487           }
488           case 'k':
489             // Either an identity escape or a named back-reference.  The two
490             // interpretations are mutually exclusive: '\k' is interpreted as
491             // an identity escape for non-Unicode patterns without named
492             // capture groups, and as the beginning of a named back-reference
493             // in all other cases.
494             if (unicode() || HasNamedCaptures()) {
495               Advance(2);
496               ParseNamedBackReference(builder, state CHECK_FAILED);
497               break;
498             }
499             V8_FALLTHROUGH;
500           default:
501             Advance();
502             // With /u, no identity escapes except for syntax characters
503             // are allowed. Otherwise, all identity escapes are allowed.
504             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
505               builder->AddCharacter(current());
506               Advance();
507             } else {
508               return ReportError(RegExpError::kInvalidEscape);
509             }
510             break;
511         }
512         break;
513       case '{': {
514         int dummy;
515         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
516         if (parsed) return ReportError(RegExpError::kNothingToRepeat);
517         V8_FALLTHROUGH;
518       }
519       case '}':
520       case ']':
521         if (unicode()) {
522           return ReportError(RegExpError::kLoneQuantifierBrackets);
523         }
524         V8_FALLTHROUGH;
525       default:
526         builder->AddUnicodeCharacter(current());
527         Advance();
528         break;
529     }  // end switch(current())
530 
531     int min;
532     int max;
533     switch (current()) {
534       // QuantifierPrefix ::
535       //   *
536       //   +
537       //   ?
538       //   {
539       case '*':
540         min = 0;
541         max = RegExpTree::kInfinity;
542         Advance();
543         break;
544       case '+':
545         min = 1;
546         max = RegExpTree::kInfinity;
547         Advance();
548         break;
549       case '?':
550         min = 0;
551         max = 1;
552         Advance();
553         break;
554       case '{':
555         if (ParseIntervalQuantifier(&min, &max)) {
556           if (max < min) {
557             return ReportError(RegExpError::kRangeOutOfOrder);
558           }
559           break;
560         } else if (unicode()) {
561           // With /u, incomplete quantifiers are not allowed.
562           return ReportError(RegExpError::kIncompleteQuantifier);
563         }
564         continue;
565       default:
566         continue;
567     }
568     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
569     if (current() == '?') {
570       quantifier_type = RegExpQuantifier::NON_GREEDY;
571       Advance();
572     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
573       // FLAG_regexp_possessive_quantifier is a debug-only flag.
574       quantifier_type = RegExpQuantifier::POSSESSIVE;
575       Advance();
576     }
577     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
578       return ReportError(RegExpError::kInvalidQuantifier);
579     }
580   }
581 }
582 
ParseOpenParenthesis(RegExpParserState * state)583 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
584     RegExpParserState* state) {
585   RegExpLookaround::Type lookaround_type = state->lookaround_type();
586   bool is_named_capture = false;
587   JSRegExp::Flags switch_on = JSRegExp::kNone;
588   JSRegExp::Flags switch_off = JSRegExp::kNone;
589   const ZoneVector<uc16>* capture_name = nullptr;
590   SubexpressionType subexpr_type = CAPTURE;
591   Advance();
592   if (current() == '?') {
593     switch (Next()) {
594       case ':':
595         Advance(2);
596         subexpr_type = GROUPING;
597         break;
598       case '=':
599         Advance(2);
600         lookaround_type = RegExpLookaround::LOOKAHEAD;
601         subexpr_type = POSITIVE_LOOKAROUND;
602         break;
603       case '!':
604         Advance(2);
605         lookaround_type = RegExpLookaround::LOOKAHEAD;
606         subexpr_type = NEGATIVE_LOOKAROUND;
607         break;
608       case '-':
609       case 'i':
610       case 's':
611       case 'm': {
612         if (!FLAG_regexp_mode_modifiers) {
613           ReportError(RegExpError::kInvalidGroup);
614           return nullptr;
615         }
616         Advance();
617         bool flags_sense = true;  // Switching on flags.
618         while (subexpr_type != GROUPING) {
619           switch (current()) {
620             case '-':
621               if (!flags_sense) {
622                 ReportError(RegExpError::kMultipleFlagDashes);
623                 return nullptr;
624               }
625               flags_sense = false;
626               Advance();
627               continue;
628             case 's':
629             case 'i':
630             case 'm': {
631               JSRegExp::Flags bit = JSRegExp::kUnicode;
632               if (current() == 'i') bit = JSRegExp::kIgnoreCase;
633               if (current() == 'm') bit = JSRegExp::kMultiline;
634               if (current() == 's') bit = JSRegExp::kDotAll;
635               if (((switch_on | switch_off) & bit) != 0) {
636                 ReportError(RegExpError::kRepeatedFlag);
637                 return nullptr;
638               }
639               if (flags_sense) {
640                 switch_on |= bit;
641               } else {
642                 switch_off |= bit;
643               }
644               Advance();
645               continue;
646             }
647             case ')': {
648               Advance();
649               state->builder()
650                   ->FlushText();  // Flush pending text using old flags.
651               // These (?i)-style flag switches don't put us in a subexpression
652               // at all, they just modify the flags in the rest of the current
653               // subexpression.
654               JSRegExp::Flags flags =
655                   (state->builder()->flags() | switch_on) & ~switch_off;
656               state->builder()->set_flags(flags);
657               return state;
658             }
659             case ':':
660               Advance();
661               subexpr_type = GROUPING;  // Will break us out of the outer loop.
662               continue;
663             default:
664               ReportError(RegExpError::kInvalidFlagGroup);
665               return nullptr;
666           }
667         }
668         break;
669       }
670       case '<':
671         Advance();
672         if (Next() == '=') {
673           Advance(2);
674           lookaround_type = RegExpLookaround::LOOKBEHIND;
675           subexpr_type = POSITIVE_LOOKAROUND;
676           break;
677         } else if (Next() == '!') {
678           Advance(2);
679           lookaround_type = RegExpLookaround::LOOKBEHIND;
680           subexpr_type = NEGATIVE_LOOKAROUND;
681           break;
682         }
683         is_named_capture = true;
684         has_named_captures_ = true;
685         Advance();
686         break;
687       default:
688         ReportError(RegExpError::kInvalidGroup);
689         return nullptr;
690     }
691   }
692   if (subexpr_type == CAPTURE) {
693     if (captures_started_ >= JSRegExp::kMaxCaptures) {
694       ReportError(RegExpError::kTooManyCaptures);
695       return nullptr;
696     }
697     captures_started_++;
698 
699     if (is_named_capture) {
700       capture_name = ParseCaptureGroupName(CHECK_FAILED);
701     }
702   }
703   JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
704   // Store current state and begin new disjunction parsing.
705   return zone()->New<RegExpParserState>(state, subexpr_type, lookaround_type,
706                                         captures_started_, capture_name, flags,
707                                         zone());
708 }
709 
710 #ifdef DEBUG
711 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)712 static bool IsSpecialClassEscape(uc32 c) {
713   switch (c) {
714     case 'd':
715     case 'D':
716     case 's':
717     case 'S':
718     case 'w':
719     case 'W':
720       return true;
721     default:
722       return false;
723   }
724 }
725 #endif
726 
727 
728 // In order to know whether an escape is a backreference or not we have to scan
729 // the entire regexp and find the number of capturing parentheses.  However we
730 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
731 // is called when needed.  It can see the difference between capturing and
732 // noncapturing parentheses and can skip character classes and backslash-escaped
733 // characters.
ScanForCaptures()734 void RegExpParser::ScanForCaptures() {
735   DCHECK(!is_scanned_for_captures_);
736   const int saved_position = position();
737   // Start with captures started previous to current position
738   int capture_count = captures_started();
739   // Add count of captures after this position.
740   int n;
741   while ((n = current()) != kEndMarker) {
742     Advance();
743     switch (n) {
744       case '\\':
745         Advance();
746         break;
747       case '[': {
748         int c;
749         while ((c = current()) != kEndMarker) {
750           Advance();
751           if (c == '\\') {
752             Advance();
753           } else {
754             if (c == ']') break;
755           }
756         }
757         break;
758       }
759       case '(':
760         if (current() == '?') {
761           // At this point we could be in
762           // * a non-capturing group '(:',
763           // * a lookbehind assertion '(?<=' '(?<!'
764           // * or a named capture '(?<'.
765           //
766           // Of these, only named captures are capturing groups.
767 
768           Advance();
769           if (current() != '<') break;
770 
771           Advance();
772           if (current() == '=' || current() == '!') break;
773 
774           // Found a possible named capture. It could turn out to be a syntax
775           // error (e.g. an unterminated or invalid name), but that distinction
776           // does not matter for our purposes.
777           has_named_captures_ = true;
778         }
779         capture_count++;
780         break;
781     }
782   }
783   capture_count_ = capture_count;
784   is_scanned_for_captures_ = true;
785   Reset(saved_position);
786 }
787 
788 
ParseBackReferenceIndex(int * index_out)789 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
790   DCHECK_EQ('\\', current());
791   DCHECK('1' <= Next() && Next() <= '9');
792   // Try to parse a decimal literal that is no greater than the total number
793   // of left capturing parentheses in the input.
794   int start = position();
795   int value = Next() - '0';
796   Advance(2);
797   while (true) {
798     uc32 c = current();
799     if (IsDecimalDigit(c)) {
800       value = 10 * value + (c - '0');
801       if (value > JSRegExp::kMaxCaptures) {
802         Reset(start);
803         return false;
804       }
805       Advance();
806     } else {
807       break;
808     }
809   }
810   if (value > captures_started()) {
811     if (!is_scanned_for_captures_) ScanForCaptures();
812     if (value > capture_count_) {
813       Reset(start);
814       return false;
815     }
816   }
817   *index_out = value;
818   return true;
819 }
820 
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)821 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
822   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
823     v->push_back(code_unit);
824   } else {
825     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
826     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
827   }
828 }
829 
ParseCaptureGroupName()830 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
831   ZoneVector<uc16>* name = zone()->New<ZoneVector<uc16>>(zone());
832 
833   bool at_start = true;
834   while (true) {
835     uc32 c = current();
836     Advance();
837 
838     // Convert unicode escapes.
839     if (c == '\\' && current() == 'u') {
840       Advance();
841       if (!ParseUnicodeEscape(&c)) {
842         ReportError(RegExpError::kInvalidUnicodeEscape);
843         return nullptr;
844       }
845     }
846 
847     // The backslash char is misclassified as both ID_Start and ID_Continue.
848     if (c == '\\') {
849       ReportError(RegExpError::kInvalidCaptureGroupName);
850       return nullptr;
851     }
852 
853     if (at_start) {
854       if (!IsIdentifierStart(c)) {
855         ReportError(RegExpError::kInvalidCaptureGroupName);
856         return nullptr;
857       }
858       push_code_unit(name, c);
859       at_start = false;
860     } else {
861       if (c == '>') {
862         break;
863       } else if (IsIdentifierPart(c)) {
864         push_code_unit(name, c);
865       } else {
866         ReportError(RegExpError::kInvalidCaptureGroupName);
867         return nullptr;
868       }
869     }
870   }
871 
872   return name;
873 }
874 
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)875 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
876                                              int index) {
877   DCHECK(0 < index && index <= captures_started_);
878   DCHECK_NOT_NULL(name);
879 
880   RegExpCapture* capture = GetCapture(index);
881   DCHECK_NULL(capture->name());
882 
883   capture->set_name(name);
884 
885   if (named_captures_ == nullptr) {
886     named_captures_ =
887         zone_->New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(zone());
888   } else {
889     // Check for duplicates and bail if we find any.
890 
891     const auto& named_capture_it = named_captures_->find(capture);
892     if (named_capture_it != named_captures_->end()) {
893       ReportError(RegExpError::kDuplicateCaptureGroupName);
894       return false;
895     }
896   }
897 
898   named_captures_->emplace(capture);
899 
900   return true;
901 }
902 
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)903 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
904                                            RegExpParserState* state) {
905   // The parser is assumed to be on the '<' in \k<name>.
906   if (current() != '<') {
907     ReportError(RegExpError::kInvalidNamedReference);
908     return false;
909   }
910 
911   Advance();
912   const ZoneVector<uc16>* name = ParseCaptureGroupName();
913   if (name == nullptr) {
914     return false;
915   }
916 
917   if (state->IsInsideCaptureGroup(name)) {
918     builder->AddEmpty();
919   } else {
920     RegExpBackReference* atom =
921         zone()->New<RegExpBackReference>(builder->flags());
922     atom->set_name(name);
923 
924     builder->AddAtom(atom);
925 
926     if (named_back_references_ == nullptr) {
927       named_back_references_ =
928           zone()->New<ZoneList<RegExpBackReference*>>(1, zone());
929     }
930     named_back_references_->Add(atom, zone());
931   }
932 
933   return true;
934 }
935 
PatchNamedBackReferences()936 void RegExpParser::PatchNamedBackReferences() {
937   if (named_back_references_ == nullptr) return;
938 
939   if (named_captures_ == nullptr) {
940     ReportError(RegExpError::kInvalidNamedCaptureReference);
941     return;
942   }
943 
944   // Look up and patch the actual capture for each named back reference.
945 
946   for (int i = 0; i < named_back_references_->length(); i++) {
947     RegExpBackReference* ref = named_back_references_->at(i);
948 
949     // Capture used to search the named_captures_ by name, index of the
950     // capture is never used.
951     static const int kInvalidIndex = 0;
952     RegExpCapture* search_capture = zone()->New<RegExpCapture>(kInvalidIndex);
953     DCHECK_NULL(search_capture->name());
954     search_capture->set_name(ref->name());
955 
956     int index = -1;
957     const auto& capture_it = named_captures_->find(search_capture);
958     if (capture_it != named_captures_->end()) {
959       index = (*capture_it)->index();
960     } else {
961       ReportError(RegExpError::kInvalidNamedCaptureReference);
962       return;
963     }
964 
965     ref->set_capture(GetCapture(index));
966   }
967 }
968 
GetCapture(int index)969 RegExpCapture* RegExpParser::GetCapture(int index) {
970   // The index for the capture groups are one-based. Its index in the list is
971   // zero-based.
972   int know_captures =
973       is_scanned_for_captures_ ? capture_count_ : captures_started_;
974   DCHECK(index <= know_captures);
975   if (captures_ == nullptr) {
976     captures_ = zone()->New<ZoneList<RegExpCapture*>>(know_captures, zone());
977   }
978   while (captures_->length() < know_captures) {
979     captures_->Add(zone()->New<RegExpCapture>(captures_->length() + 1), zone());
980   }
981   return captures_->at(index - 1);
982 }
983 
984 namespace {
985 
986 struct RegExpCaptureIndexLess {
operator ()v8::internal::__anonb7f600e30111::RegExpCaptureIndexLess987   bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
988     DCHECK_NOT_NULL(lhs);
989     DCHECK_NOT_NULL(rhs);
990     return lhs->index() < rhs->index();
991   }
992 };
993 
994 }  // namespace
995 
CreateCaptureNameMap()996 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
997   if (named_captures_ == nullptr || named_captures_->empty()) {
998     return Handle<FixedArray>();
999   }
1000 
1001   // Named captures are sorted by name (because the set is used to ensure
1002   // name uniqueness). But the capture name map must to be sorted by index.
1003 
1004   ZoneVector<RegExpCapture*> sorted_named_captures(
1005       named_captures_->begin(), named_captures_->end(), zone());
1006   std::sort(sorted_named_captures.begin(), sorted_named_captures.end(),
1007             RegExpCaptureIndexLess{});
1008   DCHECK_EQ(sorted_named_captures.size(), named_captures_->size());
1009 
1010   Factory* factory = isolate()->factory();
1011 
1012   int len = static_cast<int>(sorted_named_captures.size()) * 2;
1013   Handle<FixedArray> array = factory->NewFixedArray(len);
1014 
1015   int i = 0;
1016   for (const auto& capture : sorted_named_captures) {
1017     Vector<const uc16> capture_name(capture->name()->data(),
1018                                     capture->name()->size());
1019     // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
1020     // internalized so they can be used as property names in the 'exec' results.
1021     Handle<String> name = factory->InternalizeString(capture_name);
1022     array->set(i * 2, *name);
1023     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
1024 
1025     i++;
1026   }
1027   DCHECK_EQ(i * 2, len);
1028 
1029   return array;
1030 }
1031 
HasNamedCaptures()1032 bool RegExpParser::HasNamedCaptures() {
1033   if (has_named_captures_ || is_scanned_for_captures_) {
1034     return has_named_captures_;
1035   }
1036 
1037   ScanForCaptures();
1038   DCHECK(is_scanned_for_captures_);
1039   return has_named_captures_;
1040 }
1041 
IsInsideCaptureGroup(int index)1042 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1043   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1044     if (s->group_type() != CAPTURE) continue;
1045     // Return true if we found the matching capture index.
1046     if (index == s->capture_index()) return true;
1047     // Abort if index is larger than what has been parsed up till this state.
1048     if (index > s->capture_index()) return false;
1049   }
1050   return false;
1051 }
1052 
IsInsideCaptureGroup(const ZoneVector<uc16> * name)1053 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1054     const ZoneVector<uc16>* name) {
1055   DCHECK_NOT_NULL(name);
1056   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1057     if (s->capture_name() == nullptr) continue;
1058     if (*s->capture_name() == *name) return true;
1059   }
1060   return false;
1061 }
1062 
1063 // QuantifierPrefix ::
1064 //   { DecimalDigits }
1065 //   { DecimalDigits , }
1066 //   { DecimalDigits , DecimalDigits }
1067 //
1068 // Returns true if parsing succeeds, and set the min_out and max_out
1069 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)1070 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1071   DCHECK_EQ(current(), '{');
1072   int start = position();
1073   Advance();
1074   int min = 0;
1075   if (!IsDecimalDigit(current())) {
1076     Reset(start);
1077     return false;
1078   }
1079   while (IsDecimalDigit(current())) {
1080     int next = current() - '0';
1081     if (min > (RegExpTree::kInfinity - next) / 10) {
1082       // Overflow. Skip past remaining decimal digits and return -1.
1083       do {
1084         Advance();
1085       } while (IsDecimalDigit(current()));
1086       min = RegExpTree::kInfinity;
1087       break;
1088     }
1089     min = 10 * min + next;
1090     Advance();
1091   }
1092   int max = 0;
1093   if (current() == '}') {
1094     max = min;
1095     Advance();
1096   } else if (current() == ',') {
1097     Advance();
1098     if (current() == '}') {
1099       max = RegExpTree::kInfinity;
1100       Advance();
1101     } else {
1102       while (IsDecimalDigit(current())) {
1103         int next = current() - '0';
1104         if (max > (RegExpTree::kInfinity - next) / 10) {
1105           do {
1106             Advance();
1107           } while (IsDecimalDigit(current()));
1108           max = RegExpTree::kInfinity;
1109           break;
1110         }
1111         max = 10 * max + next;
1112         Advance();
1113       }
1114       if (current() != '}') {
1115         Reset(start);
1116         return false;
1117       }
1118       Advance();
1119     }
1120   } else {
1121     Reset(start);
1122     return false;
1123   }
1124   *min_out = min;
1125   *max_out = max;
1126   return true;
1127 }
1128 
1129 
ParseOctalLiteral()1130 uc32 RegExpParser::ParseOctalLiteral() {
1131   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1132   // For compatibility with some other browsers (not all), we parse
1133   // up to three octal digits with a value below 256.
1134   // ES#prod-annexB-LegacyOctalEscapeSequence
1135   uc32 value = current() - '0';
1136   Advance();
1137   if ('0' <= current() && current() <= '7') {
1138     value = value * 8 + current() - '0';
1139     Advance();
1140     if (value < 32 && '0' <= current() && current() <= '7') {
1141       value = value * 8 + current() - '0';
1142       Advance();
1143     }
1144   }
1145   return value;
1146 }
1147 
1148 
ParseHexEscape(int length,uc32 * value)1149 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1150   int start = position();
1151   uc32 val = 0;
1152   for (int i = 0; i < length; ++i) {
1153     uc32 c = current();
1154     int d = HexValue(c);
1155     if (d < 0) {
1156       Reset(start);
1157       return false;
1158     }
1159     val = val * 16 + d;
1160     Advance();
1161   }
1162   *value = val;
1163   return true;
1164 }
1165 
1166 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1167 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1168   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1169   // allowed). In the latter case, the number of hex digits between { } is
1170   // arbitrary. \ and u have already been read.
1171   if (current() == '{' && unicode()) {
1172     int start = position();
1173     Advance();
1174     if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1175       if (current() == '}') {
1176         Advance();
1177         return true;
1178       }
1179     }
1180     Reset(start);
1181     return false;
1182   }
1183   // \u but no {, or \u{...} escapes not allowed.
1184   bool result = ParseHexEscape(4, value);
1185   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1186       current() == '\\') {
1187     // Attempt to read trail surrogate.
1188     int start = position();
1189     if (Next() == 'u') {
1190       Advance(2);
1191       uc32 trail;
1192       if (ParseHexEscape(4, &trail) &&
1193           unibrow::Utf16::IsTrailSurrogate(trail)) {
1194         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1195                                                       static_cast<uc16>(trail));
1196         return true;
1197       }
1198     }
1199     Reset(start);
1200   }
1201   return result;
1202 }
1203 
1204 #ifdef V8_INTL_SUPPORT
1205 
1206 namespace {
1207 
IsExactPropertyAlias(const char * property_name,UProperty property)1208 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1209   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1210   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1211     return true;
1212   for (int i = 0;; i++) {
1213     const char* long_name = u_getPropertyName(
1214         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1215     if (long_name == nullptr) break;
1216     if (strcmp(property_name, long_name) == 0) return true;
1217   }
1218   return false;
1219 }
1220 
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1221 bool IsExactPropertyValueAlias(const char* property_value_name,
1222                                UProperty property, int32_t property_value) {
1223   const char* short_name =
1224       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1225   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1226     return true;
1227   }
1228   for (int i = 0;; i++) {
1229     const char* long_name = u_getPropertyValueName(
1230         property, property_value,
1231         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1232     if (long_name == nullptr) break;
1233     if (strcmp(property_value_name, long_name) == 0) return true;
1234   }
1235   return false;
1236 }
1237 
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1238 bool LookupPropertyValueName(UProperty property,
1239                              const char* property_value_name, bool negate,
1240                              ZoneList<CharacterRange>* result, Zone* zone) {
1241   UProperty property_for_lookup = property;
1242   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1243     // For the property Script_Extensions, we have to do the property value
1244     // name lookup as if the property is Script.
1245     property_for_lookup = UCHAR_SCRIPT;
1246   }
1247   int32_t property_value =
1248       u_getPropertyValueEnum(property_for_lookup, property_value_name);
1249   if (property_value == UCHAR_INVALID_CODE) return false;
1250 
1251   // We require the property name to match exactly to one of the property value
1252   // aliases. However, u_getPropertyValueEnum uses loose matching.
1253   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1254                                  property_value)) {
1255     return false;
1256   }
1257 
1258   UErrorCode ec = U_ZERO_ERROR;
1259   icu::UnicodeSet set;
1260   set.applyIntPropertyValue(property, property_value, ec);
1261   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1262 
1263   if (success) {
1264     set.removeAllStrings();
1265     if (negate) set.complement();
1266     for (int i = 0; i < set.getRangeCount(); i++) {
1267       result->Add(
1268           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1269           zone);
1270     }
1271   }
1272   return success;
1273 }
1274 
1275 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1276 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1277   return strncmp(name, literal, N + 1) == 0;
1278 }
1279 
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1280 bool LookupSpecialPropertyValueName(const char* name,
1281                                     ZoneList<CharacterRange>* result,
1282                                     bool negate, Zone* zone) {
1283   if (NameEquals(name, "Any")) {
1284     if (negate) {
1285       // Leave the list of character ranges empty, since the negation of 'Any'
1286       // is the empty set.
1287     } else {
1288       result->Add(CharacterRange::Everything(), zone);
1289     }
1290   } else if (NameEquals(name, "ASCII")) {
1291     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1292                        : CharacterRange::Range(0x0, 0x7F),
1293                 zone);
1294   } else if (NameEquals(name, "Assigned")) {
1295     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1296                                    !negate, result, zone);
1297   } else {
1298     return false;
1299   }
1300   return true;
1301 }
1302 
1303 // Explicitly allowlist supported binary properties. The spec forbids supporting
1304 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1305 bool IsSupportedBinaryProperty(UProperty property) {
1306   switch (property) {
1307     case UCHAR_ALPHABETIC:
1308     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1309     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1310     case UCHAR_ASCII_HEX_DIGIT:
1311     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1312     case UCHAR_BIDI_CONTROL:
1313     case UCHAR_BIDI_MIRRORED:
1314     case UCHAR_CASE_IGNORABLE:
1315     case UCHAR_CASED:
1316     case UCHAR_CHANGES_WHEN_CASEFOLDED:
1317     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1318     case UCHAR_CHANGES_WHEN_LOWERCASED:
1319     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1320     case UCHAR_CHANGES_WHEN_TITLECASED:
1321     case UCHAR_CHANGES_WHEN_UPPERCASED:
1322     case UCHAR_DASH:
1323     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1324     case UCHAR_DEPRECATED:
1325     case UCHAR_DIACRITIC:
1326     case UCHAR_EMOJI:
1327     case UCHAR_EMOJI_COMPONENT:
1328     case UCHAR_EMOJI_MODIFIER_BASE:
1329     case UCHAR_EMOJI_MODIFIER:
1330     case UCHAR_EMOJI_PRESENTATION:
1331     case UCHAR_EXTENDED_PICTOGRAPHIC:
1332     case UCHAR_EXTENDER:
1333     case UCHAR_GRAPHEME_BASE:
1334     case UCHAR_GRAPHEME_EXTEND:
1335     case UCHAR_HEX_DIGIT:
1336     case UCHAR_ID_CONTINUE:
1337     case UCHAR_ID_START:
1338     case UCHAR_IDEOGRAPHIC:
1339     case UCHAR_IDS_BINARY_OPERATOR:
1340     case UCHAR_IDS_TRINARY_OPERATOR:
1341     case UCHAR_JOIN_CONTROL:
1342     case UCHAR_LOGICAL_ORDER_EXCEPTION:
1343     case UCHAR_LOWERCASE:
1344     case UCHAR_MATH:
1345     case UCHAR_NONCHARACTER_CODE_POINT:
1346     case UCHAR_PATTERN_SYNTAX:
1347     case UCHAR_PATTERN_WHITE_SPACE:
1348     case UCHAR_QUOTATION_MARK:
1349     case UCHAR_RADICAL:
1350     case UCHAR_REGIONAL_INDICATOR:
1351     case UCHAR_S_TERM:
1352     case UCHAR_SOFT_DOTTED:
1353     case UCHAR_TERMINAL_PUNCTUATION:
1354     case UCHAR_UNIFIED_IDEOGRAPH:
1355     case UCHAR_UPPERCASE:
1356     case UCHAR_VARIATION_SELECTOR:
1357     case UCHAR_WHITE_SPACE:
1358     case UCHAR_XID_CONTINUE:
1359     case UCHAR_XID_START:
1360       return true;
1361     default:
1362       break;
1363   }
1364   return false;
1365 }
1366 
IsUnicodePropertyValueCharacter(char c)1367 bool IsUnicodePropertyValueCharacter(char c) {
1368   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1369   //
1370   // Note that using this to validate each parsed char is quite conservative.
1371   // A possible alternative solution would be to only ensure the parsed
1372   // property name/value candidate string does not contain '\0' characters and
1373   // let ICU lookups trigger the final failure.
1374   if ('a' <= c && c <= 'z') return true;
1375   if ('A' <= c && c <= 'Z') return true;
1376   if ('0' <= c && c <= '9') return true;
1377   return (c == '_');
1378 }
1379 
1380 }  // anonymous namespace
1381 
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1382 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1383                                           ZoneVector<char>* name_2) {
1384   DCHECK(name_1->empty());
1385   DCHECK(name_2->empty());
1386   // Parse the property class as follows:
1387   // - In \p{name}, 'name' is interpreted
1388   //   - either as a general category property value name.
1389   //   - or as a binary property name.
1390   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1391   //   and 'value' is interpreted as one of the available property value names.
1392   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1393   // - Loose matching is not applied.
1394   if (current() == '{') {
1395     // Parse \p{[PropertyName=]PropertyNameValue}
1396     for (Advance(); current() != '}' && current() != '='; Advance()) {
1397       if (!IsUnicodePropertyValueCharacter(current())) return false;
1398       if (!has_next()) return false;
1399       name_1->push_back(static_cast<char>(current()));
1400     }
1401     if (current() == '=') {
1402       for (Advance(); current() != '}'; Advance()) {
1403         if (!IsUnicodePropertyValueCharacter(current())) return false;
1404         if (!has_next()) return false;
1405         name_2->push_back(static_cast<char>(current()));
1406       }
1407       name_2->push_back(0);  // null-terminate string.
1408     }
1409   } else {
1410     return false;
1411   }
1412   Advance();
1413   name_1->push_back(0);  // null-terminate string.
1414 
1415   DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1416   DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1417   return true;
1418 }
1419 
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1420 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1421                                          bool negate,
1422                                          const ZoneVector<char>& name_1,
1423                                          const ZoneVector<char>& name_2) {
1424   if (name_2.empty()) {
1425     // First attempt to interpret as general category property value name.
1426     const char* name = name_1.data();
1427     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1428                                 add_to, zone())) {
1429       return true;
1430     }
1431     // Interpret "Any", "ASCII", and "Assigned".
1432     if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1433       return true;
1434     }
1435     // Then attempt to interpret as binary property name with value name 'Y'.
1436     UProperty property = u_getPropertyEnum(name);
1437     if (!IsSupportedBinaryProperty(property)) return false;
1438     if (!IsExactPropertyAlias(name, property)) return false;
1439     return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1440                                    zone());
1441   } else {
1442     // Both property name and value name are specified. Attempt to interpret
1443     // the property name as enumerated property.
1444     const char* property_name = name_1.data();
1445     const char* value_name = name_2.data();
1446     UProperty property = u_getPropertyEnum(property_name);
1447     if (!IsExactPropertyAlias(property_name, property)) return false;
1448     if (property == UCHAR_GENERAL_CATEGORY) {
1449       // We want to allow aggregate value names such as "Letter".
1450       property = UCHAR_GENERAL_CATEGORY_MASK;
1451     } else if (property != UCHAR_SCRIPT &&
1452                property != UCHAR_SCRIPT_EXTENSIONS) {
1453       return false;
1454     }
1455     return LookupPropertyValueName(property, value_name, negate, add_to,
1456                                    zone());
1457   }
1458 }
1459 
GetPropertySequence(const ZoneVector<char> & name_1)1460 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
1461   if (!FLAG_harmony_regexp_sequence) return nullptr;
1462   const char* name = name_1.data();
1463   const uc32* sequence_list = nullptr;
1464   JSRegExp::Flags flags = JSRegExp::kUnicode;
1465   if (NameEquals(name, "Emoji_Flag_Sequence")) {
1466     sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1467   } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1468     sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1469   } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1470     sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1471   }
1472   if (sequence_list != nullptr) {
1473     // TODO(yangguo): this creates huge regexp code. Alternative to this is
1474     // to create a new operator that checks for these sequences at runtime.
1475     RegExpBuilder builder(zone(), flags);
1476     while (true) {                   // Iterate through list of sequences.
1477       while (*sequence_list != 0) {  // Iterate through sequence.
1478         builder.AddUnicodeCharacter(*sequence_list);
1479         sequence_list++;
1480       }
1481       sequence_list++;
1482       if (*sequence_list == 0) break;
1483       builder.NewAlternative();
1484     }
1485     return builder.ToRegExp();
1486   }
1487 
1488   if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1489     // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1490     // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1491     RegExpBuilder builder(zone(), flags);
1492     ZoneList<CharacterRange>* prefix_ranges =
1493         zone()->New<ZoneList<CharacterRange>>(2, zone());
1494     prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1495     prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1496     prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1497     builder.AddCharacterClass(
1498         zone()->New<RegExpCharacterClass>(zone(), prefix_ranges, flags));
1499     builder.AddCharacter(0xFE0F);
1500     builder.AddCharacter(0x20E3);
1501     return builder.ToRegExp();
1502   } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1503     // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1504     // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1505     RegExpBuilder builder(zone(), flags);
1506     ZoneList<CharacterRange>* modifier_base_ranges =
1507         zone()->New<ZoneList<CharacterRange>>(2, zone());
1508     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1509                             modifier_base_ranges, zone());
1510     builder.AddCharacterClass(
1511         zone()->New<RegExpCharacterClass>(zone(), modifier_base_ranges, flags));
1512     ZoneList<CharacterRange>* modifier_ranges =
1513         zone()->New<ZoneList<CharacterRange>>(2, zone());
1514     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1515                             zone());
1516     builder.AddCharacterClass(
1517         zone()->New<RegExpCharacterClass>(zone(), modifier_ranges, flags));
1518     return builder.ToRegExp();
1519   }
1520 
1521   return nullptr;
1522 }
1523 
1524 #else  // V8_INTL_SUPPORT
1525 
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1526 bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
1527                                           ZoneVector<char>* name_2) {
1528   return false;
1529 }
1530 
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1531 bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1532                                          bool negate,
1533                                          const ZoneVector<char>& name_1,
1534                                          const ZoneVector<char>& name_2) {
1535   return false;
1536 }
1537 
GetPropertySequence(const ZoneVector<char> & name)1538 RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
1539   return nullptr;
1540 }
1541 
1542 #endif  // V8_INTL_SUPPORT
1543 
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1544 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1545   uc32 x = 0;
1546   int d = HexValue(current());
1547   if (d < 0) {
1548     return false;
1549   }
1550   while (d >= 0) {
1551     x = x * 16 + d;
1552     if (x > static_cast<uc32>(max_value)) {
1553       return false;
1554     }
1555     Advance();
1556     d = HexValue(current());
1557   }
1558   *value = x;
1559   return true;
1560 }
1561 
1562 
ParseClassCharacterEscape()1563 uc32 RegExpParser::ParseClassCharacterEscape() {
1564   DCHECK_EQ('\\', current());
1565   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1566   Advance();
1567   switch (current()) {
1568     case 'b':
1569       Advance();
1570       return '\b';
1571     // ControlEscape :: one of
1572     //   f n r t v
1573     case 'f':
1574       Advance();
1575       return '\f';
1576     case 'n':
1577       Advance();
1578       return '\n';
1579     case 'r':
1580       Advance();
1581       return '\r';
1582     case 't':
1583       Advance();
1584       return '\t';
1585     case 'v':
1586       Advance();
1587       return '\v';
1588     case 'c': {
1589       uc32 controlLetter = Next();
1590       uc32 letter = controlLetter & ~('A' ^ 'a');
1591       // Inside a character class, we also accept digits and underscore as
1592       // control characters, unless with /u. See Annex B:
1593       // ES#prod-annexB-ClassControlLetter
1594       if (letter >= 'A' && letter <= 'Z') {
1595         Advance(2);
1596         // Control letters mapped to ASCII control characters in the range
1597         // 0x00-0x1F.
1598         return controlLetter & 0x1F;
1599       }
1600       if (unicode()) {
1601         // With /u, invalid escapes are not treated as identity escapes.
1602         ReportError(RegExpError::kInvalidClassEscape);
1603         return 0;
1604       }
1605       if ((controlLetter >= '0' && controlLetter <= '9') ||
1606           controlLetter == '_') {
1607         Advance(2);
1608         return controlLetter & 0x1F;
1609       }
1610       // We match JSC in reading the backslash as a literal
1611       // character instead of as starting an escape.
1612       // TODO(v8:6201): Not yet covered by the spec.
1613       return '\\';
1614     }
1615     case '0':
1616       // With /u, \0 is interpreted as NUL if not followed by another digit.
1617       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1618         Advance();
1619         return 0;
1620       }
1621       V8_FALLTHROUGH;
1622     case '1':
1623     case '2':
1624     case '3':
1625     case '4':
1626     case '5':
1627     case '6':
1628     case '7':
1629       // For compatibility, we interpret a decimal escape that isn't
1630       // a back reference (and therefore either \0 or not valid according
1631       // to the specification) as a 1..3 digit octal character code.
1632       // ES#prod-annexB-LegacyOctalEscapeSequence
1633       if (unicode()) {
1634         // With /u, decimal escape is not interpreted as octal character code.
1635         ReportError(RegExpError::kInvalidClassEscape);
1636         return 0;
1637       }
1638       return ParseOctalLiteral();
1639     case 'x': {
1640       Advance();
1641       uc32 value;
1642       if (ParseHexEscape(2, &value)) return value;
1643       if (unicode()) {
1644         // With /u, invalid escapes are not treated as identity escapes.
1645         ReportError(RegExpError::kInvalidEscape);
1646         return 0;
1647       }
1648       // If \x is not followed by a two-digit hexadecimal, treat it
1649       // as an identity escape.
1650       return 'x';
1651     }
1652     case 'u': {
1653       Advance();
1654       uc32 value;
1655       if (ParseUnicodeEscape(&value)) return value;
1656       if (unicode()) {
1657         // With /u, invalid escapes are not treated as identity escapes.
1658         ReportError(RegExpError::kInvalidUnicodeEscape);
1659         return 0;
1660       }
1661       // If \u is not followed by a two-digit hexadecimal, treat it
1662       // as an identity escape.
1663       return 'u';
1664     }
1665     default: {
1666       uc32 result = current();
1667       // With /u, no identity escapes except for syntax characters and '-' are
1668       // allowed. Otherwise, all identity escapes are allowed.
1669       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1670         Advance();
1671         return result;
1672       }
1673       ReportError(RegExpError::kInvalidEscape);
1674       return 0;
1675     }
1676   }
1677   UNREACHABLE();
1678 }
1679 
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,uc32 * char_out,bool * is_class_escape)1680 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1681                                     Zone* zone,
1682                                     bool add_unicode_case_equivalents,
1683                                     uc32* char_out, bool* is_class_escape) {
1684   uc32 current_char = current();
1685   if (current_char == '\\') {
1686     switch (Next()) {
1687       case 'w':
1688       case 'W':
1689       case 'd':
1690       case 'D':
1691       case 's':
1692       case 'S': {
1693         CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1694                                        add_unicode_case_equivalents, zone);
1695         Advance(2);
1696         *is_class_escape = true;
1697         return;
1698       }
1699       case kEndMarker:
1700         ReportError(RegExpError::kEscapeAtEndOfPattern);
1701         return;
1702       case 'p':
1703       case 'P':
1704         if (unicode()) {
1705           bool negate = Next() == 'P';
1706           Advance(2);
1707           ZoneVector<char> name_1(zone);
1708           ZoneVector<char> name_2(zone);
1709           if (!ParsePropertyClassName(&name_1, &name_2) ||
1710               !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1711             ReportError(RegExpError::kInvalidClassPropertyName);
1712           }
1713           *is_class_escape = true;
1714           return;
1715         }
1716         break;
1717       default:
1718         break;
1719     }
1720     *char_out = ParseClassCharacterEscape();
1721     *is_class_escape = false;
1722   } else {
1723     Advance();
1724     *char_out = current_char;
1725     *is_class_escape = false;
1726   }
1727 }
1728 
ParseCharacterClass(const RegExpBuilder * builder)1729 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1730   DCHECK_EQ(current(), '[');
1731   Advance();
1732   bool is_negated = false;
1733   if (current() == '^') {
1734     is_negated = true;
1735     Advance();
1736   }
1737   ZoneList<CharacterRange>* ranges =
1738       zone()->New<ZoneList<CharacterRange>>(2, zone());
1739   bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1740   while (has_more() && current() != ']') {
1741     uc32 char_1, char_2;
1742     bool is_class_1, is_class_2;
1743     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1744                      &is_class_1 CHECK_FAILED);
1745     if (current() == '-') {
1746       Advance();
1747       if (current() == kEndMarker) {
1748         // If we reach the end we break out of the loop and let the
1749         // following code report an error.
1750         break;
1751       } else if (current() == ']') {
1752         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1753         ranges->Add(CharacterRange::Singleton('-'), zone());
1754         break;
1755       }
1756       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1757                        &is_class_2 CHECK_FAILED);
1758       if (is_class_1 || is_class_2) {
1759         // Either end is an escaped character class. Treat the '-' verbatim.
1760         if (unicode()) {
1761           // ES2015 21.2.2.15.1 step 1.
1762           return ReportError(RegExpError::kInvalidCharacterClass);
1763         }
1764         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1765         ranges->Add(CharacterRange::Singleton('-'), zone());
1766         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1767         continue;
1768       }
1769       // ES2015 21.2.2.15.1 step 6.
1770       if (char_1 > char_2) {
1771         return ReportError(RegExpError::kOutOfOrderCharacterClass);
1772       }
1773       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1774     } else {
1775       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1776     }
1777   }
1778   if (!has_more()) {
1779     return ReportError(RegExpError::kUnterminatedCharacterClass);
1780   }
1781   Advance();
1782   RegExpCharacterClass::CharacterClassFlags character_class_flags;
1783   if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1784   return zone()->New<RegExpCharacterClass>(zone(), ranges, builder->flags(),
1785                                            character_class_flags);
1786 }
1787 
1788 
1789 #undef CHECK_FAILED
1790 
Parse(RegExpCompileData * result,const DisallowHeapAllocation &)1791 bool RegExpParser::Parse(RegExpCompileData* result,
1792                          const DisallowHeapAllocation&) {
1793   DCHECK(result != nullptr);
1794   RegExpTree* tree = ParsePattern();
1795   if (failed()) {
1796     DCHECK(tree == nullptr);
1797     DCHECK(error_ != RegExpError::kNone);
1798     result->error = error_;
1799     result->error_pos = error_pos_;
1800   } else {
1801     DCHECK(tree != nullptr);
1802     DCHECK(error_ == RegExpError::kNone);
1803     if (FLAG_trace_regexp_parser) {
1804       StdoutStream os;
1805       tree->Print(os, zone());
1806       os << "\n";
1807     }
1808     result->tree = tree;
1809     int capture_count = captures_started();
1810     result->simple = tree->IsAtom() && simple() && capture_count == 0;
1811     result->contains_anchor = contains_anchor();
1812     result->capture_count = capture_count;
1813   }
1814   return !failed();
1815 }
1816 
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1817 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1818                                FlatStringReader* input, JSRegExp::Flags flags,
1819                                RegExpCompileData* result) {
1820   RegExpParser parser(input, flags, isolate, zone);
1821   bool success;
1822   {
1823     DisallowHeapAllocation no_gc;
1824     success = parser.Parse(result, no_gc);
1825   }
1826   if (success) {
1827     result->capture_name_map = parser.CreateCaptureNameMap();
1828   }
1829   return success;
1830 }
1831 
RegExpBuilder(Zone * zone,JSRegExp::Flags flags)1832 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1833     : zone_(zone),
1834       pending_empty_(false),
1835       flags_(flags),
1836       characters_(nullptr),
1837       pending_surrogate_(kNoPendingSurrogate),
1838       terms_(),
1839       alternatives_()
1840 #ifdef DEBUG
1841       ,
1842       last_added_(ADD_NONE)
1843 #endif
1844 {
1845 }
1846 
1847 
AddLeadSurrogate(uc16 lead_surrogate)1848 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1849   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1850   FlushPendingSurrogate();
1851   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1852   pending_surrogate_ = lead_surrogate;
1853 }
1854 
1855 
AddTrailSurrogate(uc16 trail_surrogate)1856 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1857   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1858   if (pending_surrogate_ != kNoPendingSurrogate) {
1859     uc16 lead_surrogate = pending_surrogate_;
1860     pending_surrogate_ = kNoPendingSurrogate;
1861     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1862     uc32 combined =
1863         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1864     if (NeedsDesugaringForIgnoreCase(combined)) {
1865       AddCharacterClassForDesugaring(combined);
1866     } else {
1867       ZoneList<uc16> surrogate_pair(2, zone());
1868       surrogate_pair.Add(lead_surrogate, zone());
1869       surrogate_pair.Add(trail_surrogate, zone());
1870       RegExpAtom* atom =
1871           zone()->New<RegExpAtom>(surrogate_pair.ToConstVector(), flags_);
1872       AddAtom(atom);
1873     }
1874   } else {
1875     pending_surrogate_ = trail_surrogate;
1876     FlushPendingSurrogate();
1877   }
1878 }
1879 
1880 
FlushPendingSurrogate()1881 void RegExpBuilder::FlushPendingSurrogate() {
1882   if (pending_surrogate_ != kNoPendingSurrogate) {
1883     DCHECK(unicode());
1884     uc32 c = pending_surrogate_;
1885     pending_surrogate_ = kNoPendingSurrogate;
1886     AddCharacterClassForDesugaring(c);
1887   }
1888 }
1889 
1890 
FlushCharacters()1891 void RegExpBuilder::FlushCharacters() {
1892   FlushPendingSurrogate();
1893   pending_empty_ = false;
1894   if (characters_ != nullptr) {
1895     RegExpTree* atom =
1896         zone()->New<RegExpAtom>(characters_->ToConstVector(), flags_);
1897     characters_ = nullptr;
1898     text_.Add(atom, zone());
1899     LAST(ADD_ATOM);
1900   }
1901 }
1902 
1903 
FlushText()1904 void RegExpBuilder::FlushText() {
1905   FlushCharacters();
1906   int num_text = text_.length();
1907   if (num_text == 0) {
1908     return;
1909   } else if (num_text == 1) {
1910     terms_.Add(text_.last(), zone());
1911   } else {
1912     RegExpText* text = zone()->New<RegExpText>(zone());
1913     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1914     terms_.Add(text, zone());
1915   }
1916   text_.Clear();
1917 }
1918 
1919 
AddCharacter(uc16 c)1920 void RegExpBuilder::AddCharacter(uc16 c) {
1921   FlushPendingSurrogate();
1922   pending_empty_ = false;
1923   if (NeedsDesugaringForIgnoreCase(c)) {
1924     AddCharacterClassForDesugaring(c);
1925   } else {
1926     if (characters_ == nullptr) {
1927       characters_ = zone()->New<ZoneList<uc16>>(4, zone());
1928     }
1929     characters_->Add(c, zone());
1930     LAST(ADD_CHAR);
1931   }
1932 }
1933 
1934 
AddUnicodeCharacter(uc32 c)1935 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1936   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1937     DCHECK(unicode());
1938     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1939     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1940   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1941     AddLeadSurrogate(c);
1942   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1943     AddTrailSurrogate(c);
1944   } else {
1945     AddCharacter(static_cast<uc16>(c));
1946   }
1947 }
1948 
AddEscapedUnicodeCharacter(uc32 character)1949 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1950   // A lead or trail surrogate parsed via escape sequence will not
1951   // pair up with any preceding lead or following trail surrogate.
1952   FlushPendingSurrogate();
1953   AddUnicodeCharacter(character);
1954   FlushPendingSurrogate();
1955 }
1956 
AddEmpty()1957 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1958 
1959 
AddCharacterClass(RegExpCharacterClass * cc)1960 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1961   if (NeedsDesugaringForUnicode(cc)) {
1962     // With /u, character class needs to be desugared, so it
1963     // must be a standalone term instead of being part of a RegExpText.
1964     AddTerm(cc);
1965   } else {
1966     AddAtom(cc);
1967   }
1968 }
1969 
AddCharacterClassForDesugaring(uc32 c)1970 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1971   AddTerm(zone()->New<RegExpCharacterClass>(
1972       zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1973       flags_));
1974 }
1975 
1976 
AddAtom(RegExpTree * term)1977 void RegExpBuilder::AddAtom(RegExpTree* term) {
1978   if (term->IsEmpty()) {
1979     AddEmpty();
1980     return;
1981   }
1982   if (term->IsTextElement()) {
1983     FlushCharacters();
1984     text_.Add(term, zone());
1985   } else {
1986     FlushText();
1987     terms_.Add(term, zone());
1988   }
1989   LAST(ADD_ATOM);
1990 }
1991 
1992 
AddTerm(RegExpTree * term)1993 void RegExpBuilder::AddTerm(RegExpTree* term) {
1994   FlushText();
1995   terms_.Add(term, zone());
1996   LAST(ADD_ATOM);
1997 }
1998 
1999 
AddAssertion(RegExpTree * assert)2000 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
2001   FlushText();
2002   terms_.Add(assert, zone());
2003   LAST(ADD_ASSERT);
2004 }
2005 
2006 
NewAlternative()2007 void RegExpBuilder::NewAlternative() { FlushTerms(); }
2008 
2009 
FlushTerms()2010 void RegExpBuilder::FlushTerms() {
2011   FlushText();
2012   int num_terms = terms_.length();
2013   RegExpTree* alternative;
2014   if (num_terms == 0) {
2015     alternative = zone()->New<RegExpEmpty>();
2016   } else if (num_terms == 1) {
2017     alternative = terms_.last();
2018   } else {
2019     alternative = zone()->New<RegExpAlternative>(terms_.GetList(zone()));
2020   }
2021   alternatives_.Add(alternative, zone());
2022   terms_.Clear();
2023   LAST(ADD_NONE);
2024 }
2025 
2026 
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)2027 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2028   if (!unicode()) return false;
2029   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2030   // necessarily mean that we need to desugar. It's probably nicer to have a
2031   // separate pass to figure out unicode desugarings.
2032   if (ignore_case()) return true;
2033   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2034   CharacterRange::Canonicalize(ranges);
2035   for (int i = ranges->length() - 1; i >= 0; i--) {
2036     uc32 from = ranges->at(i).from();
2037     uc32 to = ranges->at(i).to();
2038     // Check for non-BMP characters.
2039     if (to >= kNonBmpStart) return true;
2040     // Check for lone surrogates.
2041     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2042   }
2043   return false;
2044 }
2045 
2046 
NeedsDesugaringForIgnoreCase(uc32 c)2047 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2048 #ifdef V8_INTL_SUPPORT
2049   if (unicode() && ignore_case()) {
2050     icu::UnicodeSet set(c, c);
2051     set.closeOver(USET_CASE_INSENSITIVE);
2052     set.removeAllStrings();
2053     return set.size() > 1;
2054   }
2055   // In the case where ICU is not included, we act as if the unicode flag is
2056   // not set, and do not desugar.
2057 #endif  // V8_INTL_SUPPORT
2058   return false;
2059 }
2060 
2061 
ToRegExp()2062 RegExpTree* RegExpBuilder::ToRegExp() {
2063   FlushTerms();
2064   int num_alternatives = alternatives_.length();
2065   if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
2066   if (num_alternatives == 1) return alternatives_.last();
2067   return zone()->New<RegExpDisjunction>(alternatives_.GetList(zone()));
2068 }
2069 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)2070 bool RegExpBuilder::AddQuantifierToAtom(
2071     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2072   FlushPendingSurrogate();
2073   if (pending_empty_) {
2074     pending_empty_ = false;
2075     return true;
2076   }
2077   RegExpTree* atom;
2078   if (characters_ != nullptr) {
2079     DCHECK(last_added_ == ADD_CHAR);
2080     // Last atom was character.
2081     Vector<const uc16> char_vector = characters_->ToConstVector();
2082     int num_chars = char_vector.length();
2083     if (num_chars > 1) {
2084       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2085       text_.Add(zone()->New<RegExpAtom>(prefix, flags_), zone());
2086       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2087     }
2088     characters_ = nullptr;
2089     atom = zone()->New<RegExpAtom>(char_vector, flags_);
2090     FlushText();
2091   } else if (text_.length() > 0) {
2092     DCHECK(last_added_ == ADD_ATOM);
2093     atom = text_.RemoveLast();
2094     FlushText();
2095   } else if (terms_.length() > 0) {
2096     DCHECK(last_added_ == ADD_ATOM);
2097     atom = terms_.RemoveLast();
2098     if (atom->IsLookaround()) {
2099       // With /u, lookarounds are not quantifiable.
2100       if (unicode()) return false;
2101       // Lookbehinds are not quantifiable.
2102       if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2103         return false;
2104       }
2105     }
2106     if (atom->max_match() == 0) {
2107       // Guaranteed to only match an empty string.
2108       LAST(ADD_TERM);
2109       if (min == 0) {
2110         return true;
2111       }
2112       terms_.Add(atom, zone());
2113       return true;
2114     }
2115   } else {
2116     // Only call immediately after adding an atom or character!
2117     UNREACHABLE();
2118   }
2119   terms_.Add(zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom),
2120              zone());
2121   LAST(ADD_TERM);
2122   return true;
2123 }
2124 
2125 }  // namespace internal
2126 }  // namespace v8
2127