• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include <vector>
8 
9 #include "src/char-predicates-inl.h"
10 #include "src/heap/factory.h"
11 #include "src/isolate.h"
12 #include "src/objects-inl.h"
13 #include "src/ostreams.h"
14 #include "src/regexp/jsregexp.h"
15 #include "src/utils.h"
16 
17 #ifdef V8_INTL_SUPPORT
18 #include "unicode/uniset.h"
19 // TODO(mathias): Remove this when we no longer need to check
20 // `U_ICU_VERSION_MAJOR_NUM`.
21 #include "unicode/uvernum.h"
22 #endif  // V8_INTL_SUPPORT
23 
24 namespace v8 {
25 namespace internal {
26 
RegExpParser(FlatStringReader * in,Handle<String> * error,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)27 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
28                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
29     : isolate_(isolate),
30       zone_(zone),
31       error_(error),
32       captures_(nullptr),
33       named_captures_(nullptr),
34       named_back_references_(nullptr),
35       in_(in),
36       current_(kEndMarker),
37       top_level_flags_(flags),
38       next_pos_(0),
39       captures_started_(0),
40       capture_count_(0),
41       has_more_(true),
42       simple_(false),
43       contains_anchor_(false),
44       is_scanned_for_captures_(false),
45       has_named_captures_(false),
46       failed_(false) {
47   Advance();
48 }
49 
50 template <bool update_position>
ReadNext()51 inline uc32 RegExpParser::ReadNext() {
52   int position = next_pos_;
53   uc32 c0 = in()->Get(position);
54   position++;
55   // Read the whole surrogate pair in case of unicode flag, if possible.
56   if (unicode() && position < in()->length() &&
57       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
58     uc16 c1 = in()->Get(position);
59     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
60       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
61       position++;
62     }
63   }
64   if (update_position) next_pos_ = position;
65   return c0;
66 }
67 
68 
Next()69 uc32 RegExpParser::Next() {
70   if (has_next()) {
71     return ReadNext<false>();
72   } else {
73     return kEndMarker;
74   }
75 }
76 
Advance()77 void RegExpParser::Advance() {
78   if (has_next()) {
79     StackLimitCheck check(isolate());
80     if (check.HasOverflowed()) {
81       if (FLAG_abort_on_stack_or_string_length_overflow) {
82         FATAL("Aborting on stack overflow");
83       }
84       ReportError(CStrVector(
85           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
86     } else if (zone()->excess_allocation()) {
87       ReportError(CStrVector("Regular expression too large"));
88     } else {
89       current_ = ReadNext<true>();
90     }
91   } else {
92     current_ = kEndMarker;
93     // Advance so that position() points to 1-after-the-last-character. This is
94     // important so that Reset() to this position works correctly.
95     next_pos_ = in()->length() + 1;
96     has_more_ = false;
97   }
98 }
99 
100 
Reset(int pos)101 void RegExpParser::Reset(int pos) {
102   next_pos_ = pos;
103   has_more_ = (pos < in()->length());
104   Advance();
105 }
106 
Advance(int dist)107 void RegExpParser::Advance(int dist) {
108   next_pos_ += dist - 1;
109   Advance();
110 }
111 
112 
simple()113 bool RegExpParser::simple() { return simple_; }
114 
IsSyntaxCharacterOrSlash(uc32 c)115 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
116   switch (c) {
117     case '^':
118     case '$':
119     case '\\':
120     case '.':
121     case '*':
122     case '+':
123     case '?':
124     case '(':
125     case ')':
126     case '[':
127     case ']':
128     case '{':
129     case '}':
130     case '|':
131     case '/':
132       return true;
133     default:
134       break;
135   }
136   return false;
137 }
138 
139 
ReportError(Vector<const char> message)140 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
141   if (failed_) return nullptr;  // Do not overwrite any existing error.
142   failed_ = true;
143   *error_ = isolate()
144                 ->factory()
145                 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
146                 .ToHandleChecked();
147   // Zip to the end to make sure the no more input is read.
148   current_ = kEndMarker;
149   next_pos_ = in()->length();
150   return nullptr;
151 }
152 
153 #define CHECK_FAILED /**/);    \
154   if (failed_) return nullptr; \
155   ((void)0
156 
157 // Pattern ::
158 //   Disjunction
ParsePattern()159 RegExpTree* RegExpParser::ParsePattern() {
160   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
161   PatchNamedBackReferences(CHECK_FAILED);
162   DCHECK(!has_more());
163   // If the result of parsing is a literal string atom, and it has the
164   // same length as the input, then the atom is identical to the input.
165   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
166     simple_ = true;
167   }
168   return result;
169 }
170 
171 
172 // Disjunction ::
173 //   Alternative
174 //   Alternative | Disjunction
175 // Alternative ::
176 //   [empty]
177 //   Term Alternative
178 // Term ::
179 //   Assertion
180 //   Atom
181 //   Atom Quantifier
ParseDisjunction()182 RegExpTree* RegExpParser::ParseDisjunction() {
183   // Used to store current state while parsing subexpressions.
184   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
185                                   0, nullptr, top_level_flags_, zone());
186   RegExpParserState* state = &initial_state;
187   // Cache the builder in a local variable for quick access.
188   RegExpBuilder* builder = initial_state.builder();
189   while (true) {
190     switch (current()) {
191       case kEndMarker:
192         if (state->IsSubexpression()) {
193           // Inside a parenthesized group when hitting end of input.
194           return ReportError(CStrVector("Unterminated group"));
195         }
196         DCHECK_EQ(INITIAL, state->group_type());
197         // Parsing completed successfully.
198         return builder->ToRegExp();
199       case ')': {
200         if (!state->IsSubexpression()) {
201           return ReportError(CStrVector("Unmatched ')'"));
202         }
203         DCHECK_NE(INITIAL, state->group_type());
204 
205         Advance();
206         // End disjunction parsing and convert builder content to new single
207         // regexp atom.
208         RegExpTree* body = builder->ToRegExp();
209 
210         int end_capture_index = captures_started();
211 
212         int capture_index = state->capture_index();
213         SubexpressionType group_type = state->group_type();
214 
215         // Build result of subexpression.
216         if (group_type == CAPTURE) {
217           if (state->IsNamedCapture()) {
218             CreateNamedCaptureAtIndex(state->capture_name(),
219                                       capture_index CHECK_FAILED);
220           }
221           RegExpCapture* capture = GetCapture(capture_index);
222           capture->set_body(body);
223           body = capture;
224         } else if (group_type == GROUPING) {
225           body = new (zone()) RegExpGroup(body);
226         } else {
227           DCHECK(group_type == POSITIVE_LOOKAROUND ||
228                  group_type == NEGATIVE_LOOKAROUND);
229           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
230           body = new (zone()) RegExpLookaround(
231               body, is_positive, end_capture_index - capture_index,
232               capture_index, state->lookaround_type());
233         }
234 
235         // Restore previous state.
236         state = state->previous_state();
237         builder = state->builder();
238 
239         builder->AddAtom(body);
240         // For compatibility with JSC and ES3, we allow quantifiers after
241         // lookaheads, and break in all cases.
242         break;
243       }
244       case '|': {
245         Advance();
246         builder->NewAlternative();
247         continue;
248       }
249       case '*':
250       case '+':
251       case '?':
252         return ReportError(CStrVector("Nothing to repeat"));
253       case '^': {
254         Advance();
255         if (builder->multiline()) {
256           builder->AddAssertion(new (zone()) RegExpAssertion(
257               RegExpAssertion::START_OF_LINE, builder->flags()));
258         } else {
259           builder->AddAssertion(new (zone()) RegExpAssertion(
260               RegExpAssertion::START_OF_INPUT, builder->flags()));
261           set_contains_anchor();
262         }
263         continue;
264       }
265       case '$': {
266         Advance();
267         RegExpAssertion::AssertionType assertion_type =
268             builder->multiline() ? RegExpAssertion::END_OF_LINE
269                                  : RegExpAssertion::END_OF_INPUT;
270         builder->AddAssertion(
271             new (zone()) RegExpAssertion(assertion_type, builder->flags()));
272         continue;
273       }
274       case '.': {
275         Advance();
276         ZoneList<CharacterRange>* ranges =
277             new (zone()) ZoneList<CharacterRange>(2, zone());
278 
279         if (builder->dotall()) {
280           // Everything.
281           CharacterRange::AddClassEscape('*', ranges, false, zone());
282         } else {
283           // Everything except \x0A, \x0D, \u2028 and \u2029
284           CharacterRange::AddClassEscape('.', ranges, false, zone());
285         }
286 
287         RegExpCharacterClass* cc =
288             new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
289         builder->AddCharacterClass(cc);
290         break;
291       }
292       case '(': {
293         state = ParseOpenParenthesis(state CHECK_FAILED);
294         builder = state->builder();
295         continue;
296       }
297       case '[': {
298         RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
299         builder->AddCharacterClass(cc->AsCharacterClass());
300         break;
301       }
302       // Atom ::
303       //   \ AtomEscape
304       case '\\':
305         switch (Next()) {
306           case kEndMarker:
307             return ReportError(CStrVector("\\ at end of pattern"));
308           case 'b':
309             Advance(2);
310             builder->AddAssertion(new (zone()) RegExpAssertion(
311                 RegExpAssertion::BOUNDARY, builder->flags()));
312             continue;
313           case 'B':
314             Advance(2);
315             builder->AddAssertion(new (zone()) RegExpAssertion(
316                 RegExpAssertion::NON_BOUNDARY, builder->flags()));
317             continue;
318           // AtomEscape ::
319           //   CharacterClassEscape
320           //
321           // CharacterClassEscape :: one of
322           //   d D s S w W
323           case 'd':
324           case 'D':
325           case 's':
326           case 'S':
327           case 'w':
328           case 'W': {
329             uc32 c = Next();
330             Advance(2);
331             ZoneList<CharacterRange>* ranges =
332                 new (zone()) ZoneList<CharacterRange>(2, zone());
333             CharacterRange::AddClassEscape(
334                 c, ranges, unicode() && builder->ignore_case(), zone());
335             RegExpCharacterClass* cc = new (zone())
336                 RegExpCharacterClass(zone(), ranges, builder->flags());
337             builder->AddCharacterClass(cc);
338             break;
339           }
340           case 'p':
341           case 'P': {
342             uc32 p = Next();
343             Advance(2);
344             if (unicode()) {
345               ZoneList<CharacterRange>* ranges =
346                   new (zone()) ZoneList<CharacterRange>(2, zone());
347               if (!ParsePropertyClass(ranges, p == 'P')) {
348                 return ReportError(CStrVector("Invalid property name"));
349               }
350               RegExpCharacterClass* cc = new (zone())
351                   RegExpCharacterClass(zone(), ranges, builder->flags());
352               builder->AddCharacterClass(cc);
353 
354             } else {
355               builder->AddCharacter(p);
356             }
357             break;
358           }
359           case '1':
360           case '2':
361           case '3':
362           case '4':
363           case '5':
364           case '6':
365           case '7':
366           case '8':
367           case '9': {
368             int index = 0;
369             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
370             if (is_backref) {
371               if (state->IsInsideCaptureGroup(index)) {
372                 // The back reference is inside the capture group it refers to.
373                 // Nothing can possibly have been captured yet, so we use empty
374                 // instead. This ensures that, when checking a back reference,
375                 // the capture registers of the referenced capture are either
376                 // both set or both cleared.
377                 builder->AddEmpty();
378               } else {
379                 RegExpCapture* capture = GetCapture(index);
380                 RegExpTree* atom =
381                     new (zone()) RegExpBackReference(capture, builder->flags());
382                 builder->AddAtom(atom);
383               }
384               break;
385             }
386             // With /u, no identity escapes except for syntax characters
387             // are allowed. Otherwise, all identity escapes are allowed.
388             if (unicode()) {
389               return ReportError(CStrVector("Invalid escape"));
390             }
391             uc32 first_digit = Next();
392             if (first_digit == '8' || first_digit == '9') {
393               builder->AddCharacter(first_digit);
394               Advance(2);
395               break;
396             }
397             V8_FALLTHROUGH;
398           }
399           case '0': {
400             Advance();
401             if (unicode() && Next() >= '0' && Next() <= '9') {
402               // With /u, decimal escape with leading 0 are not parsed as octal.
403               return ReportError(CStrVector("Invalid decimal escape"));
404             }
405             uc32 octal = ParseOctalLiteral();
406             builder->AddCharacter(octal);
407             break;
408           }
409           // ControlEscape :: one of
410           //   f n r t v
411           case 'f':
412             Advance(2);
413             builder->AddCharacter('\f');
414             break;
415           case 'n':
416             Advance(2);
417             builder->AddCharacter('\n');
418             break;
419           case 'r':
420             Advance(2);
421             builder->AddCharacter('\r');
422             break;
423           case 't':
424             Advance(2);
425             builder->AddCharacter('\t');
426             break;
427           case 'v':
428             Advance(2);
429             builder->AddCharacter('\v');
430             break;
431           case 'c': {
432             Advance();
433             uc32 controlLetter = Next();
434             // Special case if it is an ASCII letter.
435             // Convert lower case letters to uppercase.
436             uc32 letter = controlLetter & ~('a' ^ 'A');
437             if (letter < 'A' || 'Z' < letter) {
438               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
439               // Read the backslash as a literal character instead of as
440               // starting an escape.
441               // ES#prod-annexB-ExtendedPatternCharacter
442               if (unicode()) {
443                 // With /u, invalid escapes are not treated as identity escapes.
444                 return ReportError(CStrVector("Invalid unicode escape"));
445               }
446               builder->AddCharacter('\\');
447             } else {
448               Advance(2);
449               builder->AddCharacter(controlLetter & 0x1F);
450             }
451             break;
452           }
453           case 'x': {
454             Advance(2);
455             uc32 value;
456             if (ParseHexEscape(2, &value)) {
457               builder->AddCharacter(value);
458             } else if (!unicode()) {
459               builder->AddCharacter('x');
460             } else {
461               // With /u, invalid escapes are not treated as identity escapes.
462               return ReportError(CStrVector("Invalid escape"));
463             }
464             break;
465           }
466           case 'u': {
467             Advance(2);
468             uc32 value;
469             if (ParseUnicodeEscape(&value)) {
470               builder->AddEscapedUnicodeCharacter(value);
471             } else if (!unicode()) {
472               builder->AddCharacter('u');
473             } else {
474               // With /u, invalid escapes are not treated as identity escapes.
475               return ReportError(CStrVector("Invalid Unicode escape"));
476             }
477             break;
478           }
479           case 'k':
480             // Either an identity escape or a named back-reference.  The two
481             // interpretations are mutually exclusive: '\k' is interpreted as
482             // an identity escape for non-Unicode patterns without named
483             // capture groups, and as the beginning of a named back-reference
484             // in all other cases.
485             if (unicode() || HasNamedCaptures()) {
486               Advance(2);
487               ParseNamedBackReference(builder, state CHECK_FAILED);
488               break;
489             }
490             V8_FALLTHROUGH;
491           default:
492             Advance();
493             // With /u, no identity escapes except for syntax characters
494             // are allowed. Otherwise, all identity escapes are allowed.
495             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
496               builder->AddCharacter(current());
497               Advance();
498             } else {
499               return ReportError(CStrVector("Invalid escape"));
500             }
501             break;
502         }
503         break;
504       case '{': {
505         int dummy;
506         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
507         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
508         V8_FALLTHROUGH;
509       }
510       case '}':
511       case ']':
512         if (unicode()) {
513           return ReportError(CStrVector("Lone quantifier brackets"));
514         }
515         V8_FALLTHROUGH;
516       default:
517         builder->AddUnicodeCharacter(current());
518         Advance();
519         break;
520     }  // end switch(current())
521 
522     int min;
523     int max;
524     switch (current()) {
525       // QuantifierPrefix ::
526       //   *
527       //   +
528       //   ?
529       //   {
530       case '*':
531         min = 0;
532         max = RegExpTree::kInfinity;
533         Advance();
534         break;
535       case '+':
536         min = 1;
537         max = RegExpTree::kInfinity;
538         Advance();
539         break;
540       case '?':
541         min = 0;
542         max = 1;
543         Advance();
544         break;
545       case '{':
546         if (ParseIntervalQuantifier(&min, &max)) {
547           if (max < min) {
548             return ReportError(
549                 CStrVector("numbers out of order in {} quantifier"));
550           }
551           break;
552         } else if (unicode()) {
553           // With /u, incomplete quantifiers are not allowed.
554           return ReportError(CStrVector("Incomplete quantifier"));
555         }
556         continue;
557       default:
558         continue;
559     }
560     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
561     if (current() == '?') {
562       quantifier_type = RegExpQuantifier::NON_GREEDY;
563       Advance();
564     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
565       // FLAG_regexp_possessive_quantifier is a debug-only flag.
566       quantifier_type = RegExpQuantifier::POSSESSIVE;
567       Advance();
568     }
569     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
570       return ReportError(CStrVector("Invalid quantifier"));
571     }
572   }
573 }
574 
ParseOpenParenthesis(RegExpParserState * state)575 RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
576     RegExpParserState* state) {
577   RegExpLookaround::Type lookaround_type = state->lookaround_type();
578   bool is_named_capture = false;
579   JSRegExp::Flags switch_on = JSRegExp::kNone;
580   JSRegExp::Flags switch_off = JSRegExp::kNone;
581   const ZoneVector<uc16>* capture_name = nullptr;
582   SubexpressionType subexpr_type = CAPTURE;
583   Advance();
584   if (current() == '?') {
585     switch (Next()) {
586       case ':':
587         Advance(2);
588         subexpr_type = GROUPING;
589         break;
590       case '=':
591         Advance(2);
592         lookaround_type = RegExpLookaround::LOOKAHEAD;
593         subexpr_type = POSITIVE_LOOKAROUND;
594         break;
595       case '!':
596         Advance(2);
597         lookaround_type = RegExpLookaround::LOOKAHEAD;
598         subexpr_type = NEGATIVE_LOOKAROUND;
599         break;
600       case '-':
601       case 'i':
602       case 's':
603       case 'm': {
604         if (!FLAG_regexp_mode_modifiers) {
605           ReportError(CStrVector("Invalid group"));
606           return nullptr;
607         }
608         Advance();
609         bool flags_sense = true;  // Switching on flags.
610         while (subexpr_type != GROUPING) {
611           switch (current()) {
612             case '-':
613               if (!flags_sense) {
614                 ReportError(CStrVector("Multiple dashes in flag group"));
615                 return nullptr;
616               }
617               flags_sense = false;
618               Advance();
619               continue;
620             case 's':
621             case 'i':
622             case 'm': {
623               JSRegExp::Flags bit = JSRegExp::kUnicode;
624               if (current() == 'i') bit = JSRegExp::kIgnoreCase;
625               if (current() == 'm') bit = JSRegExp::kMultiline;
626               if (current() == 's') bit = JSRegExp::kDotAll;
627               if (((switch_on | switch_off) & bit) != 0) {
628                 ReportError(CStrVector("Repeated flag in flag group"));
629                 return nullptr;
630               }
631               if (flags_sense) {
632                 switch_on |= bit;
633               } else {
634                 switch_off |= bit;
635               }
636               Advance();
637               continue;
638             }
639             case ')': {
640               Advance();
641               state->builder()
642                   ->FlushText();  // Flush pending text using old flags.
643               // These (?i)-style flag switches don't put us in a subexpression
644               // at all, they just modify the flags in the rest of the current
645               // subexpression.
646               JSRegExp::Flags flags =
647                   (state->builder()->flags() | switch_on) & ~switch_off;
648               state->builder()->set_flags(flags);
649               return state;
650             }
651             case ':':
652               Advance();
653               subexpr_type = GROUPING;  // Will break us out of the outer loop.
654               continue;
655             default:
656               ReportError(CStrVector("Invalid flag group"));
657               return nullptr;
658           }
659         }
660         break;
661       }
662       case '<':
663         Advance();
664         if (Next() == '=') {
665           Advance(2);
666           lookaround_type = RegExpLookaround::LOOKBEHIND;
667           subexpr_type = POSITIVE_LOOKAROUND;
668           break;
669         } else if (Next() == '!') {
670           Advance(2);
671           lookaround_type = RegExpLookaround::LOOKBEHIND;
672           subexpr_type = NEGATIVE_LOOKAROUND;
673           break;
674         }
675         is_named_capture = true;
676         has_named_captures_ = true;
677         Advance();
678         break;
679       default:
680         ReportError(CStrVector("Invalid group"));
681         return nullptr;
682     }
683   }
684   if (subexpr_type == CAPTURE) {
685     if (captures_started_ >= kMaxCaptures) {
686       ReportError(CStrVector("Too many captures"));
687       return nullptr;
688     }
689     captures_started_++;
690 
691     if (is_named_capture) {
692       capture_name = ParseCaptureGroupName(CHECK_FAILED);
693     }
694   }
695   JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
696   // Store current state and begin new disjunction parsing.
697   return new (zone())
698       RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
699                         capture_name, flags, zone());
700 }
701 
702 #ifdef DEBUG
703 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)704 static bool IsSpecialClassEscape(uc32 c) {
705   switch (c) {
706     case 'd':
707     case 'D':
708     case 's':
709     case 'S':
710     case 'w':
711     case 'W':
712       return true;
713     default:
714       return false;
715   }
716 }
717 #endif
718 
719 
720 // In order to know whether an escape is a backreference or not we have to scan
721 // the entire regexp and find the number of capturing parentheses.  However we
722 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
723 // is called when needed.  It can see the difference between capturing and
724 // noncapturing parentheses and can skip character classes and backslash-escaped
725 // characters.
ScanForCaptures()726 void RegExpParser::ScanForCaptures() {
727   DCHECK(!is_scanned_for_captures_);
728   const int saved_position = position();
729   // Start with captures started previous to current position
730   int capture_count = captures_started();
731   // Add count of captures after this position.
732   int n;
733   while ((n = current()) != kEndMarker) {
734     Advance();
735     switch (n) {
736       case '\\':
737         Advance();
738         break;
739       case '[': {
740         int c;
741         while ((c = current()) != kEndMarker) {
742           Advance();
743           if (c == '\\') {
744             Advance();
745           } else {
746             if (c == ']') break;
747           }
748         }
749         break;
750       }
751       case '(':
752         if (current() == '?') {
753           // At this point we could be in
754           // * a non-capturing group '(:',
755           // * a lookbehind assertion '(?<=' '(?<!'
756           // * or a named capture '(?<'.
757           //
758           // Of these, only named captures are capturing groups.
759 
760           Advance();
761           if (current() != '<') break;
762 
763           Advance();
764           if (current() == '=' || current() == '!') break;
765 
766           // Found a possible named capture. It could turn out to be a syntax
767           // error (e.g. an unterminated or invalid name), but that distinction
768           // does not matter for our purposes.
769           has_named_captures_ = true;
770         }
771         capture_count++;
772         break;
773     }
774   }
775   capture_count_ = capture_count;
776   is_scanned_for_captures_ = true;
777   Reset(saved_position);
778 }
779 
780 
ParseBackReferenceIndex(int * index_out)781 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
782   DCHECK_EQ('\\', current());
783   DCHECK('1' <= Next() && Next() <= '9');
784   // Try to parse a decimal literal that is no greater than the total number
785   // of left capturing parentheses in the input.
786   int start = position();
787   int value = Next() - '0';
788   Advance(2);
789   while (true) {
790     uc32 c = current();
791     if (IsDecimalDigit(c)) {
792       value = 10 * value + (c - '0');
793       if (value > kMaxCaptures) {
794         Reset(start);
795         return false;
796       }
797       Advance();
798     } else {
799       break;
800     }
801   }
802   if (value > captures_started()) {
803     if (!is_scanned_for_captures_) ScanForCaptures();
804     if (value > capture_count_) {
805       Reset(start);
806       return false;
807     }
808   }
809   *index_out = value;
810   return true;
811 }
812 
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)813 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
814   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
815     v->push_back(code_unit);
816   } else {
817     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
818     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
819   }
820 }
821 
ParseCaptureGroupName()822 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
823   ZoneVector<uc16>* name =
824       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
825 
826   bool at_start = true;
827   while (true) {
828     uc32 c = current();
829     Advance();
830 
831     // Convert unicode escapes.
832     if (c == '\\' && current() == 'u') {
833       Advance();
834       if (!ParseUnicodeEscape(&c)) {
835         ReportError(CStrVector("Invalid Unicode escape sequence"));
836         return nullptr;
837       }
838     }
839 
840     // The backslash char is misclassified as both ID_Start and ID_Continue.
841     if (c == '\\') {
842       ReportError(CStrVector("Invalid capture group name"));
843       return nullptr;
844     }
845 
846     if (at_start) {
847       if (!IdentifierStart::Is(c)) {
848         ReportError(CStrVector("Invalid capture group name"));
849         return nullptr;
850       }
851       push_code_unit(name, c);
852       at_start = false;
853     } else {
854       if (c == '>') {
855         break;
856       } else if (IdentifierPart::Is(c)) {
857         push_code_unit(name, c);
858       } else {
859         ReportError(CStrVector("Invalid capture group name"));
860         return nullptr;
861       }
862     }
863   }
864 
865   return name;
866 }
867 
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)868 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
869                                              int index) {
870   DCHECK(0 < index && index <= captures_started_);
871   DCHECK_NOT_NULL(name);
872 
873   if (named_captures_ == nullptr) {
874     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
875   } else {
876     // Check for duplicates and bail if we find any.
877     // TODO(jgruber): O(n^2).
878     for (const auto& named_capture : *named_captures_) {
879       if (*named_capture->name() == *name) {
880         ReportError(CStrVector("Duplicate capture group name"));
881         return false;
882       }
883     }
884   }
885 
886   RegExpCapture* capture = GetCapture(index);
887   DCHECK_NULL(capture->name());
888 
889   capture->set_name(name);
890   named_captures_->Add(capture, zone());
891 
892   return true;
893 }
894 
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)895 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
896                                            RegExpParserState* state) {
897   // The parser is assumed to be on the '<' in \k<name>.
898   if (current() != '<') {
899     ReportError(CStrVector("Invalid named reference"));
900     return false;
901   }
902 
903   Advance();
904   const ZoneVector<uc16>* name = ParseCaptureGroupName();
905   if (name == nullptr) {
906     return false;
907   }
908 
909   if (state->IsInsideCaptureGroup(name)) {
910     builder->AddEmpty();
911   } else {
912     RegExpBackReference* atom =
913         new (zone()) RegExpBackReference(builder->flags());
914     atom->set_name(name);
915 
916     builder->AddAtom(atom);
917 
918     if (named_back_references_ == nullptr) {
919       named_back_references_ =
920           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
921     }
922     named_back_references_->Add(atom, zone());
923   }
924 
925   return true;
926 }
927 
PatchNamedBackReferences()928 void RegExpParser::PatchNamedBackReferences() {
929   if (named_back_references_ == nullptr) return;
930 
931   if (named_captures_ == nullptr) {
932     ReportError(CStrVector("Invalid named capture referenced"));
933     return;
934   }
935 
936   // Look up and patch the actual capture for each named back reference.
937   // TODO(jgruber): O(n^2), optimize if necessary.
938 
939   for (int i = 0; i < named_back_references_->length(); i++) {
940     RegExpBackReference* ref = named_back_references_->at(i);
941 
942     int index = -1;
943     for (const auto& capture : *named_captures_) {
944       if (*capture->name() == *ref->name()) {
945         index = capture->index();
946         break;
947       }
948     }
949 
950     if (index == -1) {
951       ReportError(CStrVector("Invalid named capture referenced"));
952       return;
953     }
954 
955     ref->set_capture(GetCapture(index));
956   }
957 }
958 
GetCapture(int index)959 RegExpCapture* RegExpParser::GetCapture(int index) {
960   // The index for the capture groups are one-based. Its index in the list is
961   // zero-based.
962   int know_captures =
963       is_scanned_for_captures_ ? capture_count_ : captures_started_;
964   DCHECK(index <= know_captures);
965   if (captures_ == nullptr) {
966     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
967   }
968   while (captures_->length() < know_captures) {
969     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
970   }
971   return captures_->at(index - 1);
972 }
973 
CreateCaptureNameMap()974 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
975   if (named_captures_ == nullptr || named_captures_->is_empty())
976     return Handle<FixedArray>();
977 
978   Factory* factory = isolate()->factory();
979 
980   int len = named_captures_->length() * 2;
981   Handle<FixedArray> array = factory->NewFixedArray(len);
982 
983   for (int i = 0; i < named_captures_->length(); i++) {
984     RegExpCapture* capture = named_captures_->at(i);
985     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
986     array->set(i * 2, *name.ToHandleChecked());
987     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
988   }
989 
990   return array;
991 }
992 
HasNamedCaptures()993 bool RegExpParser::HasNamedCaptures() {
994   if (has_named_captures_ || is_scanned_for_captures_) {
995     return has_named_captures_;
996   }
997 
998   ScanForCaptures();
999   DCHECK(is_scanned_for_captures_);
1000   return has_named_captures_;
1001 }
1002 
IsInsideCaptureGroup(int index)1003 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1004   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1005     if (s->group_type() != CAPTURE) continue;
1006     // Return true if we found the matching capture index.
1007     if (index == s->capture_index()) return true;
1008     // Abort if index is larger than what has been parsed up till this state.
1009     if (index > s->capture_index()) return false;
1010   }
1011   return false;
1012 }
1013 
IsInsideCaptureGroup(const ZoneVector<uc16> * name)1014 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1015     const ZoneVector<uc16>* name) {
1016   DCHECK_NOT_NULL(name);
1017   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1018     if (s->capture_name() == nullptr) continue;
1019     if (*s->capture_name() == *name) return true;
1020   }
1021   return false;
1022 }
1023 
1024 // QuantifierPrefix ::
1025 //   { DecimalDigits }
1026 //   { DecimalDigits , }
1027 //   { DecimalDigits , DecimalDigits }
1028 //
1029 // Returns true if parsing succeeds, and set the min_out and max_out
1030 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)1031 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1032   DCHECK_EQ(current(), '{');
1033   int start = position();
1034   Advance();
1035   int min = 0;
1036   if (!IsDecimalDigit(current())) {
1037     Reset(start);
1038     return false;
1039   }
1040   while (IsDecimalDigit(current())) {
1041     int next = current() - '0';
1042     if (min > (RegExpTree::kInfinity - next) / 10) {
1043       // Overflow. Skip past remaining decimal digits and return -1.
1044       do {
1045         Advance();
1046       } while (IsDecimalDigit(current()));
1047       min = RegExpTree::kInfinity;
1048       break;
1049     }
1050     min = 10 * min + next;
1051     Advance();
1052   }
1053   int max = 0;
1054   if (current() == '}') {
1055     max = min;
1056     Advance();
1057   } else if (current() == ',') {
1058     Advance();
1059     if (current() == '}') {
1060       max = RegExpTree::kInfinity;
1061       Advance();
1062     } else {
1063       while (IsDecimalDigit(current())) {
1064         int next = current() - '0';
1065         if (max > (RegExpTree::kInfinity - next) / 10) {
1066           do {
1067             Advance();
1068           } while (IsDecimalDigit(current()));
1069           max = RegExpTree::kInfinity;
1070           break;
1071         }
1072         max = 10 * max + next;
1073         Advance();
1074       }
1075       if (current() != '}') {
1076         Reset(start);
1077         return false;
1078       }
1079       Advance();
1080     }
1081   } else {
1082     Reset(start);
1083     return false;
1084   }
1085   *min_out = min;
1086   *max_out = max;
1087   return true;
1088 }
1089 
1090 
ParseOctalLiteral()1091 uc32 RegExpParser::ParseOctalLiteral() {
1092   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1093   // For compatibility with some other browsers (not all), we parse
1094   // up to three octal digits with a value below 256.
1095   // ES#prod-annexB-LegacyOctalEscapeSequence
1096   uc32 value = current() - '0';
1097   Advance();
1098   if ('0' <= current() && current() <= '7') {
1099     value = value * 8 + current() - '0';
1100     Advance();
1101     if (value < 32 && '0' <= current() && current() <= '7') {
1102       value = value * 8 + current() - '0';
1103       Advance();
1104     }
1105   }
1106   return value;
1107 }
1108 
1109 
ParseHexEscape(int length,uc32 * value)1110 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1111   int start = position();
1112   uc32 val = 0;
1113   for (int i = 0; i < length; ++i) {
1114     uc32 c = current();
1115     int d = HexValue(c);
1116     if (d < 0) {
1117       Reset(start);
1118       return false;
1119     }
1120     val = val * 16 + d;
1121     Advance();
1122   }
1123   *value = val;
1124   return true;
1125 }
1126 
1127 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1128 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1129   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1130   // allowed). In the latter case, the number of hex digits between { } is
1131   // arbitrary. \ and u have already been read.
1132   if (current() == '{' && unicode()) {
1133     int start = position();
1134     Advance();
1135     if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1136       if (current() == '}') {
1137         Advance();
1138         return true;
1139       }
1140     }
1141     Reset(start);
1142     return false;
1143   }
1144   // \u but no {, or \u{...} escapes not allowed.
1145   bool result = ParseHexEscape(4, value);
1146   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1147       current() == '\\') {
1148     // Attempt to read trail surrogate.
1149     int start = position();
1150     if (Next() == 'u') {
1151       Advance(2);
1152       uc32 trail;
1153       if (ParseHexEscape(4, &trail) &&
1154           unibrow::Utf16::IsTrailSurrogate(trail)) {
1155         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1156                                                       static_cast<uc16>(trail));
1157         return true;
1158       }
1159     }
1160     Reset(start);
1161   }
1162   return result;
1163 }
1164 
1165 #ifdef V8_INTL_SUPPORT
1166 
1167 namespace {
1168 
IsExactPropertyAlias(const char * property_name,UProperty property)1169 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1170   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1171   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1172     return true;
1173   for (int i = 0;; i++) {
1174     const char* long_name = u_getPropertyName(
1175         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1176     if (long_name == nullptr) break;
1177     if (strcmp(property_name, long_name) == 0) return true;
1178   }
1179   return false;
1180 }
1181 
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1182 bool IsExactPropertyValueAlias(const char* property_value_name,
1183                                UProperty property, int32_t property_value) {
1184   const char* short_name =
1185       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1186   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1187     return true;
1188   }
1189   for (int i = 0;; i++) {
1190     const char* long_name = u_getPropertyValueName(
1191         property, property_value,
1192         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1193     if (long_name == nullptr) break;
1194     if (strcmp(property_value_name, long_name) == 0) return true;
1195   }
1196   return false;
1197 }
1198 
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1199 bool LookupPropertyValueName(UProperty property,
1200                              const char* property_value_name, bool negate,
1201                              ZoneList<CharacterRange>* result, Zone* zone) {
1202   UProperty property_for_lookup = property;
1203   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1204     // For the property Script_Extensions, we have to do the property value
1205     // name lookup as if the property is Script.
1206     property_for_lookup = UCHAR_SCRIPT;
1207   }
1208   int32_t property_value =
1209       u_getPropertyValueEnum(property_for_lookup, property_value_name);
1210   if (property_value == UCHAR_INVALID_CODE) return false;
1211 
1212   // We require the property name to match exactly to one of the property value
1213   // aliases. However, u_getPropertyValueEnum uses loose matching.
1214   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1215                                  property_value)) {
1216     return false;
1217   }
1218 
1219   UErrorCode ec = U_ZERO_ERROR;
1220   icu::UnicodeSet set;
1221   set.applyIntPropertyValue(property, property_value, ec);
1222   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1223 
1224   if (success) {
1225     set.removeAllStrings();
1226     if (negate) set.complement();
1227     for (int i = 0; i < set.getRangeCount(); i++) {
1228       result->Add(
1229           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1230           zone);
1231     }
1232   }
1233   return success;
1234 }
1235 
1236 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1237 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1238   return strncmp(name, literal, N + 1) == 0;
1239 }
1240 
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1241 bool LookupSpecialPropertyValueName(const char* name,
1242                                     ZoneList<CharacterRange>* result,
1243                                     bool negate, Zone* zone) {
1244   if (NameEquals(name, "Any")) {
1245     if (negate) {
1246       // Leave the list of character ranges empty, since the negation of 'Any'
1247       // is the empty set.
1248     } else {
1249       result->Add(CharacterRange::Everything(), zone);
1250     }
1251   } else if (NameEquals(name, "ASCII")) {
1252     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1253                        : CharacterRange::Range(0x0, 0x7F),
1254                 zone);
1255   } else if (NameEquals(name, "Assigned")) {
1256     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1257                                    !negate, result, zone);
1258   } else {
1259     return false;
1260   }
1261   return true;
1262 }
1263 
1264 // Explicitly whitelist supported binary properties. The spec forbids supporting
1265 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1266 bool IsSupportedBinaryProperty(UProperty property) {
1267   switch (property) {
1268     case UCHAR_ALPHABETIC:
1269     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1270     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1271     case UCHAR_ASCII_HEX_DIGIT:
1272     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1273     case UCHAR_BIDI_CONTROL:
1274     case UCHAR_BIDI_MIRRORED:
1275     case UCHAR_CASE_IGNORABLE:
1276     case UCHAR_CASED:
1277     case UCHAR_CHANGES_WHEN_CASEFOLDED:
1278     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1279     case UCHAR_CHANGES_WHEN_LOWERCASED:
1280     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1281     case UCHAR_CHANGES_WHEN_TITLECASED:
1282     case UCHAR_CHANGES_WHEN_UPPERCASED:
1283     case UCHAR_DASH:
1284     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1285     case UCHAR_DEPRECATED:
1286     case UCHAR_DIACRITIC:
1287     case UCHAR_EMOJI:
1288 #if U_ICU_VERSION_MAJOR_NUM >= 60
1289     case UCHAR_EMOJI_COMPONENT:
1290 #endif
1291     case UCHAR_EMOJI_MODIFIER_BASE:
1292     case UCHAR_EMOJI_MODIFIER:
1293     case UCHAR_EMOJI_PRESENTATION:
1294 #if U_ICU_VERSION_MAJOR_NUM >= 62
1295     case UCHAR_EXTENDED_PICTOGRAPHIC:
1296 #endif
1297     case UCHAR_EXTENDER:
1298     case UCHAR_GRAPHEME_BASE:
1299     case UCHAR_GRAPHEME_EXTEND:
1300     case UCHAR_HEX_DIGIT:
1301     case UCHAR_ID_CONTINUE:
1302     case UCHAR_ID_START:
1303     case UCHAR_IDEOGRAPHIC:
1304     case UCHAR_IDS_BINARY_OPERATOR:
1305     case UCHAR_IDS_TRINARY_OPERATOR:
1306     case UCHAR_JOIN_CONTROL:
1307     case UCHAR_LOGICAL_ORDER_EXCEPTION:
1308     case UCHAR_LOWERCASE:
1309     case UCHAR_MATH:
1310     case UCHAR_NONCHARACTER_CODE_POINT:
1311     case UCHAR_PATTERN_SYNTAX:
1312     case UCHAR_PATTERN_WHITE_SPACE:
1313     case UCHAR_QUOTATION_MARK:
1314     case UCHAR_RADICAL:
1315 #if U_ICU_VERSION_MAJOR_NUM >= 60
1316     case UCHAR_REGIONAL_INDICATOR:
1317 #endif
1318     case UCHAR_S_TERM:
1319     case UCHAR_SOFT_DOTTED:
1320     case UCHAR_TERMINAL_PUNCTUATION:
1321     case UCHAR_UNIFIED_IDEOGRAPH:
1322     case UCHAR_UPPERCASE:
1323     case UCHAR_VARIATION_SELECTOR:
1324     case UCHAR_WHITE_SPACE:
1325     case UCHAR_XID_CONTINUE:
1326     case UCHAR_XID_START:
1327       return true;
1328     default:
1329       break;
1330   }
1331   return false;
1332 }
1333 
IsUnicodePropertyValueCharacter(char c)1334 bool IsUnicodePropertyValueCharacter(char c) {
1335   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1336   //
1337   // Note that using this to validate each parsed char is quite conservative.
1338   // A possible alternative solution would be to only ensure the parsed
1339   // property name/value candidate string does not contain '\0' characters and
1340   // let ICU lookups trigger the final failure.
1341   if ('a' <= c && c <= 'z') return true;
1342   if ('A' <= c && c <= 'Z') return true;
1343   if ('0' <= c && c <= '9') return true;
1344   return (c == '_');
1345 }
1346 
1347 }  // anonymous namespace
1348 
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1349 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1350                                       bool negate) {
1351   // Parse the property class as follows:
1352   // - In \p{name}, 'name' is interpreted
1353   //   - either as a general category property value name.
1354   //   - or as a binary property name.
1355   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1356   //   and 'value' is interpreted as one of the available property value names.
1357   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1358   // - Loose matching is not applied.
1359   std::vector<char> first_part;
1360   std::vector<char> second_part;
1361   if (current() == '{') {
1362     // Parse \p{[PropertyName=]PropertyNameValue}
1363     for (Advance(); current() != '}' && current() != '='; Advance()) {
1364       if (!IsUnicodePropertyValueCharacter(current())) return false;
1365       if (!has_next()) return false;
1366       first_part.push_back(static_cast<char>(current()));
1367     }
1368     if (current() == '=') {
1369       for (Advance(); current() != '}'; Advance()) {
1370         if (!IsUnicodePropertyValueCharacter(current())) return false;
1371         if (!has_next()) return false;
1372         second_part.push_back(static_cast<char>(current()));
1373       }
1374       second_part.push_back(0);  // null-terminate string.
1375     }
1376   } else {
1377     return false;
1378   }
1379   Advance();
1380   first_part.push_back(0);  // null-terminate string.
1381 
1382   DCHECK(first_part.size() - 1 == std::strlen(first_part.data()));
1383   DCHECK(second_part.empty() ||
1384          second_part.size() - 1 == std::strlen(second_part.data()));
1385 
1386   if (second_part.empty()) {
1387     // First attempt to interpret as general category property value name.
1388     const char* name = first_part.data();
1389     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1390                                 result, zone())) {
1391       return true;
1392     }
1393     // Interpret "Any", "ASCII", and "Assigned".
1394     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1395       return true;
1396     }
1397     // Then attempt to interpret as binary property name with value name 'Y'.
1398     UProperty property = u_getPropertyEnum(name);
1399     if (!IsSupportedBinaryProperty(property)) return false;
1400     if (!IsExactPropertyAlias(name, property)) return false;
1401     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1402                                    zone());
1403   } else {
1404     // Both property name and value name are specified. Attempt to interpret
1405     // the property name as enumerated property.
1406     const char* property_name = first_part.data();
1407     const char* value_name = second_part.data();
1408     UProperty property = u_getPropertyEnum(property_name);
1409     if (!IsExactPropertyAlias(property_name, property)) return false;
1410     if (property == UCHAR_GENERAL_CATEGORY) {
1411       // We want to allow aggregate value names such as "Letter".
1412       property = UCHAR_GENERAL_CATEGORY_MASK;
1413     } else if (property != UCHAR_SCRIPT &&
1414                property != UCHAR_SCRIPT_EXTENSIONS) {
1415       return false;
1416     }
1417     return LookupPropertyValueName(property, value_name, negate, result,
1418                                    zone());
1419   }
1420 }
1421 
1422 #else  // V8_INTL_SUPPORT
1423 
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1424 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1425                                       bool negate) {
1426   return false;
1427 }
1428 
1429 #endif  // V8_INTL_SUPPORT
1430 
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1431 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1432   uc32 x = 0;
1433   int d = HexValue(current());
1434   if (d < 0) {
1435     return false;
1436   }
1437   while (d >= 0) {
1438     x = x * 16 + d;
1439     if (x > max_value) {
1440       return false;
1441     }
1442     Advance();
1443     d = HexValue(current());
1444   }
1445   *value = x;
1446   return true;
1447 }
1448 
1449 
ParseClassCharacterEscape()1450 uc32 RegExpParser::ParseClassCharacterEscape() {
1451   DCHECK_EQ('\\', current());
1452   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1453   Advance();
1454   switch (current()) {
1455     case 'b':
1456       Advance();
1457       return '\b';
1458     // ControlEscape :: one of
1459     //   f n r t v
1460     case 'f':
1461       Advance();
1462       return '\f';
1463     case 'n':
1464       Advance();
1465       return '\n';
1466     case 'r':
1467       Advance();
1468       return '\r';
1469     case 't':
1470       Advance();
1471       return '\t';
1472     case 'v':
1473       Advance();
1474       return '\v';
1475     case 'c': {
1476       uc32 controlLetter = Next();
1477       uc32 letter = controlLetter & ~('A' ^ 'a');
1478       // Inside a character class, we also accept digits and underscore as
1479       // control characters, unless with /u. See Annex B:
1480       // ES#prod-annexB-ClassControlLetter
1481       if (letter >= 'A' && letter <= 'Z') {
1482         Advance(2);
1483         // Control letters mapped to ASCII control characters in the range
1484         // 0x00-0x1F.
1485         return controlLetter & 0x1F;
1486       }
1487       if (unicode()) {
1488         // With /u, invalid escapes are not treated as identity escapes.
1489         ReportError(CStrVector("Invalid class escape"));
1490         return 0;
1491       }
1492       if ((controlLetter >= '0' && controlLetter <= '9') ||
1493           controlLetter == '_') {
1494         Advance(2);
1495         return controlLetter & 0x1F;
1496       }
1497       // We match JSC in reading the backslash as a literal
1498       // character instead of as starting an escape.
1499       // TODO(v8:6201): Not yet covered by the spec.
1500       return '\\';
1501     }
1502     case '0':
1503       // With /u, \0 is interpreted as NUL if not followed by another digit.
1504       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1505         Advance();
1506         return 0;
1507       }
1508       V8_FALLTHROUGH;
1509     case '1':
1510     case '2':
1511     case '3':
1512     case '4':
1513     case '5':
1514     case '6':
1515     case '7':
1516       // For compatibility, we interpret a decimal escape that isn't
1517       // a back reference (and therefore either \0 or not valid according
1518       // to the specification) as a 1..3 digit octal character code.
1519       // ES#prod-annexB-LegacyOctalEscapeSequence
1520       if (unicode()) {
1521         // With /u, decimal escape is not interpreted as octal character code.
1522         ReportError(CStrVector("Invalid class escape"));
1523         return 0;
1524       }
1525       return ParseOctalLiteral();
1526     case 'x': {
1527       Advance();
1528       uc32 value;
1529       if (ParseHexEscape(2, &value)) return value;
1530       if (unicode()) {
1531         // With /u, invalid escapes are not treated as identity escapes.
1532         ReportError(CStrVector("Invalid escape"));
1533         return 0;
1534       }
1535       // If \x is not followed by a two-digit hexadecimal, treat it
1536       // as an identity escape.
1537       return 'x';
1538     }
1539     case 'u': {
1540       Advance();
1541       uc32 value;
1542       if (ParseUnicodeEscape(&value)) return value;
1543       if (unicode()) {
1544         // With /u, invalid escapes are not treated as identity escapes.
1545         ReportError(CStrVector("Invalid unicode escape"));
1546         return 0;
1547       }
1548       // If \u is not followed by a two-digit hexadecimal, treat it
1549       // as an identity escape.
1550       return 'u';
1551     }
1552     default: {
1553       uc32 result = current();
1554       // With /u, no identity escapes except for syntax characters and '-' are
1555       // allowed. Otherwise, all identity escapes are allowed.
1556       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1557         Advance();
1558         return result;
1559       }
1560       ReportError(CStrVector("Invalid escape"));
1561       return 0;
1562     }
1563   }
1564   return 0;
1565 }
1566 
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,uc32 * char_out,bool * is_class_escape)1567 void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1568                                     Zone* zone,
1569                                     bool add_unicode_case_equivalents,
1570                                     uc32* char_out, bool* is_class_escape) {
1571   uc32 current_char = current();
1572   if (current_char == '\\') {
1573     switch (Next()) {
1574       case 'w':
1575       case 'W':
1576       case 'd':
1577       case 'D':
1578       case 's':
1579       case 'S': {
1580         CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1581                                        add_unicode_case_equivalents, zone);
1582         Advance(2);
1583         *is_class_escape = true;
1584         return;
1585       }
1586       case kEndMarker:
1587         ReportError(CStrVector("\\ at end of pattern"));
1588         return;
1589       case 'p':
1590       case 'P':
1591         if (unicode()) {
1592           bool negate = Next() == 'P';
1593           Advance(2);
1594           if (!ParsePropertyClass(ranges, negate)) {
1595             ReportError(CStrVector("Invalid property name in character class"));
1596           }
1597           *is_class_escape = true;
1598           return;
1599         }
1600         break;
1601       default:
1602         break;
1603     }
1604     *char_out = ParseClassCharacterEscape();
1605     *is_class_escape = false;
1606   } else {
1607     Advance();
1608     *char_out = current_char;
1609     *is_class_escape = false;
1610   }
1611 }
1612 
ParseCharacterClass(const RegExpBuilder * builder)1613 RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1614   static const char* kUnterminated = "Unterminated character class";
1615   static const char* kRangeInvalid = "Invalid character class";
1616   static const char* kRangeOutOfOrder = "Range out of order in character class";
1617 
1618   DCHECK_EQ(current(), '[');
1619   Advance();
1620   bool is_negated = false;
1621   if (current() == '^') {
1622     is_negated = true;
1623     Advance();
1624   }
1625   ZoneList<CharacterRange>* ranges =
1626       new (zone()) ZoneList<CharacterRange>(2, zone());
1627   bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1628   while (has_more() && current() != ']') {
1629     uc32 char_1, char_2;
1630     bool is_class_1, is_class_2;
1631     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1632                      &is_class_1 CHECK_FAILED);
1633     if (current() == '-') {
1634       Advance();
1635       if (current() == kEndMarker) {
1636         // If we reach the end we break out of the loop and let the
1637         // following code report an error.
1638         break;
1639       } else if (current() == ']') {
1640         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1641         ranges->Add(CharacterRange::Singleton('-'), zone());
1642         break;
1643       }
1644       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1645                        &is_class_2 CHECK_FAILED);
1646       if (is_class_1 || is_class_2) {
1647         // Either end is an escaped character class. Treat the '-' verbatim.
1648         if (unicode()) {
1649           // ES2015 21.2.2.15.1 step 1.
1650           return ReportError(CStrVector(kRangeInvalid));
1651         }
1652         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1653         ranges->Add(CharacterRange::Singleton('-'), zone());
1654         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1655         continue;
1656       }
1657       // ES2015 21.2.2.15.1 step 6.
1658       if (char_1 > char_2) {
1659         return ReportError(CStrVector(kRangeOutOfOrder));
1660       }
1661       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1662     } else {
1663       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1664     }
1665   }
1666   if (!has_more()) {
1667     return ReportError(CStrVector(kUnterminated));
1668   }
1669   Advance();
1670   RegExpCharacterClass::CharacterClassFlags character_class_flags;
1671   if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1672   return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1673                                            character_class_flags);
1674 }
1675 
1676 
1677 #undef CHECK_FAILED
1678 
1679 
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1680 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1681                                FlatStringReader* input, JSRegExp::Flags flags,
1682                                RegExpCompileData* result) {
1683   DCHECK(result != nullptr);
1684   RegExpParser parser(input, &result->error, flags, isolate, zone);
1685   RegExpTree* tree = parser.ParsePattern();
1686   if (parser.failed()) {
1687     DCHECK(tree == nullptr);
1688     DCHECK(!result->error.is_null());
1689   } else {
1690     DCHECK(tree != nullptr);
1691     DCHECK(result->error.is_null());
1692     if (FLAG_trace_regexp_parser) {
1693       StdoutStream os;
1694       tree->Print(os, zone);
1695       os << "\n";
1696     }
1697     result->tree = tree;
1698     int capture_count = parser.captures_started();
1699     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1700     result->contains_anchor = parser.contains_anchor();
1701     result->capture_name_map = parser.CreateCaptureNameMap();
1702     result->capture_count = capture_count;
1703   }
1704   return !parser.failed();
1705 }
1706 
RegExpBuilder(Zone * zone,JSRegExp::Flags flags)1707 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1708     : zone_(zone),
1709       pending_empty_(false),
1710       flags_(flags),
1711       characters_(nullptr),
1712       pending_surrogate_(kNoPendingSurrogate),
1713       terms_(),
1714       alternatives_()
1715 #ifdef DEBUG
1716       ,
1717       last_added_(ADD_NONE)
1718 #endif
1719 {
1720 }
1721 
1722 
AddLeadSurrogate(uc16 lead_surrogate)1723 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1724   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1725   FlushPendingSurrogate();
1726   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1727   pending_surrogate_ = lead_surrogate;
1728 }
1729 
1730 
AddTrailSurrogate(uc16 trail_surrogate)1731 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1732   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1733   if (pending_surrogate_ != kNoPendingSurrogate) {
1734     uc16 lead_surrogate = pending_surrogate_;
1735     pending_surrogate_ = kNoPendingSurrogate;
1736     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1737     uc32 combined =
1738         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1739     if (NeedsDesugaringForIgnoreCase(combined)) {
1740       AddCharacterClassForDesugaring(combined);
1741     } else {
1742       ZoneList<uc16> surrogate_pair(2, zone());
1743       surrogate_pair.Add(lead_surrogate, zone());
1744       surrogate_pair.Add(trail_surrogate, zone());
1745       RegExpAtom* atom =
1746           new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1747       AddAtom(atom);
1748     }
1749   } else {
1750     pending_surrogate_ = trail_surrogate;
1751     FlushPendingSurrogate();
1752   }
1753 }
1754 
1755 
FlushPendingSurrogate()1756 void RegExpBuilder::FlushPendingSurrogate() {
1757   if (pending_surrogate_ != kNoPendingSurrogate) {
1758     DCHECK(unicode());
1759     uc32 c = pending_surrogate_;
1760     pending_surrogate_ = kNoPendingSurrogate;
1761     AddCharacterClassForDesugaring(c);
1762   }
1763 }
1764 
1765 
FlushCharacters()1766 void RegExpBuilder::FlushCharacters() {
1767   FlushPendingSurrogate();
1768   pending_empty_ = false;
1769   if (characters_ != nullptr) {
1770     RegExpTree* atom =
1771         new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1772     characters_ = nullptr;
1773     text_.Add(atom, zone());
1774     LAST(ADD_ATOM);
1775   }
1776 }
1777 
1778 
FlushText()1779 void RegExpBuilder::FlushText() {
1780   FlushCharacters();
1781   int num_text = text_.length();
1782   if (num_text == 0) {
1783     return;
1784   } else if (num_text == 1) {
1785     terms_.Add(text_.last(), zone());
1786   } else {
1787     RegExpText* text = new (zone()) RegExpText(zone());
1788     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1789     terms_.Add(text, zone());
1790   }
1791   text_.Clear();
1792 }
1793 
1794 
AddCharacter(uc16 c)1795 void RegExpBuilder::AddCharacter(uc16 c) {
1796   FlushPendingSurrogate();
1797   pending_empty_ = false;
1798   if (NeedsDesugaringForIgnoreCase(c)) {
1799     AddCharacterClassForDesugaring(c);
1800   } else {
1801     if (characters_ == nullptr) {
1802       characters_ = new (zone()) ZoneList<uc16>(4, zone());
1803     }
1804     characters_->Add(c, zone());
1805     LAST(ADD_CHAR);
1806   }
1807 }
1808 
1809 
AddUnicodeCharacter(uc32 c)1810 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1811   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1812     DCHECK(unicode());
1813     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1814     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1815   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1816     AddLeadSurrogate(c);
1817   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1818     AddTrailSurrogate(c);
1819   } else {
1820     AddCharacter(static_cast<uc16>(c));
1821   }
1822 }
1823 
AddEscapedUnicodeCharacter(uc32 character)1824 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1825   // A lead or trail surrogate parsed via escape sequence will not
1826   // pair up with any preceding lead or following trail surrogate.
1827   FlushPendingSurrogate();
1828   AddUnicodeCharacter(character);
1829   FlushPendingSurrogate();
1830 }
1831 
AddEmpty()1832 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1833 
1834 
AddCharacterClass(RegExpCharacterClass * cc)1835 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1836   if (NeedsDesugaringForUnicode(cc)) {
1837     // With /u, character class needs to be desugared, so it
1838     // must be a standalone term instead of being part of a RegExpText.
1839     AddTerm(cc);
1840   } else {
1841     AddAtom(cc);
1842   }
1843 }
1844 
AddCharacterClassForDesugaring(uc32 c)1845 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1846   AddTerm(new (zone()) RegExpCharacterClass(
1847       zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1848       flags_));
1849 }
1850 
1851 
AddAtom(RegExpTree * term)1852 void RegExpBuilder::AddAtom(RegExpTree* term) {
1853   if (term->IsEmpty()) {
1854     AddEmpty();
1855     return;
1856   }
1857   if (term->IsTextElement()) {
1858     FlushCharacters();
1859     text_.Add(term, zone());
1860   } else {
1861     FlushText();
1862     terms_.Add(term, zone());
1863   }
1864   LAST(ADD_ATOM);
1865 }
1866 
1867 
AddTerm(RegExpTree * term)1868 void RegExpBuilder::AddTerm(RegExpTree* term) {
1869   FlushText();
1870   terms_.Add(term, zone());
1871   LAST(ADD_ATOM);
1872 }
1873 
1874 
AddAssertion(RegExpTree * assert)1875 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1876   FlushText();
1877   if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1878     // Omit repeated assertions of the same type.
1879     RegExpAssertion* last = terms_.last()->AsAssertion();
1880     RegExpAssertion* next = assert->AsAssertion();
1881     if (last->assertion_type() == next->assertion_type()) return;
1882   }
1883   terms_.Add(assert, zone());
1884   LAST(ADD_ASSERT);
1885 }
1886 
1887 
NewAlternative()1888 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1889 
1890 
FlushTerms()1891 void RegExpBuilder::FlushTerms() {
1892   FlushText();
1893   int num_terms = terms_.length();
1894   RegExpTree* alternative;
1895   if (num_terms == 0) {
1896     alternative = new (zone()) RegExpEmpty();
1897   } else if (num_terms == 1) {
1898     alternative = terms_.last();
1899   } else {
1900     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1901   }
1902   alternatives_.Add(alternative, zone());
1903   terms_.Clear();
1904   LAST(ADD_NONE);
1905 }
1906 
1907 
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)1908 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1909   if (!unicode()) return false;
1910   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1911   // necessarily mean that we need to desugar. It's probably nicer to have a
1912   // separate pass to figure out unicode desugarings.
1913   if (ignore_case()) return true;
1914   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1915   CharacterRange::Canonicalize(ranges);
1916   for (int i = ranges->length() - 1; i >= 0; i--) {
1917     uc32 from = ranges->at(i).from();
1918     uc32 to = ranges->at(i).to();
1919     // Check for non-BMP characters.
1920     if (to >= kNonBmpStart) return true;
1921     // Check for lone surrogates.
1922     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1923   }
1924   return false;
1925 }
1926 
1927 
NeedsDesugaringForIgnoreCase(uc32 c)1928 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1929 #ifdef V8_INTL_SUPPORT
1930   if (unicode() && ignore_case()) {
1931     icu::UnicodeSet set(c, c);
1932     set.closeOver(USET_CASE_INSENSITIVE);
1933     set.removeAllStrings();
1934     return set.size() > 1;
1935   }
1936   // In the case where ICU is not included, we act as if the unicode flag is
1937   // not set, and do not desugar.
1938 #endif  // V8_INTL_SUPPORT
1939   return false;
1940 }
1941 
1942 
ToRegExp()1943 RegExpTree* RegExpBuilder::ToRegExp() {
1944   FlushTerms();
1945   int num_alternatives = alternatives_.length();
1946   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1947   if (num_alternatives == 1) return alternatives_.last();
1948   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1949 }
1950 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1951 bool RegExpBuilder::AddQuantifierToAtom(
1952     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1953   FlushPendingSurrogate();
1954   if (pending_empty_) {
1955     pending_empty_ = false;
1956     return true;
1957   }
1958   RegExpTree* atom;
1959   if (characters_ != nullptr) {
1960     DCHECK(last_added_ == ADD_CHAR);
1961     // Last atom was character.
1962     Vector<const uc16> char_vector = characters_->ToConstVector();
1963     int num_chars = char_vector.length();
1964     if (num_chars > 1) {
1965       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1966       text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
1967       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1968     }
1969     characters_ = nullptr;
1970     atom = new (zone()) RegExpAtom(char_vector, flags_);
1971     FlushText();
1972   } else if (text_.length() > 0) {
1973     DCHECK(last_added_ == ADD_ATOM);
1974     atom = text_.RemoveLast();
1975     FlushText();
1976   } else if (terms_.length() > 0) {
1977     DCHECK(last_added_ == ADD_ATOM);
1978     atom = terms_.RemoveLast();
1979     if (atom->IsLookaround()) {
1980       // With /u, lookarounds are not quantifiable.
1981       if (unicode()) return false;
1982       // Lookbehinds are not quantifiable.
1983       if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
1984         return false;
1985       }
1986     }
1987     if (atom->max_match() == 0) {
1988       // Guaranteed to only match an empty string.
1989       LAST(ADD_TERM);
1990       if (min == 0) {
1991         return true;
1992       }
1993       terms_.Add(atom, zone());
1994       return true;
1995     }
1996   } else {
1997     // Only call immediately after adding an atom or character!
1998     UNREACHABLE();
1999   }
2000   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2001              zone());
2002   LAST(ADD_TERM);
2003   return true;
2004 }
2005 
2006 }  // namespace internal
2007 }  // namespace v8
2008