• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h"
9 #include "src/isolate.h"
10 #include "src/objects-inl.h"
11 #include "src/ostreams.h"
12 #include "src/regexp/jsregexp.h"
13 #include "src/utils.h"
14 
15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/uniset.h"
17 #endif  // V8_I18N_SUPPORT
18 
19 namespace v8 {
20 namespace internal {
21 
RegExpParser(FlatStringReader * in,Handle<String> * error,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24     : isolate_(isolate),
25       zone_(zone),
26       error_(error),
27       captures_(NULL),
28       named_captures_(NULL),
29       named_back_references_(NULL),
30       in_(in),
31       current_(kEndMarker),
32       ignore_case_(flags & JSRegExp::kIgnoreCase),
33       multiline_(flags & JSRegExp::kMultiline),
34       unicode_(flags & JSRegExp::kUnicode),
35       next_pos_(0),
36       captures_started_(0),
37       capture_count_(0),
38       has_more_(true),
39       simple_(false),
40       contains_anchor_(false),
41       is_scanned_for_captures_(false),
42       failed_(false) {
43   Advance();
44 }
45 
46 template <bool update_position>
ReadNext()47 inline uc32 RegExpParser::ReadNext() {
48   int position = next_pos_;
49   uc32 c0 = in()->Get(position);
50   position++;
51   // Read the whole surrogate pair in case of unicode flag, if possible.
52   if (unicode() && position < in()->length() &&
53       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
54     uc16 c1 = in()->Get(position);
55     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
56       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
57       position++;
58     }
59   }
60   if (update_position) next_pos_ = position;
61   return c0;
62 }
63 
64 
Next()65 uc32 RegExpParser::Next() {
66   if (has_next()) {
67     return ReadNext<false>();
68   } else {
69     return kEndMarker;
70   }
71 }
72 
73 
Advance()74 void RegExpParser::Advance() {
75   if (has_next()) {
76     StackLimitCheck check(isolate());
77     if (check.HasOverflowed()) {
78       if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
79       ReportError(CStrVector(
80           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
81     } else if (zone()->excess_allocation()) {
82       ReportError(CStrVector("Regular expression too large"));
83     } else {
84       current_ = ReadNext<true>();
85     }
86   } else {
87     current_ = kEndMarker;
88     // Advance so that position() points to 1-after-the-last-character. This is
89     // important so that Reset() to this position works correctly.
90     next_pos_ = in()->length() + 1;
91     has_more_ = false;
92   }
93 }
94 
95 
Reset(int pos)96 void RegExpParser::Reset(int pos) {
97   next_pos_ = pos;
98   has_more_ = (pos < in()->length());
99   Advance();
100 }
101 
102 
Advance(int dist)103 void RegExpParser::Advance(int dist) {
104   next_pos_ += dist - 1;
105   Advance();
106 }
107 
108 
simple()109 bool RegExpParser::simple() { return simple_; }
110 
IsSyntaxCharacterOrSlash(uc32 c)111 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
112   switch (c) {
113     case '^':
114     case '$':
115     case '\\':
116     case '.':
117     case '*':
118     case '+':
119     case '?':
120     case '(':
121     case ')':
122     case '[':
123     case ']':
124     case '{':
125     case '}':
126     case '|':
127     case '/':
128       return true;
129     default:
130       break;
131   }
132   return false;
133 }
134 
135 
ReportError(Vector<const char> message)136 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
137   if (failed_) return NULL;  // Do not overwrite any existing error.
138   failed_ = true;
139   *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
140   // Zip to the end to make sure the no more input is read.
141   current_ = kEndMarker;
142   next_pos_ = in()->length();
143   return NULL;
144 }
145 
146 
147 #define CHECK_FAILED /**/); \
148   if (failed_) return NULL; \
149   ((void)0
150 
151 
152 // Pattern ::
153 //   Disjunction
ParsePattern()154 RegExpTree* RegExpParser::ParsePattern() {
155   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
156   PatchNamedBackReferences(CHECK_FAILED);
157   DCHECK(!has_more());
158   // If the result of parsing is a literal string atom, and it has the
159   // same length as the input, then the atom is identical to the input.
160   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
161     simple_ = true;
162   }
163   return result;
164 }
165 
166 
167 // Disjunction ::
168 //   Alternative
169 //   Alternative | Disjunction
170 // Alternative ::
171 //   [empty]
172 //   Term Alternative
173 // Term ::
174 //   Assertion
175 //   Atom
176 //   Atom Quantifier
ParseDisjunction()177 RegExpTree* RegExpParser::ParseDisjunction() {
178   // Used to store current state while parsing subexpressions.
179   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
180                                   nullptr, ignore_case(), unicode(), zone());
181   RegExpParserState* state = &initial_state;
182   // Cache the builder in a local variable for quick access.
183   RegExpBuilder* builder = initial_state.builder();
184   while (true) {
185     switch (current()) {
186       case kEndMarker:
187         if (state->IsSubexpression()) {
188           // Inside a parenthesized group when hitting end of input.
189           return ReportError(CStrVector("Unterminated group"));
190         }
191         DCHECK_EQ(INITIAL, state->group_type());
192         // Parsing completed successfully.
193         return builder->ToRegExp();
194       case ')': {
195         if (!state->IsSubexpression()) {
196           return ReportError(CStrVector("Unmatched ')'"));
197         }
198         DCHECK_NE(INITIAL, state->group_type());
199 
200         Advance();
201         // End disjunction parsing and convert builder content to new single
202         // regexp atom.
203         RegExpTree* body = builder->ToRegExp();
204 
205         int end_capture_index = captures_started();
206 
207         int capture_index = state->capture_index();
208         SubexpressionType group_type = state->group_type();
209 
210         // Build result of subexpression.
211         if (group_type == CAPTURE) {
212           if (state->IsNamedCapture()) {
213             CreateNamedCaptureAtIndex(state->capture_name(),
214                                       capture_index CHECK_FAILED);
215           }
216           RegExpCapture* capture = GetCapture(capture_index);
217           capture->set_body(body);
218           body = capture;
219         } else if (group_type == GROUPING) {
220           body = new (zone()) RegExpGroup(body);
221         } else {
222           DCHECK(group_type == POSITIVE_LOOKAROUND ||
223                  group_type == NEGATIVE_LOOKAROUND);
224           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
225           body = new (zone()) RegExpLookaround(
226               body, is_positive, end_capture_index - capture_index,
227               capture_index, state->lookaround_type());
228         }
229 
230         // Restore previous state.
231         state = state->previous_state();
232         builder = state->builder();
233 
234         builder->AddAtom(body);
235         // For compatability with JSC and ES3, we allow quantifiers after
236         // lookaheads, and break in all cases.
237         break;
238       }
239       case '|': {
240         Advance();
241         builder->NewAlternative();
242         continue;
243       }
244       case '*':
245       case '+':
246       case '?':
247         return ReportError(CStrVector("Nothing to repeat"));
248       case '^': {
249         Advance();
250         if (multiline()) {
251           builder->AddAssertion(
252               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
253         } else {
254           builder->AddAssertion(
255               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
256           set_contains_anchor();
257         }
258         continue;
259       }
260       case '$': {
261         Advance();
262         RegExpAssertion::AssertionType assertion_type =
263             multiline() ? RegExpAssertion::END_OF_LINE
264                         : RegExpAssertion::END_OF_INPUT;
265         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
266         continue;
267       }
268       case '.': {
269         Advance();
270         // everything except \x0a, \x0d, \u2028 and \u2029
271         ZoneList<CharacterRange>* ranges =
272             new (zone()) ZoneList<CharacterRange>(2, zone());
273         CharacterRange::AddClassEscape('.', ranges, zone());
274         RegExpCharacterClass* cc =
275             new (zone()) RegExpCharacterClass(ranges, false);
276         builder->AddCharacterClass(cc);
277         break;
278       }
279       case '(': {
280         SubexpressionType subexpr_type = CAPTURE;
281         RegExpLookaround::Type lookaround_type = state->lookaround_type();
282         bool is_named_capture = false;
283         Advance();
284         if (current() == '?') {
285           switch (Next()) {
286             case ':':
287               subexpr_type = GROUPING;
288               Advance(2);
289               break;
290             case '=':
291               lookaround_type = RegExpLookaround::LOOKAHEAD;
292               subexpr_type = POSITIVE_LOOKAROUND;
293               Advance(2);
294               break;
295             case '!':
296               lookaround_type = RegExpLookaround::LOOKAHEAD;
297               subexpr_type = NEGATIVE_LOOKAROUND;
298               Advance(2);
299               break;
300             case '<':
301               Advance();
302               if (FLAG_harmony_regexp_lookbehind) {
303                 if (Next() == '=') {
304                   subexpr_type = POSITIVE_LOOKAROUND;
305                   lookaround_type = RegExpLookaround::LOOKBEHIND;
306                   Advance(2);
307                   break;
308                 } else if (Next() == '!') {
309                   subexpr_type = NEGATIVE_LOOKAROUND;
310                   lookaround_type = RegExpLookaround::LOOKBEHIND;
311                   Advance(2);
312                   break;
313                 }
314               }
315               if (FLAG_harmony_regexp_named_captures && unicode()) {
316                 is_named_capture = true;
317                 Advance();
318                 break;
319               }
320             // Fall through.
321             default:
322               return ReportError(CStrVector("Invalid group"));
323           }
324         }
325 
326         const ZoneVector<uc16>* capture_name = nullptr;
327         if (subexpr_type == CAPTURE) {
328           if (captures_started_ >= kMaxCaptures) {
329             return ReportError(CStrVector("Too many captures"));
330           }
331           captures_started_++;
332 
333           if (is_named_capture) {
334             capture_name = ParseCaptureGroupName(CHECK_FAILED);
335           }
336         }
337         // Store current state and begin new disjunction parsing.
338         state = new (zone()) RegExpParserState(
339             state, subexpr_type, lookaround_type, captures_started_,
340             capture_name, ignore_case(), unicode(), zone());
341         builder = state->builder();
342         continue;
343       }
344       case '[': {
345         RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
346         builder->AddCharacterClass(cc->AsCharacterClass());
347         break;
348       }
349       // Atom ::
350       //   \ AtomEscape
351       case '\\':
352         switch (Next()) {
353           case kEndMarker:
354             return ReportError(CStrVector("\\ at end of pattern"));
355           case 'b':
356             Advance(2);
357             builder->AddAssertion(
358                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
359             continue;
360           case 'B':
361             Advance(2);
362             builder->AddAssertion(
363                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
364             continue;
365           // AtomEscape ::
366           //   CharacterClassEscape
367           //
368           // CharacterClassEscape :: one of
369           //   d D s S w W
370           case 'd':
371           case 'D':
372           case 's':
373           case 'S':
374           case 'w':
375           case 'W': {
376             uc32 c = Next();
377             Advance(2);
378             ZoneList<CharacterRange>* ranges =
379                 new (zone()) ZoneList<CharacterRange>(2, zone());
380             CharacterRange::AddClassEscape(c, ranges, zone());
381             RegExpCharacterClass* cc =
382                 new (zone()) RegExpCharacterClass(ranges, false);
383             builder->AddCharacterClass(cc);
384             break;
385           }
386           case 'p':
387           case 'P': {
388             uc32 p = Next();
389             Advance(2);
390             if (unicode()) {
391               if (FLAG_harmony_regexp_property) {
392                 ZoneList<CharacterRange>* ranges =
393                     new (zone()) ZoneList<CharacterRange>(2, zone());
394                 if (!ParsePropertyClass(ranges, p == 'P')) {
395                   return ReportError(CStrVector("Invalid property name"));
396                 }
397                 RegExpCharacterClass* cc =
398                     new (zone()) RegExpCharacterClass(ranges, false);
399                 builder->AddCharacterClass(cc);
400               } else {
401                 // With /u, no identity escapes except for syntax characters
402                 // are allowed. Otherwise, all identity escapes are allowed.
403                 return ReportError(CStrVector("Invalid escape"));
404               }
405             } else {
406               builder->AddCharacter(p);
407             }
408             break;
409           }
410           case '1':
411           case '2':
412           case '3':
413           case '4':
414           case '5':
415           case '6':
416           case '7':
417           case '8':
418           case '9': {
419             int index = 0;
420             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
421             if (is_backref) {
422               if (state->IsInsideCaptureGroup(index)) {
423                 // The back reference is inside the capture group it refers to.
424                 // Nothing can possibly have been captured yet, so we use empty
425                 // instead. This ensures that, when checking a back reference,
426                 // the capture registers of the referenced capture are either
427                 // both set or both cleared.
428                 builder->AddEmpty();
429               } else {
430                 RegExpCapture* capture = GetCapture(index);
431                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
432                 builder->AddAtom(atom);
433               }
434               break;
435             }
436             // With /u, no identity escapes except for syntax characters
437             // are allowed. Otherwise, all identity escapes are allowed.
438             if (unicode()) {
439               return ReportError(CStrVector("Invalid escape"));
440             }
441             uc32 first_digit = Next();
442             if (first_digit == '8' || first_digit == '9') {
443               builder->AddCharacter(first_digit);
444               Advance(2);
445               break;
446             }
447           }
448           // Fall through.
449           case '0': {
450             Advance();
451             if (unicode() && Next() >= '0' && Next() <= '9') {
452               // With /u, decimal escape with leading 0 are not parsed as octal.
453               return ReportError(CStrVector("Invalid decimal escape"));
454             }
455             uc32 octal = ParseOctalLiteral();
456             builder->AddCharacter(octal);
457             break;
458           }
459           // ControlEscape :: one of
460           //   f n r t v
461           case 'f':
462             Advance(2);
463             builder->AddCharacter('\f');
464             break;
465           case 'n':
466             Advance(2);
467             builder->AddCharacter('\n');
468             break;
469           case 'r':
470             Advance(2);
471             builder->AddCharacter('\r');
472             break;
473           case 't':
474             Advance(2);
475             builder->AddCharacter('\t');
476             break;
477           case 'v':
478             Advance(2);
479             builder->AddCharacter('\v');
480             break;
481           case 'c': {
482             Advance();
483             uc32 controlLetter = Next();
484             // Special case if it is an ASCII letter.
485             // Convert lower case letters to uppercase.
486             uc32 letter = controlLetter & ~('a' ^ 'A');
487             if (letter < 'A' || 'Z' < letter) {
488               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
489               // This is outside the specification. We match JSC in
490               // reading the backslash as a literal character instead
491               // of as starting an escape.
492               if (unicode()) {
493                 // With /u, invalid escapes are not treated as identity escapes.
494                 return ReportError(CStrVector("Invalid unicode escape"));
495               }
496               builder->AddCharacter('\\');
497             } else {
498               Advance(2);
499               builder->AddCharacter(controlLetter & 0x1f);
500             }
501             break;
502           }
503           case 'x': {
504             Advance(2);
505             uc32 value;
506             if (ParseHexEscape(2, &value)) {
507               builder->AddCharacter(value);
508             } else if (!unicode()) {
509               builder->AddCharacter('x');
510             } else {
511               // With /u, invalid escapes are not treated as identity escapes.
512               return ReportError(CStrVector("Invalid escape"));
513             }
514             break;
515           }
516           case 'u': {
517             Advance(2);
518             uc32 value;
519             if (ParseUnicodeEscape(&value)) {
520               builder->AddEscapedUnicodeCharacter(value);
521             } else if (!unicode()) {
522               builder->AddCharacter('u');
523             } else {
524               // With /u, invalid escapes are not treated as identity escapes.
525               return ReportError(CStrVector("Invalid unicode escape"));
526             }
527             break;
528           }
529           case 'k':
530             if (FLAG_harmony_regexp_named_captures && unicode()) {
531               Advance(2);
532               ParseNamedBackReference(builder, state CHECK_FAILED);
533               break;
534             }
535           // Fall through.
536           default:
537             Advance();
538             // With /u, no identity escapes except for syntax characters
539             // are allowed. Otherwise, all identity escapes are allowed.
540             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
541               builder->AddCharacter(current());
542               Advance();
543             } else {
544               return ReportError(CStrVector("Invalid escape"));
545             }
546             break;
547         }
548         break;
549       case '{': {
550         int dummy;
551         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
552         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
553         // Fall through.
554       }
555       case '}':
556       case ']':
557         if (unicode()) {
558           return ReportError(CStrVector("Lone quantifier brackets"));
559         }
560       // Fall through.
561       default:
562         builder->AddUnicodeCharacter(current());
563         Advance();
564         break;
565     }  // end switch(current())
566 
567     int min;
568     int max;
569     switch (current()) {
570       // QuantifierPrefix ::
571       //   *
572       //   +
573       //   ?
574       //   {
575       case '*':
576         min = 0;
577         max = RegExpTree::kInfinity;
578         Advance();
579         break;
580       case '+':
581         min = 1;
582         max = RegExpTree::kInfinity;
583         Advance();
584         break;
585       case '?':
586         min = 0;
587         max = 1;
588         Advance();
589         break;
590       case '{':
591         if (ParseIntervalQuantifier(&min, &max)) {
592           if (max < min) {
593             return ReportError(
594                 CStrVector("numbers out of order in {} quantifier"));
595           }
596           break;
597         } else if (unicode()) {
598           // With /u, incomplete quantifiers are not allowed.
599           return ReportError(CStrVector("Incomplete quantifier"));
600         }
601         continue;
602       default:
603         continue;
604     }
605     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
606     if (current() == '?') {
607       quantifier_type = RegExpQuantifier::NON_GREEDY;
608       Advance();
609     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
610       // FLAG_regexp_possessive_quantifier is a debug-only flag.
611       quantifier_type = RegExpQuantifier::POSSESSIVE;
612       Advance();
613     }
614     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
615       return ReportError(CStrVector("Invalid quantifier"));
616     }
617   }
618 }
619 
620 
621 #ifdef DEBUG
622 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)623 static bool IsSpecialClassEscape(uc32 c) {
624   switch (c) {
625     case 'd':
626     case 'D':
627     case 's':
628     case 'S':
629     case 'w':
630     case 'W':
631       return true;
632     default:
633       return false;
634   }
635 }
636 #endif
637 
638 
639 // In order to know whether an escape is a backreference or not we have to scan
640 // the entire regexp and find the number of capturing parentheses.  However we
641 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
642 // is called when needed.  It can see the difference between capturing and
643 // noncapturing parentheses and can skip character classes and backslash-escaped
644 // characters.
ScanForCaptures()645 void RegExpParser::ScanForCaptures() {
646   // Start with captures started previous to current position
647   int capture_count = captures_started();
648   // Add count of captures after this position.
649   int n;
650   while ((n = current()) != kEndMarker) {
651     Advance();
652     switch (n) {
653       case '\\':
654         Advance();
655         break;
656       case '[': {
657         int c;
658         while ((c = current()) != kEndMarker) {
659           Advance();
660           if (c == '\\') {
661             Advance();
662           } else {
663             if (c == ']') break;
664           }
665         }
666         break;
667       }
668       case '(':
669         if (current() != '?') capture_count++;
670         break;
671     }
672   }
673   capture_count_ = capture_count;
674   is_scanned_for_captures_ = true;
675 }
676 
677 
ParseBackReferenceIndex(int * index_out)678 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
679   DCHECK_EQ('\\', current());
680   DCHECK('1' <= Next() && Next() <= '9');
681   // Try to parse a decimal literal that is no greater than the total number
682   // of left capturing parentheses in the input.
683   int start = position();
684   int value = Next() - '0';
685   Advance(2);
686   while (true) {
687     uc32 c = current();
688     if (IsDecimalDigit(c)) {
689       value = 10 * value + (c - '0');
690       if (value > kMaxCaptures) {
691         Reset(start);
692         return false;
693       }
694       Advance();
695     } else {
696       break;
697     }
698   }
699   if (value > captures_started()) {
700     if (!is_scanned_for_captures_) {
701       int saved_position = position();
702       ScanForCaptures();
703       Reset(saved_position);
704     }
705     if (value > capture_count_) {
706       Reset(start);
707       return false;
708     }
709   }
710   *index_out = value;
711   return true;
712 }
713 
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)714 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
715   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
716     v->push_back(code_unit);
717   } else {
718     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
719     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
720   }
721 }
722 
ParseCaptureGroupName()723 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
724   DCHECK(FLAG_harmony_regexp_named_captures);
725   DCHECK(unicode());
726 
727   ZoneVector<uc16>* name =
728       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
729 
730   bool at_start = true;
731   while (true) {
732     uc32 c = current();
733     Advance();
734 
735     // Convert unicode escapes.
736     if (c == '\\' && current() == 'u') {
737       Advance();
738       if (!ParseUnicodeEscape(&c)) {
739         ReportError(CStrVector("Invalid Unicode escape sequence"));
740         return nullptr;
741       }
742     }
743 
744     if (at_start) {
745       if (!IdentifierStart::Is(c)) {
746         ReportError(CStrVector("Invalid capture group name"));
747         return nullptr;
748       }
749       push_code_unit(name, c);
750       at_start = false;
751     } else {
752       if (c == '>') {
753         break;
754       } else if (IdentifierPart::Is(c)) {
755         push_code_unit(name, c);
756       } else {
757         ReportError(CStrVector("Invalid capture group name"));
758         return nullptr;
759       }
760     }
761   }
762 
763   return name;
764 }
765 
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)766 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
767                                              int index) {
768   DCHECK(FLAG_harmony_regexp_named_captures);
769   DCHECK(unicode());
770   DCHECK(0 < index && index <= captures_started_);
771   DCHECK_NOT_NULL(name);
772 
773   if (named_captures_ == nullptr) {
774     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
775   } else {
776     // Check for duplicates and bail if we find any.
777     for (const auto& named_capture : *named_captures_) {
778       if (*named_capture->name() == *name) {
779         ReportError(CStrVector("Duplicate capture group name"));
780         return false;
781       }
782     }
783   }
784 
785   RegExpCapture* capture = GetCapture(index);
786   DCHECK(capture->name() == nullptr);
787 
788   capture->set_name(name);
789   named_captures_->Add(capture, zone());
790 
791   return true;
792 }
793 
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)794 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
795                                            RegExpParserState* state) {
796   // The parser is assumed to be on the '<' in \k<name>.
797   if (current() != '<') {
798     ReportError(CStrVector("Invalid named reference"));
799     return false;
800   }
801 
802   Advance();
803   const ZoneVector<uc16>* name = ParseCaptureGroupName();
804   if (name == nullptr) {
805     return false;
806   }
807 
808   if (state->IsInsideCaptureGroup(name)) {
809     builder->AddEmpty();
810   } else {
811     RegExpBackReference* atom = new (zone()) RegExpBackReference();
812     atom->set_name(name);
813 
814     builder->AddAtom(atom);
815 
816     if (named_back_references_ == nullptr) {
817       named_back_references_ =
818           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
819     }
820     named_back_references_->Add(atom, zone());
821   }
822 
823   return true;
824 }
825 
PatchNamedBackReferences()826 void RegExpParser::PatchNamedBackReferences() {
827   if (named_back_references_ == nullptr) return;
828 
829   if (named_captures_ == nullptr) {
830     ReportError(CStrVector("Invalid named capture referenced"));
831     return;
832   }
833 
834   // Look up and patch the actual capture for each named back reference.
835   // TODO(jgruber): O(n^2), optimize if necessary.
836 
837   for (int i = 0; i < named_back_references_->length(); i++) {
838     RegExpBackReference* ref = named_back_references_->at(i);
839 
840     int index = -1;
841     for (const auto& capture : *named_captures_) {
842       if (*capture->name() == *ref->name()) {
843         index = capture->index();
844         break;
845       }
846     }
847 
848     if (index == -1) {
849       ReportError(CStrVector("Invalid named capture referenced"));
850       return;
851     }
852 
853     ref->set_capture(GetCapture(index));
854   }
855 }
856 
GetCapture(int index)857 RegExpCapture* RegExpParser::GetCapture(int index) {
858   // The index for the capture groups are one-based. Its index in the list is
859   // zero-based.
860   int know_captures =
861       is_scanned_for_captures_ ? capture_count_ : captures_started_;
862   DCHECK(index <= know_captures);
863   if (captures_ == NULL) {
864     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
865   }
866   while (captures_->length() < know_captures) {
867     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
868   }
869   return captures_->at(index - 1);
870 }
871 
CreateCaptureNameMap()872 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
873   if (named_captures_ == nullptr || named_captures_->is_empty())
874     return Handle<FixedArray>();
875 
876   Factory* factory = isolate()->factory();
877 
878   int len = named_captures_->length() * 2;
879   Handle<FixedArray> array = factory->NewFixedArray(len);
880 
881   for (int i = 0; i < named_captures_->length(); i++) {
882     RegExpCapture* capture = named_captures_->at(i);
883     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
884     array->set(i * 2, *name.ToHandleChecked());
885     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
886   }
887 
888   return array;
889 }
890 
IsInsideCaptureGroup(int index)891 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
892   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
893     if (s->group_type() != CAPTURE) continue;
894     // Return true if we found the matching capture index.
895     if (index == s->capture_index()) return true;
896     // Abort if index is larger than what has been parsed up till this state.
897     if (index > s->capture_index()) return false;
898   }
899   return false;
900 }
901 
IsInsideCaptureGroup(const ZoneVector<uc16> * name)902 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
903     const ZoneVector<uc16>* name) {
904   DCHECK_NOT_NULL(name);
905   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
906     if (s->capture_name() == nullptr) continue;
907     if (*s->capture_name() == *name) return true;
908   }
909   return false;
910 }
911 
912 // QuantifierPrefix ::
913 //   { DecimalDigits }
914 //   { DecimalDigits , }
915 //   { DecimalDigits , DecimalDigits }
916 //
917 // Returns true if parsing succeeds, and set the min_out and max_out
918 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)919 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
920   DCHECK_EQ(current(), '{');
921   int start = position();
922   Advance();
923   int min = 0;
924   if (!IsDecimalDigit(current())) {
925     Reset(start);
926     return false;
927   }
928   while (IsDecimalDigit(current())) {
929     int next = current() - '0';
930     if (min > (RegExpTree::kInfinity - next) / 10) {
931       // Overflow. Skip past remaining decimal digits and return -1.
932       do {
933         Advance();
934       } while (IsDecimalDigit(current()));
935       min = RegExpTree::kInfinity;
936       break;
937     }
938     min = 10 * min + next;
939     Advance();
940   }
941   int max = 0;
942   if (current() == '}') {
943     max = min;
944     Advance();
945   } else if (current() == ',') {
946     Advance();
947     if (current() == '}') {
948       max = RegExpTree::kInfinity;
949       Advance();
950     } else {
951       while (IsDecimalDigit(current())) {
952         int next = current() - '0';
953         if (max > (RegExpTree::kInfinity - next) / 10) {
954           do {
955             Advance();
956           } while (IsDecimalDigit(current()));
957           max = RegExpTree::kInfinity;
958           break;
959         }
960         max = 10 * max + next;
961         Advance();
962       }
963       if (current() != '}') {
964         Reset(start);
965         return false;
966       }
967       Advance();
968     }
969   } else {
970     Reset(start);
971     return false;
972   }
973   *min_out = min;
974   *max_out = max;
975   return true;
976 }
977 
978 
ParseOctalLiteral()979 uc32 RegExpParser::ParseOctalLiteral() {
980   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
981   // For compatibility with some other browsers (not all), we parse
982   // up to three octal digits with a value below 256.
983   uc32 value = current() - '0';
984   Advance();
985   if ('0' <= current() && current() <= '7') {
986     value = value * 8 + current() - '0';
987     Advance();
988     if (value < 32 && '0' <= current() && current() <= '7') {
989       value = value * 8 + current() - '0';
990       Advance();
991     }
992   }
993   return value;
994 }
995 
996 
ParseHexEscape(int length,uc32 * value)997 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
998   int start = position();
999   uc32 val = 0;
1000   for (int i = 0; i < length; ++i) {
1001     uc32 c = current();
1002     int d = HexValue(c);
1003     if (d < 0) {
1004       Reset(start);
1005       return false;
1006     }
1007     val = val * 16 + d;
1008     Advance();
1009   }
1010   *value = val;
1011   return true;
1012 }
1013 
1014 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1015 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1016   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1017   // allowed). In the latter case, the number of hex digits between { } is
1018   // arbitrary. \ and u have already been read.
1019   if (current() == '{' && unicode()) {
1020     int start = position();
1021     Advance();
1022     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1023       if (current() == '}') {
1024         Advance();
1025         return true;
1026       }
1027     }
1028     Reset(start);
1029     return false;
1030   }
1031   // \u but no {, or \u{...} escapes not allowed.
1032   bool result = ParseHexEscape(4, value);
1033   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1034       current() == '\\') {
1035     // Attempt to read trail surrogate.
1036     int start = position();
1037     if (Next() == 'u') {
1038       Advance(2);
1039       uc32 trail;
1040       if (ParseHexEscape(4, &trail) &&
1041           unibrow::Utf16::IsTrailSurrogate(trail)) {
1042         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1043                                                       static_cast<uc16>(trail));
1044         return true;
1045       }
1046     }
1047     Reset(start);
1048   }
1049   return result;
1050 }
1051 
1052 #ifdef V8_I18N_SUPPORT
1053 
1054 namespace {
1055 
IsExactPropertyAlias(const char * property_name,UProperty property)1056 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1057   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1058   if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1059   for (int i = 0;; i++) {
1060     const char* long_name = u_getPropertyName(
1061         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1062     if (long_name == NULL) break;
1063     if (strcmp(property_name, long_name) == 0) return true;
1064   }
1065   return false;
1066 }
1067 
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1068 bool IsExactPropertyValueAlias(const char* property_value_name,
1069                                UProperty property, int32_t property_value) {
1070   const char* short_name =
1071       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1072   if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1073     return true;
1074   }
1075   for (int i = 0;; i++) {
1076     const char* long_name = u_getPropertyValueName(
1077         property, property_value,
1078         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1079     if (long_name == NULL) break;
1080     if (strcmp(property_value_name, long_name) == 0) return true;
1081   }
1082   return false;
1083 }
1084 
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1085 bool LookupPropertyValueName(UProperty property,
1086                              const char* property_value_name, bool negate,
1087                              ZoneList<CharacterRange>* result, Zone* zone) {
1088   UProperty property_for_lookup = property;
1089   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1090     // For the property Script_Extensions, we have to do the property value
1091     // name lookup as if the property is Script.
1092     property_for_lookup = UCHAR_SCRIPT;
1093   }
1094   int32_t property_value =
1095       u_getPropertyValueEnum(property_for_lookup, property_value_name);
1096   if (property_value == UCHAR_INVALID_CODE) return false;
1097 
1098   // We require the property name to match exactly to one of the property value
1099   // aliases. However, u_getPropertyValueEnum uses loose matching.
1100   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1101                                  property_value)) {
1102     return false;
1103   }
1104 
1105   UErrorCode ec = U_ZERO_ERROR;
1106   icu::UnicodeSet set;
1107   set.applyIntPropertyValue(property, property_value, ec);
1108   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1109 
1110   if (success) {
1111     set.removeAllStrings();
1112     if (negate) set.complement();
1113     for (int i = 0; i < set.getRangeCount(); i++) {
1114       result->Add(
1115           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1116           zone);
1117     }
1118   }
1119   return success;
1120 }
1121 
1122 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1123 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1124   return strncmp(name, literal, N + 1) == 0;
1125 }
1126 
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1127 bool LookupSpecialPropertyValueName(const char* name,
1128                                     ZoneList<CharacterRange>* result,
1129                                     bool negate, Zone* zone) {
1130   if (NameEquals(name, "Any")) {
1131     if (!negate) result->Add(CharacterRange::Everything(), zone);
1132   } else if (NameEquals(name, "ASCII")) {
1133     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1134                        : CharacterRange::Range(0x0, 0x7f),
1135                 zone);
1136   } else if (NameEquals(name, "Assigned")) {
1137     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1138                                    !negate, result, zone);
1139   } else {
1140     return false;
1141   }
1142   return true;
1143 }
1144 
1145 }  // anonymous namespace
1146 
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1147 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1148                                       bool negate) {
1149   // Parse the property class as follows:
1150   // - In \p{name}, 'name' is interpreted
1151   //   - either as a general category property value name.
1152   //   - or as a binary property name.
1153   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1154   //   and 'value' is interpreted as one of the available property value names.
1155   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1156   // - Loose matching is not applied.
1157   List<char> first_part;
1158   List<char> second_part;
1159   if (current() == '{') {
1160     // Parse \p{[PropertyName=]PropertyNameValue}
1161     for (Advance(); current() != '}' && current() != '='; Advance()) {
1162       if (!has_next()) return false;
1163       first_part.Add(static_cast<char>(current()));
1164     }
1165     if (current() == '=') {
1166       for (Advance(); current() != '}'; Advance()) {
1167         if (!has_next()) return false;
1168         second_part.Add(static_cast<char>(current()));
1169       }
1170       second_part.Add(0);  // null-terminate string.
1171     }
1172   } else {
1173     return false;
1174   }
1175   Advance();
1176   first_part.Add(0);  // null-terminate string.
1177 
1178   if (second_part.is_empty()) {
1179     // First attempt to interpret as general category property value name.
1180     const char* name = first_part.ToConstVector().start();
1181     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1182                                 result, zone())) {
1183       return true;
1184     }
1185     // Interpret "Any", "ASCII", and "Assigned".
1186     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1187       return true;
1188     }
1189     // Then attempt to interpret as binary property name with value name 'Y'.
1190     UProperty property = u_getPropertyEnum(name);
1191     if (property < UCHAR_BINARY_START) return false;
1192     if (property >= UCHAR_BINARY_LIMIT) return false;
1193     if (!IsExactPropertyAlias(name, property)) return false;
1194     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1195                                    zone());
1196   } else {
1197     // Both property name and value name are specified. Attempt to interpret
1198     // the property name as enumerated property.
1199     const char* property_name = first_part.ToConstVector().start();
1200     const char* value_name = second_part.ToConstVector().start();
1201     UProperty property = u_getPropertyEnum(property_name);
1202     if (!IsExactPropertyAlias(property_name, property)) return false;
1203     if (property == UCHAR_GENERAL_CATEGORY) {
1204       // We want to allow aggregate value names such as "Letter".
1205       property = UCHAR_GENERAL_CATEGORY_MASK;
1206     } else if (property != UCHAR_SCRIPT &&
1207                property != UCHAR_SCRIPT_EXTENSIONS) {
1208       return false;
1209     }
1210     return LookupPropertyValueName(property, value_name, negate, result,
1211                                    zone());
1212   }
1213 }
1214 
1215 #else  // V8_I18N_SUPPORT
1216 
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1217 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1218                                       bool negate) {
1219   return false;
1220 }
1221 
1222 #endif  // V8_I18N_SUPPORT
1223 
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1224 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1225   uc32 x = 0;
1226   int d = HexValue(current());
1227   if (d < 0) {
1228     return false;
1229   }
1230   while (d >= 0) {
1231     x = x * 16 + d;
1232     if (x > max_value) {
1233       return false;
1234     }
1235     Advance();
1236     d = HexValue(current());
1237   }
1238   *value = x;
1239   return true;
1240 }
1241 
1242 
ParseClassCharacterEscape()1243 uc32 RegExpParser::ParseClassCharacterEscape() {
1244   DCHECK(current() == '\\');
1245   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1246   Advance();
1247   switch (current()) {
1248     case 'b':
1249       Advance();
1250       return '\b';
1251     // ControlEscape :: one of
1252     //   f n r t v
1253     case 'f':
1254       Advance();
1255       return '\f';
1256     case 'n':
1257       Advance();
1258       return '\n';
1259     case 'r':
1260       Advance();
1261       return '\r';
1262     case 't':
1263       Advance();
1264       return '\t';
1265     case 'v':
1266       Advance();
1267       return '\v';
1268     case 'c': {
1269       uc32 controlLetter = Next();
1270       uc32 letter = controlLetter & ~('A' ^ 'a');
1271       // For compatibility with JSC, inside a character class. We also accept
1272       // digits and underscore as control characters, unless with /u.
1273       if (letter >= 'A' && letter <= 'Z') {
1274         Advance(2);
1275         // Control letters mapped to ASCII control characters in the range
1276         // 0x00-0x1f.
1277         return controlLetter & 0x1f;
1278       }
1279       if (unicode()) {
1280         // With /u, invalid escapes are not treated as identity escapes.
1281         ReportError(CStrVector("Invalid class escape"));
1282         return 0;
1283       }
1284       if ((controlLetter >= '0' && controlLetter <= '9') ||
1285           controlLetter == '_') {
1286         Advance(2);
1287         return controlLetter & 0x1f;
1288       }
1289       // We match JSC in reading the backslash as a literal
1290       // character instead of as starting an escape.
1291       return '\\';
1292     }
1293     case '0':
1294       // With /u, \0 is interpreted as NUL if not followed by another digit.
1295       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1296         Advance();
1297         return 0;
1298       }
1299     // Fall through.
1300     case '1':
1301     case '2':
1302     case '3':
1303     case '4':
1304     case '5':
1305     case '6':
1306     case '7':
1307       // For compatibility, we interpret a decimal escape that isn't
1308       // a back reference (and therefore either \0 or not valid according
1309       // to the specification) as a 1..3 digit octal character code.
1310       if (unicode()) {
1311         // With /u, decimal escape is not interpreted as octal character code.
1312         ReportError(CStrVector("Invalid class escape"));
1313         return 0;
1314       }
1315       return ParseOctalLiteral();
1316     case 'x': {
1317       Advance();
1318       uc32 value;
1319       if (ParseHexEscape(2, &value)) return value;
1320       if (unicode()) {
1321         // With /u, invalid escapes are not treated as identity escapes.
1322         ReportError(CStrVector("Invalid escape"));
1323         return 0;
1324       }
1325       // If \x is not followed by a two-digit hexadecimal, treat it
1326       // as an identity escape.
1327       return 'x';
1328     }
1329     case 'u': {
1330       Advance();
1331       uc32 value;
1332       if (ParseUnicodeEscape(&value)) return value;
1333       if (unicode()) {
1334         // With /u, invalid escapes are not treated as identity escapes.
1335         ReportError(CStrVector("Invalid unicode escape"));
1336         return 0;
1337       }
1338       // If \u is not followed by a two-digit hexadecimal, treat it
1339       // as an identity escape.
1340       return 'u';
1341     }
1342     default: {
1343       uc32 result = current();
1344       // With /u, no identity escapes except for syntax characters and '-' are
1345       // allowed. Otherwise, all identity escapes are allowed.
1346       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1347         Advance();
1348         return result;
1349       }
1350       ReportError(CStrVector("Invalid escape"));
1351       return 0;
1352     }
1353   }
1354   return 0;
1355 }
1356 
1357 
ParseClassAtom(uc16 * char_class)1358 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1359   DCHECK_EQ(0, *char_class);
1360   uc32 first = current();
1361   if (first == '\\') {
1362     switch (Next()) {
1363       case 'w':
1364       case 'W':
1365       case 'd':
1366       case 'D':
1367       case 's':
1368       case 'S': {
1369         *char_class = Next();
1370         Advance(2);
1371         return CharacterRange::Singleton(0);  // Return dummy value.
1372       }
1373       case kEndMarker:
1374         return ReportError(CStrVector("\\ at end of pattern"));
1375       default:
1376         first = ParseClassCharacterEscape(CHECK_FAILED);
1377     }
1378   } else {
1379     Advance();
1380   }
1381 
1382   return CharacterRange::Singleton(first);
1383 }
1384 
1385 static const uc16 kNoCharClass = 0;
1386 
1387 // Adds range or pre-defined character class to character ranges.
1388 // If char_class is not kInvalidClass, it's interpreted as a class
1389 // escape (i.e., 's' means whitespace, from '\s').
AddRangeOrEscape(ZoneList<CharacterRange> * ranges,uc16 char_class,CharacterRange range,Zone * zone)1390 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1391                                     uc16 char_class, CharacterRange range,
1392                                     Zone* zone) {
1393   if (char_class != kNoCharClass) {
1394     CharacterRange::AddClassEscape(char_class, ranges, zone);
1395   } else {
1396     ranges->Add(range, zone);
1397   }
1398 }
1399 
ParseClassProperty(ZoneList<CharacterRange> * ranges)1400 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1401   if (!FLAG_harmony_regexp_property) return false;
1402   if (!unicode()) return false;
1403   if (current() != '\\') return false;
1404   uc32 next = Next();
1405   bool parse_success = false;
1406   if (next == 'p') {
1407     Advance(2);
1408     parse_success = ParsePropertyClass(ranges, false);
1409   } else if (next == 'P') {
1410     Advance(2);
1411     parse_success = ParsePropertyClass(ranges, true);
1412   } else {
1413     return false;
1414   }
1415   if (!parse_success)
1416     ReportError(CStrVector("Invalid property name in character class"));
1417   return parse_success;
1418 }
1419 
ParseCharacterClass()1420 RegExpTree* RegExpParser::ParseCharacterClass() {
1421   static const char* kUnterminated = "Unterminated character class";
1422   static const char* kRangeInvalid = "Invalid character class";
1423   static const char* kRangeOutOfOrder = "Range out of order in character class";
1424 
1425   DCHECK_EQ(current(), '[');
1426   Advance();
1427   bool is_negated = false;
1428   if (current() == '^') {
1429     is_negated = true;
1430     Advance();
1431   }
1432   ZoneList<CharacterRange>* ranges =
1433       new (zone()) ZoneList<CharacterRange>(2, zone());
1434   while (has_more() && current() != ']') {
1435     bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1436     if (parsed_property) continue;
1437     uc16 char_class = kNoCharClass;
1438     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1439     if (current() == '-') {
1440       Advance();
1441       if (current() == kEndMarker) {
1442         // If we reach the end we break out of the loop and let the
1443         // following code report an error.
1444         break;
1445       } else if (current() == ']') {
1446         AddRangeOrEscape(ranges, char_class, first, zone());
1447         ranges->Add(CharacterRange::Singleton('-'), zone());
1448         break;
1449       }
1450       uc16 char_class_2 = kNoCharClass;
1451       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1452       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1453         // Either end is an escaped character class. Treat the '-' verbatim.
1454         if (unicode()) {
1455           // ES2015 21.2.2.15.1 step 1.
1456           return ReportError(CStrVector(kRangeInvalid));
1457         }
1458         AddRangeOrEscape(ranges, char_class, first, zone());
1459         ranges->Add(CharacterRange::Singleton('-'), zone());
1460         AddRangeOrEscape(ranges, char_class_2, next, zone());
1461         continue;
1462       }
1463       // ES2015 21.2.2.15.1 step 6.
1464       if (first.from() > next.to()) {
1465         return ReportError(CStrVector(kRangeOutOfOrder));
1466       }
1467       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1468     } else {
1469       AddRangeOrEscape(ranges, char_class, first, zone());
1470     }
1471   }
1472   if (!has_more()) {
1473     return ReportError(CStrVector(kUnterminated));
1474   }
1475   Advance();
1476   if (ranges->length() == 0) {
1477     ranges->Add(CharacterRange::Everything(), zone());
1478     is_negated = !is_negated;
1479   }
1480   return new (zone()) RegExpCharacterClass(ranges, is_negated);
1481 }
1482 
1483 
1484 #undef CHECK_FAILED
1485 
1486 
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1487 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1488                                FlatStringReader* input, JSRegExp::Flags flags,
1489                                RegExpCompileData* result) {
1490   DCHECK(result != NULL);
1491   RegExpParser parser(input, &result->error, flags, isolate, zone);
1492   RegExpTree* tree = parser.ParsePattern();
1493   if (parser.failed()) {
1494     DCHECK(tree == NULL);
1495     DCHECK(!result->error.is_null());
1496   } else {
1497     DCHECK(tree != NULL);
1498     DCHECK(result->error.is_null());
1499     if (FLAG_trace_regexp_parser) {
1500       OFStream os(stdout);
1501       tree->Print(os, zone);
1502       os << "\n";
1503     }
1504     result->tree = tree;
1505     int capture_count = parser.captures_started();
1506     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1507     result->contains_anchor = parser.contains_anchor();
1508     result->capture_name_map = parser.CreateCaptureNameMap();
1509     result->capture_count = capture_count;
1510   }
1511   return !parser.failed();
1512 }
1513 
RegExpBuilder(Zone * zone,bool ignore_case,bool unicode)1514 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1515     : zone_(zone),
1516       pending_empty_(false),
1517       ignore_case_(ignore_case),
1518       unicode_(unicode),
1519       characters_(NULL),
1520       pending_surrogate_(kNoPendingSurrogate),
1521       terms_(),
1522       alternatives_()
1523 #ifdef DEBUG
1524       ,
1525       last_added_(ADD_NONE)
1526 #endif
1527 {
1528 }
1529 
1530 
AddLeadSurrogate(uc16 lead_surrogate)1531 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1532   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1533   FlushPendingSurrogate();
1534   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1535   pending_surrogate_ = lead_surrogate;
1536 }
1537 
1538 
AddTrailSurrogate(uc16 trail_surrogate)1539 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1540   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1541   if (pending_surrogate_ != kNoPendingSurrogate) {
1542     uc16 lead_surrogate = pending_surrogate_;
1543     pending_surrogate_ = kNoPendingSurrogate;
1544     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1545     uc32 combined =
1546         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1547     if (NeedsDesugaringForIgnoreCase(combined)) {
1548       AddCharacterClassForDesugaring(combined);
1549     } else {
1550       ZoneList<uc16> surrogate_pair(2, zone());
1551       surrogate_pair.Add(lead_surrogate, zone());
1552       surrogate_pair.Add(trail_surrogate, zone());
1553       RegExpAtom* atom =
1554           new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1555       AddAtom(atom);
1556     }
1557   } else {
1558     pending_surrogate_ = trail_surrogate;
1559     FlushPendingSurrogate();
1560   }
1561 }
1562 
1563 
FlushPendingSurrogate()1564 void RegExpBuilder::FlushPendingSurrogate() {
1565   if (pending_surrogate_ != kNoPendingSurrogate) {
1566     DCHECK(unicode());
1567     uc32 c = pending_surrogate_;
1568     pending_surrogate_ = kNoPendingSurrogate;
1569     AddCharacterClassForDesugaring(c);
1570   }
1571 }
1572 
1573 
FlushCharacters()1574 void RegExpBuilder::FlushCharacters() {
1575   FlushPendingSurrogate();
1576   pending_empty_ = false;
1577   if (characters_ != NULL) {
1578     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1579     characters_ = NULL;
1580     text_.Add(atom, zone());
1581     LAST(ADD_ATOM);
1582   }
1583 }
1584 
1585 
FlushText()1586 void RegExpBuilder::FlushText() {
1587   FlushCharacters();
1588   int num_text = text_.length();
1589   if (num_text == 0) {
1590     return;
1591   } else if (num_text == 1) {
1592     terms_.Add(text_.last(), zone());
1593   } else {
1594     RegExpText* text = new (zone()) RegExpText(zone());
1595     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1596     terms_.Add(text, zone());
1597   }
1598   text_.Clear();
1599 }
1600 
1601 
AddCharacter(uc16 c)1602 void RegExpBuilder::AddCharacter(uc16 c) {
1603   FlushPendingSurrogate();
1604   pending_empty_ = false;
1605   if (NeedsDesugaringForIgnoreCase(c)) {
1606     AddCharacterClassForDesugaring(c);
1607   } else {
1608     if (characters_ == NULL) {
1609       characters_ = new (zone()) ZoneList<uc16>(4, zone());
1610     }
1611     characters_->Add(c, zone());
1612     LAST(ADD_CHAR);
1613   }
1614 }
1615 
1616 
AddUnicodeCharacter(uc32 c)1617 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1618   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1619     DCHECK(unicode());
1620     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1621     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1622   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1623     AddLeadSurrogate(c);
1624   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1625     AddTrailSurrogate(c);
1626   } else {
1627     AddCharacter(static_cast<uc16>(c));
1628   }
1629 }
1630 
AddEscapedUnicodeCharacter(uc32 character)1631 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1632   // A lead or trail surrogate parsed via escape sequence will not
1633   // pair up with any preceding lead or following trail surrogate.
1634   FlushPendingSurrogate();
1635   AddUnicodeCharacter(character);
1636   FlushPendingSurrogate();
1637 }
1638 
AddEmpty()1639 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1640 
1641 
AddCharacterClass(RegExpCharacterClass * cc)1642 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1643   if (NeedsDesugaringForUnicode(cc)) {
1644     // With /u, character class needs to be desugared, so it
1645     // must be a standalone term instead of being part of a RegExpText.
1646     AddTerm(cc);
1647   } else {
1648     AddAtom(cc);
1649   }
1650 }
1651 
AddCharacterClassForDesugaring(uc32 c)1652 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1653   AddTerm(new (zone()) RegExpCharacterClass(
1654       CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1655 }
1656 
1657 
AddAtom(RegExpTree * term)1658 void RegExpBuilder::AddAtom(RegExpTree* term) {
1659   if (term->IsEmpty()) {
1660     AddEmpty();
1661     return;
1662   }
1663   if (term->IsTextElement()) {
1664     FlushCharacters();
1665     text_.Add(term, zone());
1666   } else {
1667     FlushText();
1668     terms_.Add(term, zone());
1669   }
1670   LAST(ADD_ATOM);
1671 }
1672 
1673 
AddTerm(RegExpTree * term)1674 void RegExpBuilder::AddTerm(RegExpTree* term) {
1675   FlushText();
1676   terms_.Add(term, zone());
1677   LAST(ADD_ATOM);
1678 }
1679 
1680 
AddAssertion(RegExpTree * assert)1681 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1682   FlushText();
1683   terms_.Add(assert, zone());
1684   LAST(ADD_ASSERT);
1685 }
1686 
1687 
NewAlternative()1688 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1689 
1690 
FlushTerms()1691 void RegExpBuilder::FlushTerms() {
1692   FlushText();
1693   int num_terms = terms_.length();
1694   RegExpTree* alternative;
1695   if (num_terms == 0) {
1696     alternative = new (zone()) RegExpEmpty();
1697   } else if (num_terms == 1) {
1698     alternative = terms_.last();
1699   } else {
1700     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1701   }
1702   alternatives_.Add(alternative, zone());
1703   terms_.Clear();
1704   LAST(ADD_NONE);
1705 }
1706 
1707 
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)1708 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1709   if (!unicode()) return false;
1710   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1711   // necessarily mean that we need to desugar. It's probably nicer to have a
1712   // separate pass to figure out unicode desugarings.
1713   if (ignore_case()) return true;
1714   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1715   CharacterRange::Canonicalize(ranges);
1716   for (int i = ranges->length() - 1; i >= 0; i--) {
1717     uc32 from = ranges->at(i).from();
1718     uc32 to = ranges->at(i).to();
1719     // Check for non-BMP characters.
1720     if (to >= kNonBmpStart) return true;
1721     // Check for lone surrogates.
1722     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1723   }
1724   return false;
1725 }
1726 
1727 
NeedsDesugaringForIgnoreCase(uc32 c)1728 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1729 #ifdef V8_I18N_SUPPORT
1730   if (unicode() && ignore_case()) {
1731     icu::UnicodeSet set(c, c);
1732     set.closeOver(USET_CASE_INSENSITIVE);
1733     set.removeAllStrings();
1734     return set.size() > 1;
1735   }
1736   // In the case where ICU is not included, we act as if the unicode flag is
1737   // not set, and do not desugar.
1738 #endif  // V8_I18N_SUPPORT
1739   return false;
1740 }
1741 
1742 
ToRegExp()1743 RegExpTree* RegExpBuilder::ToRegExp() {
1744   FlushTerms();
1745   int num_alternatives = alternatives_.length();
1746   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1747   if (num_alternatives == 1) return alternatives_.last();
1748   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1749 }
1750 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1751 bool RegExpBuilder::AddQuantifierToAtom(
1752     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1753   FlushPendingSurrogate();
1754   if (pending_empty_) {
1755     pending_empty_ = false;
1756     return true;
1757   }
1758   RegExpTree* atom;
1759   if (characters_ != NULL) {
1760     DCHECK(last_added_ == ADD_CHAR);
1761     // Last atom was character.
1762     Vector<const uc16> char_vector = characters_->ToConstVector();
1763     int num_chars = char_vector.length();
1764     if (num_chars > 1) {
1765       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1766       text_.Add(new (zone()) RegExpAtom(prefix), zone());
1767       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1768     }
1769     characters_ = NULL;
1770     atom = new (zone()) RegExpAtom(char_vector);
1771     FlushText();
1772   } else if (text_.length() > 0) {
1773     DCHECK(last_added_ == ADD_ATOM);
1774     atom = text_.RemoveLast();
1775     FlushText();
1776   } else if (terms_.length() > 0) {
1777     DCHECK(last_added_ == ADD_ATOM);
1778     atom = terms_.RemoveLast();
1779     // With /u, lookarounds are not quantifiable.
1780     if (unicode() && atom->IsLookaround()) return false;
1781     if (atom->max_match() == 0) {
1782       // Guaranteed to only match an empty string.
1783       LAST(ADD_TERM);
1784       if (min == 0) {
1785         return true;
1786       }
1787       terms_.Add(atom, zone());
1788       return true;
1789     }
1790   } else {
1791     // Only call immediately after adding an atom or character!
1792     UNREACHABLE();
1793     return false;
1794   }
1795   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1796              zone());
1797   LAST(ADD_TERM);
1798   return true;
1799 }
1800 
1801 }  // namespace internal
1802 }  // namespace v8
1803