• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include "src/base/small-vector.h"
8 #include "src/execution/isolate.h"
9 #include "src/objects/string-inl.h"
10 #include "src/regexp/property-sequences.h"
11 #include "src/regexp/regexp-ast.h"
12 #include "src/regexp/regexp-macro-assembler.h"
13 #include "src/regexp/regexp.h"
14 #include "src/strings/char-predicates-inl.h"
15 #include "src/utils/ostreams.h"
16 #include "src/utils/utils.h"
17 #include "src/zone/zone-allocator.h"
18 #include "src/zone/zone-list-inl.h"
19 
20 #ifdef V8_INTL_SUPPORT
21 #include "unicode/uniset.h"
22 #endif  // V8_INTL_SUPPORT
23 
24 namespace v8 {
25 namespace internal {
26 
27 namespace {
28 
29 // Whether we're currently inside the ClassEscape production
30 // (tc39.es/ecma262/#prod-annexB-CharacterEscape).
31 enum class InClassEscapeState {
32   kInClass,
33   kNotInClass,
34 };
35 
36 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
37 class RegExpBuilder {
38  public:
RegExpBuilder(Zone * zone,RegExpFlags flags)39   RegExpBuilder(Zone* zone, RegExpFlags flags)
40       : zone_(zone),
41         flags_(flags),
42         terms_(ZoneAllocator<RegExpTree*>{zone}),
43         text_(ZoneAllocator<RegExpTree*>{zone}),
44         alternatives_(ZoneAllocator<RegExpTree*>{zone}) {}
45   void AddCharacter(base::uc16 character);
46   void AddUnicodeCharacter(base::uc32 character);
47   void AddEscapedUnicodeCharacter(base::uc32 character);
48   // "Adds" an empty expression. Does nothing except consume a
49   // following quantifier
50   void AddEmpty();
51   void AddCharacterClass(RegExpCharacterClass* cc);
52   void AddCharacterClassForDesugaring(base::uc32 c);
53   void AddAtom(RegExpTree* tree);
54   void AddTerm(RegExpTree* tree);
55   void AddAssertion(RegExpTree* tree);
56   void NewAlternative();  // '|'
57   bool AddQuantifierToAtom(int min, int max,
58                            RegExpQuantifier::QuantifierType type);
59   void FlushText();
60   RegExpTree* ToRegExp();
flags() const61   RegExpFlags flags() const { return flags_; }
62 
ignore_case() const63   bool ignore_case() const { return IsIgnoreCase(flags_); }
multiline() const64   bool multiline() const { return IsMultiline(flags_); }
dotall() const65   bool dotall() const { return IsDotAll(flags_); }
66 
67  private:
68   static const base::uc16 kNoPendingSurrogate = 0;
69   void AddLeadSurrogate(base::uc16 lead_surrogate);
70   void AddTrailSurrogate(base::uc16 trail_surrogate);
71   void FlushPendingSurrogate();
72   void FlushCharacters();
73   void FlushTerms();
74   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
75   bool NeedsDesugaringForIgnoreCase(base::uc32 c);
zone() const76   Zone* zone() const { return zone_; }
unicode() const77   bool unicode() const { return IsUnicode(flags_); }
78 
79   Zone* const zone_;
80   bool pending_empty_ = false;
81   const RegExpFlags flags_;
82   ZoneList<base::uc16>* characters_ = nullptr;
83   base::uc16 pending_surrogate_ = kNoPendingSurrogate;
84 
85   using SmallRegExpTreeVector =
86       base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>;
87   SmallRegExpTreeVector terms_;
88   SmallRegExpTreeVector text_;
89   SmallRegExpTreeVector alternatives_;
90 #ifdef DEBUG
91   enum {
92     ADD_NONE,
93     ADD_CHAR,
94     ADD_TERM,
95     ADD_ASSERT,
96     ADD_ATOM
97   } last_added_ = ADD_NONE;
98 #define LAST(x) last_added_ = x;
99 #else
100 #define LAST(x)
101 #endif
102 };
103 
104 enum SubexpressionType {
105   INITIAL,
106   CAPTURE,  // All positive values represent captures.
107   POSITIVE_LOOKAROUND,
108   NEGATIVE_LOOKAROUND,
109   GROUPING
110 };
111 
112 class RegExpParserState : public ZoneObject {
113  public:
114   // Push a state on the stack.
RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<base::uc16> * capture_name,RegExpFlags flags,Zone * zone)115   RegExpParserState(RegExpParserState* previous_state,
116                     SubexpressionType group_type,
117                     RegExpLookaround::Type lookaround_type,
118                     int disjunction_capture_index,
119                     const ZoneVector<base::uc16>* capture_name,
120                     RegExpFlags flags, Zone* zone)
121       : previous_state_(previous_state),
122         builder_(zone, flags),
123         group_type_(group_type),
124         lookaround_type_(lookaround_type),
125         disjunction_capture_index_(disjunction_capture_index),
126         capture_name_(capture_name) {}
127   // Parser state of containing expression, if any.
previous_state() const128   RegExpParserState* previous_state() const { return previous_state_; }
IsSubexpression()129   bool IsSubexpression() { return previous_state_ != nullptr; }
130   // RegExpBuilder building this regexp's AST.
builder()131   RegExpBuilder* builder() { return &builder_; }
132   // Type of regexp being parsed (parenthesized group or entire regexp).
group_type() const133   SubexpressionType group_type() const { return group_type_; }
134   // Lookahead or Lookbehind.
lookaround_type() const135   RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
136   // Index in captures array of first capture in this sub-expression, if any.
137   // Also the capture index of this sub-expression itself, if group_type
138   // is CAPTURE.
capture_index() const139   int capture_index() const { return disjunction_capture_index_; }
140   // The name of the current sub-expression, if group_type is CAPTURE. Only
141   // used for named captures.
capture_name() const142   const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
143 
IsNamedCapture() const144   bool IsNamedCapture() const { return capture_name_ != nullptr; }
145 
146   // Check whether the parser is inside a capture group with the given index.
IsInsideCaptureGroup(int index) const147   bool IsInsideCaptureGroup(int index) const {
148     for (const RegExpParserState* s = this; s != nullptr;
149          s = s->previous_state()) {
150       if (s->group_type() != CAPTURE) continue;
151       // Return true if we found the matching capture index.
152       if (index == s->capture_index()) return true;
153       // Abort if index is larger than what has been parsed up till this state.
154       if (index > s->capture_index()) return false;
155     }
156     return false;
157   }
158 
159   // Check whether the parser is inside a capture group with the given name.
IsInsideCaptureGroup(const ZoneVector<base::uc16> * name) const160   bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
161     DCHECK_NOT_NULL(name);
162     for (const RegExpParserState* s = this; s != nullptr;
163          s = s->previous_state()) {
164       if (s->capture_name() == nullptr) continue;
165       if (*s->capture_name() == *name) return true;
166     }
167     return false;
168   }
169 
170  private:
171   // Linked list implementation of stack of states.
172   RegExpParserState* const previous_state_;
173   // Builder for the stored disjunction.
174   RegExpBuilder builder_;
175   // Stored disjunction type (capture, look-ahead or grouping), if any.
176   const SubexpressionType group_type_;
177   // Stored read direction.
178   const RegExpLookaround::Type lookaround_type_;
179   // Stored disjunction's capture index (if any).
180   const int disjunction_capture_index_;
181   // Stored capture name (if any).
182   const ZoneVector<base::uc16>* const capture_name_;
183 };
184 
185 template <class CharT>
186 class RegExpParserImpl final {
187  private:
188   RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
189                    uintptr_t stack_limit, Zone* zone,
190                    const DisallowGarbageCollection& no_gc);
191 
192   bool Parse(RegExpCompileData* result);
193 
194   RegExpTree* ParsePattern();
195   RegExpTree* ParseDisjunction();
196   RegExpTree* ParseGroup();
197 
198   // Parses a {...,...} quantifier and stores the range in the given
199   // out parameters.
200   bool ParseIntervalQuantifier(int* min_out, int* max_out);
201 
202   // Checks whether the following is a length-digit hexadecimal number,
203   // and sets the value if it is.
204   bool ParseHexEscape(int length, base::uc32* value);
205   bool ParseUnicodeEscape(base::uc32* value);
206   bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
207 
208   bool ParsePropertyClassName(ZoneVector<char>* name_1,
209                               ZoneVector<char>* name_2);
210   bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
211                              const ZoneVector<char>& name_1,
212                              const ZoneVector<char>& name_2);
213 
214   RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
215 
216   base::uc32 ParseOctalLiteral();
217 
218   // Tries to parse the input as a back reference.  If successful it
219   // stores the result in the output parameter and returns true.  If
220   // it fails it will push back the characters read so the same characters
221   // can be reparsed.
222   bool ParseBackReferenceIndex(int* index_out);
223 
224   // Parse inside a class. Either add escaped class to the range, or return
225   // false and pass parsed single character through |char_out|.
226   void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
227                         bool add_unicode_case_equivalents, base::uc32* char_out,
228                         bool* is_class_escape);
229   // Returns true iff parsing was successful.
230   bool TryParseCharacterClassEscape(base::uc32 next,
231                                     InClassEscapeState in_class_escape_state,
232                                     ZoneList<CharacterRange>* ranges,
233                                     Zone* zone,
234                                     bool add_unicode_case_equivalents);
235   // Parses and returns a single escaped character.
236   base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
237                                   bool* is_escaped_unicode_character);
238 
239   RegExpTree* ReportError(RegExpError error);
240   void Advance();
241   void Advance(int dist);
242   void RewindByOneCodepoint();  // Rewinds to before the previous Advance().
243   void Reset(int pos);
244 
245   // Reports whether the pattern might be used as a literal search string.
246   // Only use if the result of the parse is a single atom node.
simple() const247   bool simple() const { return simple_; }
contains_anchor() const248   bool contains_anchor() const { return contains_anchor_; }
set_contains_anchor()249   void set_contains_anchor() { contains_anchor_ = true; }
captures_started() const250   int captures_started() const { return captures_started_; }
position() const251   int position() const { return next_pos_ - 1; }
failed() const252   bool failed() const { return failed_; }
unicode() const253   bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; }
254 
255   static bool IsSyntaxCharacterOrSlash(base::uc32 c);
256 
257   static const base::uc32 kEndMarker = (1 << 21);
258 
259  private:
260   // Return the 1-indexed RegExpCapture object, allocate if necessary.
261   RegExpCapture* GetCapture(int index);
262 
263   // Creates a new named capture at the specified index. Must be called exactly
264   // once for each named capture. Fails if a capture with the same name is
265   // encountered.
266   bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index);
267 
268   // Parses the name of a capture group (?<name>pattern). The name must adhere
269   // to IdentifierName in the ECMAScript standard.
270   const ZoneVector<base::uc16>* ParseCaptureGroupName();
271 
272   bool ParseNamedBackReference(RegExpBuilder* builder,
273                                RegExpParserState* state);
274   RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
275 
276   // After the initial parsing pass, patch corresponding RegExpCapture objects
277   // into all RegExpBackReferences. This is done after initial parsing in order
278   // to avoid complicating cases in which references comes before the capture.
279   void PatchNamedBackReferences();
280 
281   ZoneVector<RegExpCapture*>* GetNamedCaptures() const;
282 
283   // Returns true iff the pattern contains named captures. May call
284   // ScanForCaptures to look ahead at the remaining pattern.
285   bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
286 
zone() const287   Zone* zone() const { return zone_; }
288 
current() const289   base::uc32 current() const { return current_; }
has_more() const290   bool has_more() const { return has_more_; }
has_next() const291   bool has_next() const { return next_pos_ < input_length(); }
292   base::uc32 Next();
293   template <bool update_position>
294   base::uc32 ReadNext();
InputAt(int index) const295   CharT InputAt(int index) const {
296     DCHECK(0 <= index && index < input_length());
297     return input_[index];
298   }
input_length() const299   int input_length() const { return input_length_; }
300   void ScanForCaptures(InClassEscapeState in_class_escape_state);
301 
302   struct RegExpCaptureNameLess {
operator ()v8::internal::__anonfae8d4fb0111::RegExpParserImpl::RegExpCaptureNameLess303     bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
304       DCHECK_NOT_NULL(lhs);
305       DCHECK_NOT_NULL(rhs);
306       return *lhs->name() < *rhs->name();
307     }
308   };
309 
310   class ForceUnicodeScope final {
311    public:
ForceUnicodeScope(RegExpParserImpl<CharT> * parser)312     explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
313         : parser_(parser) {
314       DCHECK(!parser_->force_unicode_);
315       parser_->force_unicode_ = true;
316     }
~ForceUnicodeScope()317     ~ForceUnicodeScope() {
318       DCHECK(parser_->force_unicode_);
319       parser_->force_unicode_ = false;
320     }
321 
322    private:
323     RegExpParserImpl<CharT>* const parser_;
324   };
325 
326   const DisallowGarbageCollection no_gc_;
327   Zone* const zone_;
328   RegExpError error_ = RegExpError::kNone;
329   int error_pos_ = 0;
330   ZoneList<RegExpCapture*>* captures_;
331   ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
332   ZoneList<RegExpBackReference*>* named_back_references_;
333   const CharT* const input_;
334   const int input_length_;
335   base::uc32 current_;
336   const RegExpFlags top_level_flags_;
337   bool force_unicode_ = false;  // Force parser to act as if unicode were set.
338   int next_pos_;
339   int captures_started_;
340   int capture_count_;  // Only valid after we have scanned for captures.
341   bool has_more_;
342   bool simple_;
343   bool contains_anchor_;
344   bool is_scanned_for_captures_;
345   bool has_named_captures_;  // Only valid after we have scanned for captures.
346   bool failed_;
347   const uintptr_t stack_limit_;
348 
349   friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*,
350                                                       Handle<String>,
351                                                       RegExpFlags,
352                                                       RegExpCompileData*);
353   friend bool RegExpParser::VerifyRegExpSyntax<CharT>(
354       Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*,
355       const DisallowGarbageCollection&);
356 };
357 
358 template <class CharT>
RegExpParserImpl(const CharT * input,int input_length,RegExpFlags flags,uintptr_t stack_limit,Zone * zone,const DisallowGarbageCollection & no_gc)359 RegExpParserImpl<CharT>::RegExpParserImpl(
360     const CharT* input, int input_length, RegExpFlags flags,
361     uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
362     : zone_(zone),
363       captures_(nullptr),
364       named_captures_(nullptr),
365       named_back_references_(nullptr),
366       input_(input),
367       input_length_(input_length),
368       current_(kEndMarker),
369       top_level_flags_(flags),
370       next_pos_(0),
371       captures_started_(0),
372       capture_count_(0),
373       has_more_(true),
374       simple_(false),
375       contains_anchor_(false),
376       is_scanned_for_captures_(false),
377       has_named_captures_(false),
378       failed_(false),
379       stack_limit_(stack_limit) {
380   Advance();
381 }
382 
383 template <>
384 template <bool update_position>
ReadNext()385 inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
386   int position = next_pos_;
387   base::uc16 c0 = InputAt(position);
388   position++;
389   DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));
390   if (update_position) next_pos_ = position;
391   return c0;
392 }
393 
394 template <>
395 template <bool update_position>
ReadNext()396 inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
397   int position = next_pos_;
398   base::uc16 c0 = InputAt(position);
399   base::uc32 result = c0;
400   position++;
401   // Read the whole surrogate pair in case of unicode flag, if possible.
402   if (unicode() && position < input_length() &&
403       unibrow::Utf16::IsLeadSurrogate(c0)) {
404     base::uc16 c1 = InputAt(position);
405     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
406       result = unibrow::Utf16::CombineSurrogatePair(c0, c1);
407       position++;
408     }
409   }
410   if (update_position) next_pos_ = position;
411   return result;
412 }
413 
414 template <class CharT>
Next()415 base::uc32 RegExpParserImpl<CharT>::Next() {
416   if (has_next()) {
417     return ReadNext<false>();
418   } else {
419     return kEndMarker;
420   }
421 }
422 
423 template <class CharT>
Advance()424 void RegExpParserImpl<CharT>::Advance() {
425   if (has_next()) {
426     if (GetCurrentStackPosition() < stack_limit_) {
427       if (FLAG_correctness_fuzzer_suppressions) {
428         FATAL("Aborting on stack overflow");
429       }
430       ReportError(RegExpError::kStackOverflow);
431     } else {
432       current_ = ReadNext<true>();
433     }
434   } else {
435     current_ = kEndMarker;
436     // Advance so that position() points to 1-after-the-last-character. This is
437     // important so that Reset() to this position works correctly.
438     next_pos_ = input_length() + 1;
439     has_more_ = false;
440   }
441 }
442 
443 template <class CharT>
RewindByOneCodepoint()444 void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
445   if (current() == kEndMarker) return;
446   // Rewinds by one code point, i.e.: two code units if `current` is outside
447   // the basic multilingual plane (= composed of a lead and trail surrogate),
448   // or one code unit otherwise.
449   const int rewind_by =
450       current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;
451   Advance(rewind_by);  // Undo the last Advance.
452 }
453 
454 template <class CharT>
Reset(int pos)455 void RegExpParserImpl<CharT>::Reset(int pos) {
456   next_pos_ = pos;
457   has_more_ = (pos < input_length());
458   Advance();
459 }
460 
461 template <class CharT>
Advance(int dist)462 void RegExpParserImpl<CharT>::Advance(int dist) {
463   next_pos_ += dist - 1;
464   Advance();
465 }
466 
467 template <class CharT>
IsSyntaxCharacterOrSlash(base::uc32 c)468 bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
469   switch (c) {
470     case '^':
471     case '$':
472     case '\\':
473     case '.':
474     case '*':
475     case '+':
476     case '?':
477     case '(':
478     case ')':
479     case '[':
480     case ']':
481     case '{':
482     case '}':
483     case '|':
484     case '/':
485       return true;
486     default:
487       break;
488   }
489   return false;
490 }
491 
492 template <class CharT>
ReportError(RegExpError error)493 RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
494   if (failed_) return nullptr;  // Do not overwrite any existing error.
495   failed_ = true;
496   error_ = error;
497   error_pos_ = position();
498   // Zip to the end to make sure no more input is read.
499   current_ = kEndMarker;
500   next_pos_ = input_length();
501   return nullptr;
502 }
503 
504 #define CHECK_FAILED /**/);    \
505   if (failed_) return nullptr; \
506   ((void)0
507 
508 // Pattern ::
509 //   Disjunction
510 template <class CharT>
ParsePattern()511 RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
512   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
513   PatchNamedBackReferences(CHECK_FAILED);
514   DCHECK(!has_more());
515   // If the result of parsing is a literal string atom, and it has the
516   // same length as the input, then the atom is identical to the input.
517   if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
518     simple_ = true;
519   }
520   return result;
521 }
522 
523 // Disjunction ::
524 //   Alternative
525 //   Alternative | Disjunction
526 // Alternative ::
527 //   [empty]
528 //   Term Alternative
529 // Term ::
530 //   Assertion
531 //   Atom
532 //   Atom Quantifier
533 template <class CharT>
ParseDisjunction()534 RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
535   // Used to store current state while parsing subexpressions.
536   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
537                                   0, nullptr, top_level_flags_, zone());
538   RegExpParserState* state = &initial_state;
539   // Cache the builder in a local variable for quick access.
540   RegExpBuilder* builder = initial_state.builder();
541   while (true) {
542     switch (current()) {
543       case kEndMarker:
544         if (failed()) return nullptr;  // E.g. the initial Advance failed.
545         if (state->IsSubexpression()) {
546           // Inside a parenthesized group when hitting end of input.
547           return ReportError(RegExpError::kUnterminatedGroup);
548         }
549         DCHECK_EQ(INITIAL, state->group_type());
550         // Parsing completed successfully.
551         return builder->ToRegExp();
552       case ')': {
553         if (!state->IsSubexpression()) {
554           return ReportError(RegExpError::kUnmatchedParen);
555         }
556         DCHECK_NE(INITIAL, state->group_type());
557 
558         Advance();
559         // End disjunction parsing and convert builder content to new single
560         // regexp atom.
561         RegExpTree* body = builder->ToRegExp();
562 
563         int end_capture_index = captures_started();
564 
565         int capture_index = state->capture_index();
566         SubexpressionType group_type = state->group_type();
567 
568         // Build result of subexpression.
569         if (group_type == CAPTURE) {
570           if (state->IsNamedCapture()) {
571             CreateNamedCaptureAtIndex(state->capture_name(),
572                                       capture_index CHECK_FAILED);
573           }
574           RegExpCapture* capture = GetCapture(capture_index);
575           capture->set_body(body);
576           body = capture;
577         } else if (group_type == GROUPING) {
578           body = zone()->template New<RegExpGroup>(body);
579         } else {
580           DCHECK(group_type == POSITIVE_LOOKAROUND ||
581                  group_type == NEGATIVE_LOOKAROUND);
582           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
583           body = zone()->template New<RegExpLookaround>(
584               body, is_positive, end_capture_index - capture_index,
585               capture_index, state->lookaround_type());
586         }
587 
588         // Restore previous state.
589         state = state->previous_state();
590         builder = state->builder();
591 
592         builder->AddAtom(body);
593         // For compatibility with JSC and ES3, we allow quantifiers after
594         // lookaheads, and break in all cases.
595         break;
596       }
597       case '|': {
598         Advance();
599         builder->NewAlternative();
600         continue;
601       }
602       case '*':
603       case '+':
604       case '?':
605         return ReportError(RegExpError::kNothingToRepeat);
606       case '^': {
607         Advance();
608         builder->AddAssertion(zone()->template New<RegExpAssertion>(
609             builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
610                                  : RegExpAssertion::Type::START_OF_INPUT));
611         set_contains_anchor();
612         continue;
613       }
614       case '$': {
615         Advance();
616         RegExpAssertion::Type assertion_type =
617             builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
618                                  : RegExpAssertion::Type::END_OF_INPUT;
619         builder->AddAssertion(
620             zone()->template New<RegExpAssertion>(assertion_type));
621         continue;
622       }
623       case '.': {
624         Advance();
625         ZoneList<CharacterRange>* ranges =
626             zone()->template New<ZoneList<CharacterRange>>(2, zone());
627 
628         if (builder->dotall()) {
629           // Everything.
630           CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,
631                                          ranges, false, zone());
632         } else {
633           // Everything except \x0A, \x0D, \u2028 and \u2029.
634           CharacterRange::AddClassEscape(
635               StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
636         }
637 
638         RegExpCharacterClass* cc =
639             zone()->template New<RegExpCharacterClass>(zone(), ranges);
640         builder->AddCharacterClass(cc);
641         break;
642       }
643       case '(': {
644         state = ParseOpenParenthesis(state CHECK_FAILED);
645         builder = state->builder();
646         continue;
647       }
648       case '[': {
649         RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
650         builder->AddCharacterClass(cc->AsCharacterClass());
651         break;
652       }
653       // Atom ::
654       //   \ AtomEscape
655       case '\\':
656         switch (Next()) {
657           case kEndMarker:
658             return ReportError(RegExpError::kEscapeAtEndOfPattern);
659           // AtomEscape ::
660           //   [+UnicodeMode] DecimalEscape
661           //   [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
662           //                  of DecimalEscape is ≤ NcapturingParens
663           //   CharacterEscape (some cases of this mixed in too)
664           //
665           // TODO(jgruber): It may make sense to disentangle all the different
666           // cases and make the structure mirror the spec, e.g. for AtomEscape:
667           //
668           //  if (TryParseDecimalEscape(...)) return;
669           //  if (TryParseCharacterClassEscape(...)) return;
670           //  if (TryParseCharacterEscape(...)) return;
671           //  if (TryParseGroupName(...)) return;
672           case '1':
673           case '2':
674           case '3':
675           case '4':
676           case '5':
677           case '6':
678           case '7':
679           case '8':
680           case '9': {
681             int index = 0;
682             const bool is_backref =
683                 ParseBackReferenceIndex(&index CHECK_FAILED);
684             if (is_backref) {
685               if (state->IsInsideCaptureGroup(index)) {
686                 // The back reference is inside the capture group it refers to.
687                 // Nothing can possibly have been captured yet, so we use empty
688                 // instead. This ensures that, when checking a back reference,
689                 // the capture registers of the referenced capture are either
690                 // both set or both cleared.
691                 builder->AddEmpty();
692               } else {
693                 RegExpCapture* capture = GetCapture(index);
694                 RegExpTree* atom = zone()->template New<RegExpBackReference>(
695                     capture, builder->flags());
696                 builder->AddAtom(atom);
697               }
698               break;
699             }
700             // With /u, no identity escapes except for syntax characters
701             // are allowed. Otherwise, all identity escapes are allowed.
702             if (unicode()) {
703               return ReportError(RegExpError::kInvalidEscape);
704             }
705             base::uc32 first_digit = Next();
706             if (first_digit == '8' || first_digit == '9') {
707               builder->AddCharacter(first_digit);
708               Advance(2);
709               break;
710             }
711             V8_FALLTHROUGH;
712           }
713           case '0': {
714             Advance();
715             if (unicode() && Next() >= '0' && Next() <= '9') {
716               // With /u, decimal escape with leading 0 are not parsed as octal.
717               return ReportError(RegExpError::kInvalidDecimalEscape);
718             }
719             base::uc32 octal = ParseOctalLiteral();
720             builder->AddCharacter(octal);
721             break;
722           }
723           case 'b':
724             Advance(2);
725             builder->AddAssertion(zone()->template New<RegExpAssertion>(
726                 RegExpAssertion::Type::BOUNDARY));
727             continue;
728           case 'B':
729             Advance(2);
730             builder->AddAssertion(zone()->template New<RegExpAssertion>(
731                 RegExpAssertion::Type::NON_BOUNDARY));
732             continue;
733           // AtomEscape ::
734           //   CharacterClassEscape
735           case 'd':
736           case 'D':
737           case 's':
738           case 'S':
739           case 'w':
740           case 'W':
741           case 'p':
742           case 'P': {
743             base::uc32 next = Next();
744             ZoneList<CharacterRange>* ranges =
745                 zone()->template New<ZoneList<CharacterRange>>(2, zone());
746             bool add_unicode_case_equivalents =
747                 unicode() && builder->ignore_case();
748             bool parsed_character_class_escape = TryParseCharacterClassEscape(
749                 next, InClassEscapeState::kNotInClass, ranges, zone(),
750                 add_unicode_case_equivalents CHECK_FAILED);
751 
752             if (parsed_character_class_escape) {
753               RegExpCharacterClass* cc =
754                   zone()->template New<RegExpCharacterClass>(zone(), ranges);
755               builder->AddCharacterClass(cc);
756             } else {
757               CHECK(!unicode());
758               Advance(2);
759               builder->AddCharacter(next);  // IdentityEscape.
760             }
761             break;
762           }
763           // AtomEscape ::
764           //   k GroupName
765           case 'k': {
766             // Either an identity escape or a named back-reference.  The two
767             // interpretations are mutually exclusive: '\k' is interpreted as
768             // an identity escape for non-Unicode patterns without named
769             // capture groups, and as the beginning of a named back-reference
770             // in all other cases.
771             const bool has_named_captures =
772                 HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
773             if (unicode() || has_named_captures) {
774               Advance(2);
775               ParseNamedBackReference(builder, state CHECK_FAILED);
776               break;
777             }
778           }
779             V8_FALLTHROUGH;
780           // AtomEscape ::
781           //   CharacterEscape
782           default: {
783             bool is_escaped_unicode_character = false;
784             base::uc32 c = ParseCharacterEscape(
785                 InClassEscapeState::kNotInClass,
786                 &is_escaped_unicode_character CHECK_FAILED);
787             if (is_escaped_unicode_character) {
788               builder->AddEscapedUnicodeCharacter(c);
789             } else {
790               builder->AddCharacter(c);
791             }
792             break;
793           }
794         }
795         break;
796       case '{': {
797         int dummy;
798         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
799         if (parsed) return ReportError(RegExpError::kNothingToRepeat);
800         V8_FALLTHROUGH;
801       }
802       case '}':
803       case ']':
804         if (unicode()) {
805           return ReportError(RegExpError::kLoneQuantifierBrackets);
806         }
807         V8_FALLTHROUGH;
808       default:
809         builder->AddUnicodeCharacter(current());
810         Advance();
811         break;
812     }  // end switch(current())
813 
814     int min;
815     int max;
816     switch (current()) {
817       // QuantifierPrefix ::
818       //   *
819       //   +
820       //   ?
821       //   {
822       case '*':
823         min = 0;
824         max = RegExpTree::kInfinity;
825         Advance();
826         break;
827       case '+':
828         min = 1;
829         max = RegExpTree::kInfinity;
830         Advance();
831         break;
832       case '?':
833         min = 0;
834         max = 1;
835         Advance();
836         break;
837       case '{':
838         if (ParseIntervalQuantifier(&min, &max)) {
839           if (max < min) {
840             return ReportError(RegExpError::kRangeOutOfOrder);
841           }
842           break;
843         } else if (unicode()) {
844           // With /u, incomplete quantifiers are not allowed.
845           return ReportError(RegExpError::kIncompleteQuantifier);
846         }
847         continue;
848       default:
849         continue;
850     }
851     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
852     if (current() == '?') {
853       quantifier_type = RegExpQuantifier::NON_GREEDY;
854       Advance();
855     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
856       // FLAG_regexp_possessive_quantifier is a debug-only flag.
857       quantifier_type = RegExpQuantifier::POSSESSIVE;
858       Advance();
859     }
860     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
861       return ReportError(RegExpError::kInvalidQuantifier);
862     }
863   }
864 }
865 
866 template <class CharT>
ParseOpenParenthesis(RegExpParserState * state)867 RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
868     RegExpParserState* state) {
869   RegExpLookaround::Type lookaround_type = state->lookaround_type();
870   bool is_named_capture = false;
871   const ZoneVector<base::uc16>* capture_name = nullptr;
872   SubexpressionType subexpr_type = CAPTURE;
873   Advance();
874   if (current() == '?') {
875     switch (Next()) {
876       case ':':
877         Advance(2);
878         subexpr_type = GROUPING;
879         break;
880       case '=':
881         Advance(2);
882         lookaround_type = RegExpLookaround::LOOKAHEAD;
883         subexpr_type = POSITIVE_LOOKAROUND;
884         break;
885       case '!':
886         Advance(2);
887         lookaround_type = RegExpLookaround::LOOKAHEAD;
888         subexpr_type = NEGATIVE_LOOKAROUND;
889         break;
890       case '<':
891         Advance();
892         if (Next() == '=') {
893           Advance(2);
894           lookaround_type = RegExpLookaround::LOOKBEHIND;
895           subexpr_type = POSITIVE_LOOKAROUND;
896           break;
897         } else if (Next() == '!') {
898           Advance(2);
899           lookaround_type = RegExpLookaround::LOOKBEHIND;
900           subexpr_type = NEGATIVE_LOOKAROUND;
901           break;
902         }
903         is_named_capture = true;
904         has_named_captures_ = true;
905         Advance();
906         break;
907       default:
908         ReportError(RegExpError::kInvalidGroup);
909         return nullptr;
910     }
911   }
912   if (subexpr_type == CAPTURE) {
913     if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) {
914       ReportError(RegExpError::kTooManyCaptures);
915       return nullptr;
916     }
917     captures_started_++;
918 
919     if (is_named_capture) {
920       capture_name = ParseCaptureGroupName(CHECK_FAILED);
921     }
922   }
923   // Store current state and begin new disjunction parsing.
924   return zone()->template New<RegExpParserState>(
925       state, subexpr_type, lookaround_type, captures_started_, capture_name,
926       state->builder()->flags(), zone());
927 }
928 
929 #ifdef DEBUG
930 namespace {
931 
IsSpecialClassEscape(base::uc32 c)932 bool IsSpecialClassEscape(base::uc32 c) {
933   switch (c) {
934     case 'd':
935     case 'D':
936     case 's':
937     case 'S':
938     case 'w':
939     case 'W':
940       return true;
941     default:
942       return false;
943   }
944 }
945 
946 }  // namespace
947 #endif
948 
949 // In order to know whether an escape is a backreference or not we have to scan
950 // the entire regexp and find the number of capturing parentheses.  However we
951 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
952 // is called when needed.  It can see the difference between capturing and
953 // noncapturing parentheses and can skip character classes and backslash-escaped
954 // characters.
955 //
956 // Important: The scanner has to be in a consistent state when calling
957 // ScanForCaptures, e.g. not in the middle of an escape sequence '\['.
958 template <class CharT>
ScanForCaptures(InClassEscapeState in_class_escape_state)959 void RegExpParserImpl<CharT>::ScanForCaptures(
960     InClassEscapeState in_class_escape_state) {
961   DCHECK(!is_scanned_for_captures_);
962   const int saved_position = position();
963   // Start with captures started previous to current position
964   int capture_count = captures_started();
965   // When we start inside a character class, skip everything inside the class.
966   if (in_class_escape_state == InClassEscapeState::kInClass) {
967     int c;
968     while ((c = current()) != kEndMarker) {
969       Advance();
970       if (c == '\\') {
971         Advance();
972       } else {
973         if (c == ']') break;
974       }
975     }
976   }
977   // Add count of captures after this position.
978   int n;
979   while ((n = current()) != kEndMarker) {
980     Advance();
981     switch (n) {
982       case '\\':
983         Advance();
984         break;
985       case '[': {
986         int c;
987         while ((c = current()) != kEndMarker) {
988           Advance();
989           if (c == '\\') {
990             Advance();
991           } else {
992             if (c == ']') break;
993           }
994         }
995         break;
996       }
997       case '(':
998         if (current() == '?') {
999           // At this point we could be in
1000           // * a non-capturing group '(:',
1001           // * a lookbehind assertion '(?<=' '(?<!'
1002           // * or a named capture '(?<'.
1003           //
1004           // Of these, only named captures are capturing groups.
1005 
1006           Advance();
1007           if (current() != '<') break;
1008 
1009           Advance();
1010           if (current() == '=' || current() == '!') break;
1011 
1012           // Found a possible named capture. It could turn out to be a syntax
1013           // error (e.g. an unterminated or invalid name), but that distinction
1014           // does not matter for our purposes.
1015           has_named_captures_ = true;
1016         }
1017         capture_count++;
1018         break;
1019     }
1020   }
1021   capture_count_ = capture_count;
1022   is_scanned_for_captures_ = true;
1023   Reset(saved_position);
1024 }
1025 
1026 template <class CharT>
ParseBackReferenceIndex(int * index_out)1027 bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
1028   DCHECK_EQ('\\', current());
1029   DCHECK('1' <= Next() && Next() <= '9');
1030   // Try to parse a decimal literal that is no greater than the total number
1031   // of left capturing parentheses in the input.
1032   int start = position();
1033   int value = Next() - '0';
1034   Advance(2);
1035   while (true) {
1036     base::uc32 c = current();
1037     if (IsDecimalDigit(c)) {
1038       value = 10 * value + (c - '0');
1039       if (value > RegExpMacroAssembler::kMaxRegisterCount) {
1040         Reset(start);
1041         return false;
1042       }
1043       Advance();
1044     } else {
1045       break;
1046     }
1047   }
1048   if (value > captures_started()) {
1049     if (!is_scanned_for_captures_)
1050       ScanForCaptures(InClassEscapeState::kNotInClass);
1051     if (value > capture_count_) {
1052       Reset(start);
1053       return false;
1054     }
1055   }
1056   *index_out = value;
1057   return true;
1058 }
1059 
1060 namespace {
1061 
push_code_unit(ZoneVector<base::uc16> * v,uint32_t code_unit)1062 void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
1063   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
1064     v->push_back(code_unit);
1065   } else {
1066     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
1067     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
1068   }
1069 }
1070 
1071 }  // namespace
1072 
1073 template <class CharT>
ParseCaptureGroupName()1074 const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
1075   // Due to special Advance requirements (see the next comment), rewind by one
1076   // such that names starting with a surrogate pair are parsed correctly for
1077   // patterns where the unicode flag is unset.
1078   //
1079   // Note that we use this odd pattern of rewinding the last advance in order
1080   // to adhere to the common parser behavior of expecting `current` to point at
1081   // the first candidate character for a function (e.g. when entering ParseFoo,
1082   // `current` should point at the first character of Foo).
1083   RewindByOneCodepoint();
1084 
1085   ZoneVector<base::uc16>* name =
1086       zone()->template New<ZoneVector<base::uc16>>(zone());
1087 
1088   {
1089     // Advance behavior inside this function is tricky since
1090     // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
1091     // and thus allows surrogate pairs and \u{}-style escapes even in
1092     // non-unicode patterns. Therefore Advance within the capture group name
1093     // has to force-enable unicode, and outside the name revert to default
1094     // behavior.
1095     ForceUnicodeScope force_unicode(this);
1096 
1097     bool at_start = true;
1098     while (true) {
1099       Advance();
1100       base::uc32 c = current();
1101 
1102       // Convert unicode escapes.
1103       if (c == '\\' && Next() == 'u') {
1104         Advance(2);
1105         if (!ParseUnicodeEscape(&c)) {
1106           ReportError(RegExpError::kInvalidUnicodeEscape);
1107           return nullptr;
1108         }
1109         RewindByOneCodepoint();
1110       }
1111 
1112       // The backslash char is misclassified as both ID_Start and ID_Continue.
1113       if (c == '\\') {
1114         ReportError(RegExpError::kInvalidCaptureGroupName);
1115         return nullptr;
1116       }
1117 
1118       if (at_start) {
1119         if (!IsIdentifierStart(c)) {
1120           ReportError(RegExpError::kInvalidCaptureGroupName);
1121           return nullptr;
1122         }
1123         push_code_unit(name, c);
1124         at_start = false;
1125       } else {
1126         if (c == '>') {
1127           break;
1128         } else if (IsIdentifierPart(c)) {
1129           push_code_unit(name, c);
1130         } else {
1131           ReportError(RegExpError::kInvalidCaptureGroupName);
1132           return nullptr;
1133         }
1134       }
1135     }
1136   }
1137 
1138   // This final advance goes back into the state of pointing at the next
1139   // relevant char, which the rest of the parser expects. See also the previous
1140   // comments in this function.
1141   Advance();
1142   return name;
1143 }
1144 
1145 template <class CharT>
CreateNamedCaptureAtIndex(const ZoneVector<base::uc16> * name,int index)1146 bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
1147     const ZoneVector<base::uc16>* name, int index) {
1148   DCHECK(0 < index && index <= captures_started_);
1149   DCHECK_NOT_NULL(name);
1150 
1151   RegExpCapture* capture = GetCapture(index);
1152   DCHECK_NULL(capture->name());
1153 
1154   capture->set_name(name);
1155 
1156   if (named_captures_ == nullptr) {
1157     named_captures_ =
1158         zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>(
1159             zone());
1160   } else {
1161     // Check for duplicates and bail if we find any.
1162 
1163     const auto& named_capture_it = named_captures_->find(capture);
1164     if (named_capture_it != named_captures_->end()) {
1165       ReportError(RegExpError::kDuplicateCaptureGroupName);
1166       return false;
1167     }
1168   }
1169 
1170   named_captures_->emplace(capture);
1171 
1172   return true;
1173 }
1174 
1175 template <class CharT>
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)1176 bool RegExpParserImpl<CharT>::ParseNamedBackReference(
1177     RegExpBuilder* builder, RegExpParserState* state) {
1178   // The parser is assumed to be on the '<' in \k<name>.
1179   if (current() != '<') {
1180     ReportError(RegExpError::kInvalidNamedReference);
1181     return false;
1182   }
1183 
1184   Advance();
1185   const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
1186   if (name == nullptr) {
1187     return false;
1188   }
1189 
1190   if (state->IsInsideCaptureGroup(name)) {
1191     builder->AddEmpty();
1192   } else {
1193     RegExpBackReference* atom =
1194         zone()->template New<RegExpBackReference>(builder->flags());
1195     atom->set_name(name);
1196 
1197     builder->AddAtom(atom);
1198 
1199     if (named_back_references_ == nullptr) {
1200       named_back_references_ =
1201           zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
1202     }
1203     named_back_references_->Add(atom, zone());
1204   }
1205 
1206   return true;
1207 }
1208 
1209 template <class CharT>
PatchNamedBackReferences()1210 void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
1211   if (named_back_references_ == nullptr) return;
1212 
1213   if (named_captures_ == nullptr) {
1214     ReportError(RegExpError::kInvalidNamedCaptureReference);
1215     return;
1216   }
1217 
1218   // Look up and patch the actual capture for each named back reference.
1219 
1220   for (int i = 0; i < named_back_references_->length(); i++) {
1221     RegExpBackReference* ref = named_back_references_->at(i);
1222 
1223     // Capture used to search the named_captures_ by name, index of the
1224     // capture is never used.
1225     static const int kInvalidIndex = 0;
1226     RegExpCapture* search_capture =
1227         zone()->template New<RegExpCapture>(kInvalidIndex);
1228     DCHECK_NULL(search_capture->name());
1229     search_capture->set_name(ref->name());
1230 
1231     int index = -1;
1232     const auto& capture_it = named_captures_->find(search_capture);
1233     if (capture_it != named_captures_->end()) {
1234       index = (*capture_it)->index();
1235     } else {
1236       ReportError(RegExpError::kInvalidNamedCaptureReference);
1237       return;
1238     }
1239 
1240     ref->set_capture(GetCapture(index));
1241   }
1242 }
1243 
1244 template <class CharT>
GetCapture(int index)1245 RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
1246   // The index for the capture groups are one-based. Its index in the list is
1247   // zero-based.
1248   const int known_captures =
1249       is_scanned_for_captures_ ? capture_count_ : captures_started_;
1250   DCHECK(index <= known_captures);
1251   if (captures_ == nullptr) {
1252     captures_ =
1253         zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
1254   }
1255   while (captures_->length() < known_captures) {
1256     captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
1257                    zone());
1258   }
1259   return captures_->at(index - 1);
1260 }
1261 
1262 template <class CharT>
GetNamedCaptures() const1263 ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const {
1264   if (named_captures_ == nullptr || named_captures_->empty()) {
1265     return nullptr;
1266   }
1267 
1268   return zone()->template New<ZoneVector<RegExpCapture*>>(
1269       named_captures_->begin(), named_captures_->end(), zone());
1270 }
1271 
1272 template <class CharT>
HasNamedCaptures(InClassEscapeState in_class_escape_state)1273 bool RegExpParserImpl<CharT>::HasNamedCaptures(
1274     InClassEscapeState in_class_escape_state) {
1275   if (has_named_captures_ || is_scanned_for_captures_) {
1276     return has_named_captures_;
1277   }
1278 
1279   ScanForCaptures(in_class_escape_state);
1280   DCHECK(is_scanned_for_captures_);
1281   return has_named_captures_;
1282 }
1283 
1284 // QuantifierPrefix ::
1285 //   { DecimalDigits }
1286 //   { DecimalDigits , }
1287 //   { DecimalDigits , DecimalDigits }
1288 //
1289 // Returns true if parsing succeeds, and set the min_out and max_out
1290 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1291 template <class CharT>
ParseIntervalQuantifier(int * min_out,int * max_out)1292 bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
1293                                                       int* max_out) {
1294   DCHECK_EQ(current(), '{');
1295   int start = position();
1296   Advance();
1297   int min = 0;
1298   if (!IsDecimalDigit(current())) {
1299     Reset(start);
1300     return false;
1301   }
1302   while (IsDecimalDigit(current())) {
1303     int next = current() - '0';
1304     if (min > (RegExpTree::kInfinity - next) / 10) {
1305       // Overflow. Skip past remaining decimal digits and return -1.
1306       do {
1307         Advance();
1308       } while (IsDecimalDigit(current()));
1309       min = RegExpTree::kInfinity;
1310       break;
1311     }
1312     min = 10 * min + next;
1313     Advance();
1314   }
1315   int max = 0;
1316   if (current() == '}') {
1317     max = min;
1318     Advance();
1319   } else if (current() == ',') {
1320     Advance();
1321     if (current() == '}') {
1322       max = RegExpTree::kInfinity;
1323       Advance();
1324     } else {
1325       while (IsDecimalDigit(current())) {
1326         int next = current() - '0';
1327         if (max > (RegExpTree::kInfinity - next) / 10) {
1328           do {
1329             Advance();
1330           } while (IsDecimalDigit(current()));
1331           max = RegExpTree::kInfinity;
1332           break;
1333         }
1334         max = 10 * max + next;
1335         Advance();
1336       }
1337       if (current() != '}') {
1338         Reset(start);
1339         return false;
1340       }
1341       Advance();
1342     }
1343   } else {
1344     Reset(start);
1345     return false;
1346   }
1347   *min_out = min;
1348   *max_out = max;
1349   return true;
1350 }
1351 
1352 template <class CharT>
ParseOctalLiteral()1353 base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
1354   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1355   // For compatibility with some other browsers (not all), we parse
1356   // up to three octal digits with a value below 256.
1357   // ES#prod-annexB-LegacyOctalEscapeSequence
1358   base::uc32 value = current() - '0';
1359   Advance();
1360   if ('0' <= current() && current() <= '7') {
1361     value = value * 8 + current() - '0';
1362     Advance();
1363     if (value < 32 && '0' <= current() && current() <= '7') {
1364       value = value * 8 + current() - '0';
1365       Advance();
1366     }
1367   }
1368   return value;
1369 }
1370 
1371 template <class CharT>
ParseHexEscape(int length,base::uc32 * value)1372 bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
1373   int start = position();
1374   base::uc32 val = 0;
1375   for (int i = 0; i < length; ++i) {
1376     base::uc32 c = current();
1377     int d = base::HexValue(c);
1378     if (d < 0) {
1379       Reset(start);
1380       return false;
1381     }
1382     val = val * 16 + d;
1383     Advance();
1384   }
1385   *value = val;
1386   return true;
1387 }
1388 
1389 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1390 template <class CharT>
ParseUnicodeEscape(base::uc32 * value)1391 bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
1392   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1393   // allowed). In the latter case, the number of hex digits between { } is
1394   // arbitrary. \ and u have already been read.
1395   if (current() == '{' && unicode()) {
1396     int start = position();
1397     Advance();
1398     if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1399       if (current() == '}') {
1400         Advance();
1401         return true;
1402       }
1403     }
1404     Reset(start);
1405     return false;
1406   }
1407   // \u but no {, or \u{...} escapes not allowed.
1408   bool result = ParseHexEscape(4, value);
1409   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1410       current() == '\\') {
1411     // Attempt to read trail surrogate.
1412     int start = position();
1413     if (Next() == 'u') {
1414       Advance(2);
1415       base::uc32 trail;
1416       if (ParseHexEscape(4, &trail) &&
1417           unibrow::Utf16::IsTrailSurrogate(trail)) {
1418         *value = unibrow::Utf16::CombineSurrogatePair(
1419             static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
1420         return true;
1421       }
1422     }
1423     Reset(start);
1424   }
1425   return result;
1426 }
1427 
1428 #ifdef V8_INTL_SUPPORT
1429 
1430 namespace {
1431 
IsExactPropertyAlias(const char * property_name,UProperty property)1432 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1433   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1434   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1435     return true;
1436   for (int i = 0;; i++) {
1437     const char* long_name = u_getPropertyName(
1438         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1439     if (long_name == nullptr) break;
1440     if (strcmp(property_name, long_name) == 0) return true;
1441   }
1442   return false;
1443 }
1444 
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1445 bool IsExactPropertyValueAlias(const char* property_value_name,
1446                                UProperty property, int32_t property_value) {
1447   const char* short_name =
1448       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1449   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1450     return true;
1451   }
1452   for (int i = 0;; i++) {
1453     const char* long_name = u_getPropertyValueName(
1454         property, property_value,
1455         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1456     if (long_name == nullptr) break;
1457     if (strcmp(property_value_name, long_name) == 0) return true;
1458   }
1459   return false;
1460 }
1461 
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1462 bool LookupPropertyValueName(UProperty property,
1463                              const char* property_value_name, bool negate,
1464                              ZoneList<CharacterRange>* result, Zone* zone) {
1465   UProperty property_for_lookup = property;
1466   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1467     // For the property Script_Extensions, we have to do the property value
1468     // name lookup as if the property is Script.
1469     property_for_lookup = UCHAR_SCRIPT;
1470   }
1471   int32_t property_value =
1472       u_getPropertyValueEnum(property_for_lookup, property_value_name);
1473   if (property_value == UCHAR_INVALID_CODE) return false;
1474 
1475   // We require the property name to match exactly to one of the property value
1476   // aliases. However, u_getPropertyValueEnum uses loose matching.
1477   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1478                                  property_value)) {
1479     return false;
1480   }
1481 
1482   UErrorCode ec = U_ZERO_ERROR;
1483   icu::UnicodeSet set;
1484   set.applyIntPropertyValue(property, property_value, ec);
1485   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1486 
1487   if (success) {
1488     set.removeAllStrings();
1489     if (negate) set.complement();
1490     for (int i = 0; i < set.getRangeCount(); i++) {
1491       result->Add(
1492           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1493           zone);
1494     }
1495   }
1496   return success;
1497 }
1498 
1499 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1500 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1501   return strncmp(name, literal, N + 1) == 0;
1502 }
1503 
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1504 bool LookupSpecialPropertyValueName(const char* name,
1505                                     ZoneList<CharacterRange>* result,
1506                                     bool negate, Zone* zone) {
1507   if (NameEquals(name, "Any")) {
1508     if (negate) {
1509       // Leave the list of character ranges empty, since the negation of 'Any'
1510       // is the empty set.
1511     } else {
1512       result->Add(CharacterRange::Everything(), zone);
1513     }
1514   } else if (NameEquals(name, "ASCII")) {
1515     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1516                        : CharacterRange::Range(0x0, 0x7F),
1517                 zone);
1518   } else if (NameEquals(name, "Assigned")) {
1519     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1520                                    !negate, result, zone);
1521   } else {
1522     return false;
1523   }
1524   return true;
1525 }
1526 
1527 // Explicitly allowlist supported binary properties. The spec forbids supporting
1528 // properties outside of this set to ensure interoperability.
IsSupportedBinaryProperty(UProperty property)1529 bool IsSupportedBinaryProperty(UProperty property) {
1530   switch (property) {
1531     case UCHAR_ALPHABETIC:
1532     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1533     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1534     case UCHAR_ASCII_HEX_DIGIT:
1535     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1536     case UCHAR_BIDI_CONTROL:
1537     case UCHAR_BIDI_MIRRORED:
1538     case UCHAR_CASE_IGNORABLE:
1539     case UCHAR_CASED:
1540     case UCHAR_CHANGES_WHEN_CASEFOLDED:
1541     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1542     case UCHAR_CHANGES_WHEN_LOWERCASED:
1543     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1544     case UCHAR_CHANGES_WHEN_TITLECASED:
1545     case UCHAR_CHANGES_WHEN_UPPERCASED:
1546     case UCHAR_DASH:
1547     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1548     case UCHAR_DEPRECATED:
1549     case UCHAR_DIACRITIC:
1550     case UCHAR_EMOJI:
1551     case UCHAR_EMOJI_COMPONENT:
1552     case UCHAR_EMOJI_MODIFIER_BASE:
1553     case UCHAR_EMOJI_MODIFIER:
1554     case UCHAR_EMOJI_PRESENTATION:
1555     case UCHAR_EXTENDED_PICTOGRAPHIC:
1556     case UCHAR_EXTENDER:
1557     case UCHAR_GRAPHEME_BASE:
1558     case UCHAR_GRAPHEME_EXTEND:
1559     case UCHAR_HEX_DIGIT:
1560     case UCHAR_ID_CONTINUE:
1561     case UCHAR_ID_START:
1562     case UCHAR_IDEOGRAPHIC:
1563     case UCHAR_IDS_BINARY_OPERATOR:
1564     case UCHAR_IDS_TRINARY_OPERATOR:
1565     case UCHAR_JOIN_CONTROL:
1566     case UCHAR_LOGICAL_ORDER_EXCEPTION:
1567     case UCHAR_LOWERCASE:
1568     case UCHAR_MATH:
1569     case UCHAR_NONCHARACTER_CODE_POINT:
1570     case UCHAR_PATTERN_SYNTAX:
1571     case UCHAR_PATTERN_WHITE_SPACE:
1572     case UCHAR_QUOTATION_MARK:
1573     case UCHAR_RADICAL:
1574     case UCHAR_REGIONAL_INDICATOR:
1575     case UCHAR_S_TERM:
1576     case UCHAR_SOFT_DOTTED:
1577     case UCHAR_TERMINAL_PUNCTUATION:
1578     case UCHAR_UNIFIED_IDEOGRAPH:
1579     case UCHAR_UPPERCASE:
1580     case UCHAR_VARIATION_SELECTOR:
1581     case UCHAR_WHITE_SPACE:
1582     case UCHAR_XID_CONTINUE:
1583     case UCHAR_XID_START:
1584       return true;
1585     default:
1586       break;
1587   }
1588   return false;
1589 }
1590 
IsUnicodePropertyValueCharacter(char c)1591 bool IsUnicodePropertyValueCharacter(char c) {
1592   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1593   //
1594   // Note that using this to validate each parsed char is quite conservative.
1595   // A possible alternative solution would be to only ensure the parsed
1596   // property name/value candidate string does not contain '\0' characters and
1597   // let ICU lookups trigger the final failure.
1598   if ('a' <= c && c <= 'z') return true;
1599   if ('A' <= c && c <= 'Z') return true;
1600   if ('0' <= c && c <= '9') return true;
1601   return (c == '_');
1602 }
1603 
1604 }  // namespace
1605 
1606 template <class CharT>
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1607 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
1608                                                      ZoneVector<char>* name_2) {
1609   DCHECK(name_1->empty());
1610   DCHECK(name_2->empty());
1611   // Parse the property class as follows:
1612   // - In \p{name}, 'name' is interpreted
1613   //   - either as a general category property value name.
1614   //   - or as a binary property name.
1615   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1616   //   and 'value' is interpreted as one of the available property value names.
1617   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1618   // - Loose matching is not applied.
1619   if (current() == '{') {
1620     // Parse \p{[PropertyName=]PropertyNameValue}
1621     for (Advance(); current() != '}' && current() != '='; Advance()) {
1622       if (!IsUnicodePropertyValueCharacter(current())) return false;
1623       if (!has_next()) return false;
1624       name_1->push_back(static_cast<char>(current()));
1625     }
1626     if (current() == '=') {
1627       for (Advance(); current() != '}'; Advance()) {
1628         if (!IsUnicodePropertyValueCharacter(current())) return false;
1629         if (!has_next()) return false;
1630         name_2->push_back(static_cast<char>(current()));
1631       }
1632       name_2->push_back(0);  // null-terminate string.
1633     }
1634   } else {
1635     return false;
1636   }
1637   Advance();
1638   name_1->push_back(0);  // null-terminate string.
1639 
1640   DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1641   DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1642   return true;
1643 }
1644 
1645 template <class CharT>
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1646 bool RegExpParserImpl<CharT>::AddPropertyClassRange(
1647     ZoneList<CharacterRange>* add_to, bool negate,
1648     const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1649   if (name_2.empty()) {
1650     // First attempt to interpret as general category property value name.
1651     const char* name = name_1.data();
1652     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1653                                 add_to, zone())) {
1654       return true;
1655     }
1656     // Interpret "Any", "ASCII", and "Assigned".
1657     if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1658       return true;
1659     }
1660     // Then attempt to interpret as binary property name with value name 'Y'.
1661     UProperty property = u_getPropertyEnum(name);
1662     if (!IsSupportedBinaryProperty(property)) return false;
1663     if (!IsExactPropertyAlias(name, property)) return false;
1664     return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1665                                    zone());
1666   } else {
1667     // Both property name and value name are specified. Attempt to interpret
1668     // the property name as enumerated property.
1669     const char* property_name = name_1.data();
1670     const char* value_name = name_2.data();
1671     UProperty property = u_getPropertyEnum(property_name);
1672     if (!IsExactPropertyAlias(property_name, property)) return false;
1673     if (property == UCHAR_GENERAL_CATEGORY) {
1674       // We want to allow aggregate value names such as "Letter".
1675       property = UCHAR_GENERAL_CATEGORY_MASK;
1676     } else if (property != UCHAR_SCRIPT &&
1677                property != UCHAR_SCRIPT_EXTENSIONS) {
1678       return false;
1679     }
1680     return LookupPropertyValueName(property, value_name, negate, add_to,
1681                                    zone());
1682   }
1683 }
1684 
1685 #else  // V8_INTL_SUPPORT
1686 
1687 template <class CharT>
ParsePropertyClassName(ZoneVector<char> * name_1,ZoneVector<char> * name_2)1688 bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
1689                                                      ZoneVector<char>* name_2) {
1690   return false;
1691 }
1692 
1693 template <class CharT>
AddPropertyClassRange(ZoneList<CharacterRange> * add_to,bool negate,const ZoneVector<char> & name_1,const ZoneVector<char> & name_2)1694 bool RegExpParserImpl<CharT>::AddPropertyClassRange(
1695     ZoneList<CharacterRange>* add_to, bool negate,
1696     const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
1697   return false;
1698 }
1699 
1700 #endif  // V8_INTL_SUPPORT
1701 
1702 template <class CharT>
ParseUnlimitedLengthHexNumber(int max_value,base::uc32 * value)1703 bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
1704                                                             base::uc32* value) {
1705   base::uc32 x = 0;
1706   int d = base::HexValue(current());
1707   if (d < 0) {
1708     return false;
1709   }
1710   while (d >= 0) {
1711     x = x * 16 + d;
1712     if (x > static_cast<base::uc32>(max_value)) {
1713       return false;
1714     }
1715     Advance();
1716     d = base::HexValue(current());
1717   }
1718   *value = x;
1719   return true;
1720 }
1721 
1722 // https://tc39.es/ecma262/#prod-CharacterEscape
1723 template <class CharT>
ParseCharacterEscape(InClassEscapeState in_class_escape_state,bool * is_escaped_unicode_character)1724 base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
1725     InClassEscapeState in_class_escape_state,
1726     bool* is_escaped_unicode_character) {
1727   DCHECK_EQ('\\', current());
1728   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1729 
1730   Advance();
1731 
1732   const base::uc32 c = current();
1733   switch (c) {
1734     // CharacterEscape ::
1735     //   ControlEscape :: one of
1736     //     f n r t v
1737     case 'f':
1738       Advance();
1739       return '\f';
1740     case 'n':
1741       Advance();
1742       return '\n';
1743     case 'r':
1744       Advance();
1745       return '\r';
1746     case 't':
1747       Advance();
1748       return '\t';
1749     case 'v':
1750       Advance();
1751       return '\v';
1752     // CharacterEscape ::
1753     //   c ControlLetter
1754     case 'c': {
1755       base::uc32 controlLetter = Next();
1756       base::uc32 letter = controlLetter & ~('A' ^ 'a');
1757       if (letter >= 'A' && letter <= 'Z') {
1758         Advance(2);
1759         // Control letters mapped to ASCII control characters in the range
1760         // 0x00-0x1F.
1761         return controlLetter & 0x1F;
1762       }
1763       if (unicode()) {
1764         // With /u, invalid escapes are not treated as identity escapes.
1765         ReportError(RegExpError::kInvalidUnicodeEscape);
1766         return 0;
1767       }
1768       if (in_class_escape_state == InClassEscapeState::kInClass) {
1769         // Inside a character class, we also accept digits and underscore as
1770         // control characters, unless with /u. See Annex B:
1771         // ES#prod-annexB-ClassControlLetter
1772         if ((controlLetter >= '0' && controlLetter <= '9') ||
1773             controlLetter == '_') {
1774           Advance(2);
1775           return controlLetter & 0x1F;
1776         }
1777       }
1778       // We match JSC in reading the backslash as a literal
1779       // character instead of as starting an escape.
1780       return '\\';
1781     }
1782     // CharacterEscape ::
1783     //   0 [lookahead ∉ DecimalDigit]
1784     //   [~UnicodeMode] LegacyOctalEscapeSequence
1785     case '0':
1786       // \0 is interpreted as NUL if not followed by another digit.
1787       if (Next() < '0' || Next() > '9') {
1788         Advance();
1789         return 0;
1790       }
1791       V8_FALLTHROUGH;
1792     case '1':
1793     case '2':
1794     case '3':
1795     case '4':
1796     case '5':
1797     case '6':
1798     case '7':
1799       // For compatibility, we interpret a decimal escape that isn't
1800       // a back reference (and therefore either \0 or not valid according
1801       // to the specification) as a 1..3 digit octal character code.
1802       // ES#prod-annexB-LegacyOctalEscapeSequence
1803       if (unicode()) {
1804         // With /u, decimal escape is not interpreted as octal character code.
1805         ReportError(RegExpError::kInvalidClassEscape);
1806         return 0;
1807       }
1808       return ParseOctalLiteral();
1809     // CharacterEscape ::
1810     //   HexEscapeSequence
1811     case 'x': {
1812       Advance();
1813       base::uc32 value;
1814       if (ParseHexEscape(2, &value)) return value;
1815       if (unicode()) {
1816         // With /u, invalid escapes are not treated as identity escapes.
1817         ReportError(RegExpError::kInvalidEscape);
1818         return 0;
1819       }
1820       // If \x is not followed by a two-digit hexadecimal, treat it
1821       // as an identity escape.
1822       return 'x';
1823     }
1824     // CharacterEscape ::
1825     //   RegExpUnicodeEscapeSequence [?UnicodeMode]
1826     case 'u': {
1827       Advance();
1828       base::uc32 value;
1829       if (ParseUnicodeEscape(&value)) {
1830         *is_escaped_unicode_character = true;
1831         return value;
1832       }
1833       if (unicode()) {
1834         // With /u, invalid escapes are not treated as identity escapes.
1835         ReportError(RegExpError::kInvalidUnicodeEscape);
1836         return 0;
1837       }
1838       // If \u is not followed by a two-digit hexadecimal, treat it
1839       // as an identity escape.
1840       return 'u';
1841     }
1842     default:
1843       break;
1844   }
1845 
1846   // CharacterEscape ::
1847   //   IdentityEscape[?UnicodeMode, ?N]
1848   //
1849   // * With /u, no identity escapes except for syntax characters are
1850   //   allowed.
1851   // * Without /u:
1852   //   * '\c' is not an IdentityEscape.
1853   //   * '\k' is not an IdentityEscape when named captures exist.
1854   //   * Otherwise, all identity escapes are allowed.
1855   if (unicode()) {
1856     if (!IsSyntaxCharacterOrSlash(c)) {
1857       ReportError(RegExpError::kInvalidEscape);
1858       return 0;
1859     }
1860     Advance();
1861     return c;
1862   }
1863   DCHECK(!unicode());
1864   if (c == 'c') {
1865     ReportError(RegExpError::kInvalidEscape);
1866     return 0;
1867   }
1868   Advance();
1869   // Note: It's important to Advance before the HasNamedCaptures call s.t. we
1870   // don't start scanning in the middle of an escape.
1871   if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
1872     ReportError(RegExpError::kInvalidEscape);
1873     return 0;
1874   }
1875   return c;
1876 }
1877 
1878 // https://tc39.es/ecma262/#prod-ClassEscape
1879 template <class CharT>
ParseClassEscape(ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents,base::uc32 * char_out,bool * is_class_escape)1880 void RegExpParserImpl<CharT>::ParseClassEscape(
1881     ZoneList<CharacterRange>* ranges, Zone* zone,
1882     bool add_unicode_case_equivalents, base::uc32* char_out,
1883     bool* is_class_escape) {
1884   *is_class_escape = false;
1885 
1886   if (current() != '\\') {
1887     // Not a ClassEscape.
1888     *char_out = current();
1889     Advance();
1890     return;
1891   }
1892 
1893   const base::uc32 next = Next();
1894   switch (next) {
1895     case 'b':
1896       *char_out = '\b';
1897       Advance(2);
1898       return;
1899     case '-':
1900       if (unicode()) {
1901         *char_out = next;
1902         Advance(2);
1903         return;
1904       }
1905       break;
1906     case kEndMarker:
1907       ReportError(RegExpError::kEscapeAtEndOfPattern);
1908       return;
1909     default:
1910       break;
1911   }
1912 
1913   static constexpr InClassEscapeState kInClassEscape =
1914       InClassEscapeState::kInClass;
1915   *is_class_escape = TryParseCharacterClassEscape(
1916       next, kInClassEscape, ranges, zone, add_unicode_case_equivalents);
1917   if (*is_class_escape) return;
1918 
1919   bool dummy = false;  // Unused.
1920   *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
1921 }
1922 
1923 // https://tc39.es/ecma262/#prod-CharacterClassEscape
1924 template <class CharT>
TryParseCharacterClassEscape(base::uc32 next,InClassEscapeState in_class_escape_state,ZoneList<CharacterRange> * ranges,Zone * zone,bool add_unicode_case_equivalents)1925 bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
1926     base::uc32 next, InClassEscapeState in_class_escape_state,
1927     ZoneList<CharacterRange>* ranges, Zone* zone,
1928     bool add_unicode_case_equivalents) {
1929   DCHECK_EQ(current(), '\\');
1930   DCHECK_EQ(Next(), next);
1931 
1932   switch (next) {
1933     case 'd':
1934     case 'D':
1935     case 's':
1936     case 'S':
1937     case 'w':
1938     case 'W':
1939       CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
1940                                      ranges, add_unicode_case_equivalents,
1941                                      zone);
1942       Advance(2);
1943       return true;
1944     case 'p':
1945     case 'P': {
1946       if (!unicode()) return false;
1947       bool negate = next == 'P';
1948       Advance(2);
1949       ZoneVector<char> name_1(zone);
1950       ZoneVector<char> name_2(zone);
1951       if (!ParsePropertyClassName(&name_1, &name_2) ||
1952           !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1953         ReportError(in_class_escape_state == InClassEscapeState::kInClass
1954                         ? RegExpError::kInvalidClassPropertyName
1955                         : RegExpError::kInvalidPropertyName);
1956       }
1957       return true;
1958     }
1959     default:
1960       return false;
1961   }
1962 }
1963 
1964 template <class CharT>
ParseCharacterClass(const RegExpBuilder * builder)1965 RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
1966     const RegExpBuilder* builder) {
1967   DCHECK_EQ(current(), '[');
1968   Advance();
1969   bool is_negated = false;
1970   if (current() == '^') {
1971     is_negated = true;
1972     Advance();
1973   }
1974   ZoneList<CharacterRange>* ranges =
1975       zone()->template New<ZoneList<CharacterRange>>(2, zone());
1976   bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1977   while (has_more() && current() != ']') {
1978     base::uc32 char_1, char_2;
1979     bool is_class_1, is_class_2;
1980     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1981                      &is_class_1 CHECK_FAILED);
1982     if (current() == '-') {
1983       Advance();
1984       if (current() == kEndMarker) {
1985         // If we reach the end we break out of the loop and let the
1986         // following code report an error.
1987         break;
1988       } else if (current() == ']') {
1989         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1990         ranges->Add(CharacterRange::Singleton('-'), zone());
1991         break;
1992       }
1993       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1994                        &is_class_2 CHECK_FAILED);
1995       if (is_class_1 || is_class_2) {
1996         // Either end is an escaped character class. Treat the '-' verbatim.
1997         if (unicode()) {
1998           // ES2015 21.2.2.15.1 step 1.
1999           return ReportError(RegExpError::kInvalidCharacterClass);
2000         }
2001         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2002         ranges->Add(CharacterRange::Singleton('-'), zone());
2003         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
2004         continue;
2005       }
2006       // ES2015 21.2.2.15.1 step 6.
2007       if (char_1 > char_2) {
2008         return ReportError(RegExpError::kOutOfOrderCharacterClass);
2009       }
2010       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
2011     } else {
2012       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2013     }
2014   }
2015   if (!has_more()) {
2016     return ReportError(RegExpError::kUnterminatedCharacterClass);
2017   }
2018   Advance();
2019   RegExpCharacterClass::CharacterClassFlags character_class_flags;
2020   if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
2021   return zone()->template New<RegExpCharacterClass>(zone(), ranges,
2022                                                     character_class_flags);
2023 }
2024 
2025 #undef CHECK_FAILED
2026 
2027 template <class CharT>
Parse(RegExpCompileData * result)2028 bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
2029   DCHECK_NOT_NULL(result);
2030   RegExpTree* tree = ParsePattern();
2031 
2032   if (failed()) {
2033     DCHECK_NULL(tree);
2034     DCHECK_NE(error_, RegExpError::kNone);
2035     result->error = error_;
2036     result->error_pos = error_pos_;
2037     return false;
2038   }
2039 
2040   DCHECK_NOT_NULL(tree);
2041   DCHECK_EQ(error_, RegExpError::kNone);
2042   if (FLAG_trace_regexp_parser) {
2043     StdoutStream os;
2044     tree->Print(os, zone());
2045     os << "\n";
2046   }
2047 
2048   result->tree = tree;
2049   const int capture_count = captures_started();
2050   result->simple = tree->IsAtom() && simple() && capture_count == 0;
2051   result->contains_anchor = contains_anchor();
2052   result->capture_count = capture_count;
2053   result->named_captures = GetNamedCaptures();
2054   return true;
2055 }
2056 
AddLeadSurrogate(base::uc16 lead_surrogate)2057 void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
2058   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2059   FlushPendingSurrogate();
2060   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
2061   pending_surrogate_ = lead_surrogate;
2062 }
2063 
AddTrailSurrogate(base::uc16 trail_surrogate)2064 void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
2065   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
2066   if (pending_surrogate_ != kNoPendingSurrogate) {
2067     base::uc16 lead_surrogate = pending_surrogate_;
2068     pending_surrogate_ = kNoPendingSurrogate;
2069     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
2070     base::uc32 combined =
2071         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
2072     if (NeedsDesugaringForIgnoreCase(combined)) {
2073       AddCharacterClassForDesugaring(combined);
2074     } else {
2075       ZoneList<base::uc16> surrogate_pair(2, zone());
2076       surrogate_pair.Add(lead_surrogate, zone());
2077       surrogate_pair.Add(trail_surrogate, zone());
2078       RegExpAtom* atom =
2079           zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
2080       AddAtom(atom);
2081     }
2082   } else {
2083     pending_surrogate_ = trail_surrogate;
2084     FlushPendingSurrogate();
2085   }
2086 }
2087 
FlushPendingSurrogate()2088 void RegExpBuilder::FlushPendingSurrogate() {
2089   if (pending_surrogate_ != kNoPendingSurrogate) {
2090     DCHECK(unicode());
2091     base::uc32 c = pending_surrogate_;
2092     pending_surrogate_ = kNoPendingSurrogate;
2093     AddCharacterClassForDesugaring(c);
2094   }
2095 }
2096 
FlushCharacters()2097 void RegExpBuilder::FlushCharacters() {
2098   FlushPendingSurrogate();
2099   pending_empty_ = false;
2100   if (characters_ != nullptr) {
2101     RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
2102     characters_ = nullptr;
2103     text_.emplace_back(atom);
2104     LAST(ADD_ATOM);
2105   }
2106 }
2107 
FlushText()2108 void RegExpBuilder::FlushText() {
2109   FlushCharacters();
2110   size_t num_text = text_.size();
2111   if (num_text == 0) {
2112     return;
2113   } else if (num_text == 1) {
2114     terms_.emplace_back(text_.back());
2115   } else {
2116     RegExpText* text = zone()->New<RegExpText>(zone());
2117     for (size_t i = 0; i < num_text; i++) {
2118       text_[i]->AppendToText(text, zone());
2119     }
2120     terms_.emplace_back(text);
2121   }
2122   text_.clear();
2123 }
2124 
AddCharacter(base::uc16 c)2125 void RegExpBuilder::AddCharacter(base::uc16 c) {
2126   FlushPendingSurrogate();
2127   pending_empty_ = false;
2128   if (NeedsDesugaringForIgnoreCase(c)) {
2129     AddCharacterClassForDesugaring(c);
2130   } else {
2131     if (characters_ == nullptr) {
2132       characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
2133     }
2134     characters_->Add(c, zone());
2135     LAST(ADD_CHAR);
2136   }
2137 }
2138 
AddUnicodeCharacter(base::uc32 c)2139 void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
2140   if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
2141     DCHECK(unicode());
2142     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
2143     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
2144   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
2145     AddLeadSurrogate(c);
2146   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
2147     AddTrailSurrogate(c);
2148   } else {
2149     AddCharacter(static_cast<base::uc16>(c));
2150   }
2151 }
2152 
AddEscapedUnicodeCharacter(base::uc32 character)2153 void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
2154   // A lead or trail surrogate parsed via escape sequence will not
2155   // pair up with any preceding lead or following trail surrogate.
2156   FlushPendingSurrogate();
2157   AddUnicodeCharacter(character);
2158   FlushPendingSurrogate();
2159 }
2160 
AddEmpty()2161 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
2162 
AddCharacterClass(RegExpCharacterClass * cc)2163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
2164   if (NeedsDesugaringForUnicode(cc)) {
2165     // With /u, character class needs to be desugared, so it
2166     // must be a standalone term instead of being part of a RegExpText.
2167     AddTerm(cc);
2168   } else {
2169     AddAtom(cc);
2170   }
2171 }
2172 
AddCharacterClassForDesugaring(base::uc32 c)2173 void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) {
2174   AddTerm(zone()->New<RegExpCharacterClass>(
2175       zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
2176 }
2177 
AddAtom(RegExpTree * term)2178 void RegExpBuilder::AddAtom(RegExpTree* term) {
2179   if (term->IsEmpty()) {
2180     AddEmpty();
2181     return;
2182   }
2183   if (term->IsTextElement()) {
2184     FlushCharacters();
2185     text_.emplace_back(term);
2186   } else {
2187     FlushText();
2188     terms_.emplace_back(term);
2189   }
2190   LAST(ADD_ATOM);
2191 }
2192 
AddTerm(RegExpTree * term)2193 void RegExpBuilder::AddTerm(RegExpTree* term) {
2194   FlushText();
2195   terms_.emplace_back(term);
2196   LAST(ADD_ATOM);
2197 }
2198 
AddAssertion(RegExpTree * assert)2199 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
2200   FlushText();
2201   terms_.emplace_back(assert);
2202   LAST(ADD_ASSERT);
2203 }
2204 
NewAlternative()2205 void RegExpBuilder::NewAlternative() { FlushTerms(); }
2206 
FlushTerms()2207 void RegExpBuilder::FlushTerms() {
2208   FlushText();
2209   size_t num_terms = terms_.size();
2210   RegExpTree* alternative;
2211   if (num_terms == 0) {
2212     alternative = zone()->New<RegExpEmpty>();
2213   } else if (num_terms == 1) {
2214     alternative = terms_.back();
2215   } else {
2216     alternative =
2217         zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
2218             base::VectorOf(terms_.begin(), terms_.size()), zone()));
2219   }
2220   alternatives_.emplace_back(alternative);
2221   terms_.clear();
2222   LAST(ADD_NONE);
2223 }
2224 
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)2225 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
2226   if (!unicode()) return false;
2227   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2228   // necessarily mean that we need to desugar. It's probably nicer to have a
2229   // separate pass to figure out unicode desugarings.
2230   if (ignore_case()) return true;
2231   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2232   CharacterRange::Canonicalize(ranges);
2233   for (int i = ranges->length() - 1; i >= 0; i--) {
2234     base::uc32 from = ranges->at(i).from();
2235     base::uc32 to = ranges->at(i).to();
2236     // Check for non-BMP characters.
2237     if (to >= kNonBmpStart) return true;
2238     // Check for lone surrogates.
2239     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2240   }
2241   return false;
2242 }
2243 
NeedsDesugaringForIgnoreCase(base::uc32 c)2244 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
2245 #ifdef V8_INTL_SUPPORT
2246   if (unicode() && ignore_case()) {
2247     icu::UnicodeSet set(c, c);
2248     set.closeOver(USET_CASE_INSENSITIVE);
2249     set.removeAllStrings();
2250     return set.size() > 1;
2251   }
2252   // In the case where ICU is not included, we act as if the unicode flag is
2253   // not set, and do not desugar.
2254 #endif  // V8_INTL_SUPPORT
2255   return false;
2256 }
2257 
ToRegExp()2258 RegExpTree* RegExpBuilder::ToRegExp() {
2259   FlushTerms();
2260   size_t num_alternatives = alternatives_.size();
2261   if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
2262   if (num_alternatives == 1) return alternatives_.back();
2263   return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
2264       base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
2265 }
2266 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)2267 bool RegExpBuilder::AddQuantifierToAtom(
2268     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2269   FlushPendingSurrogate();
2270   if (pending_empty_) {
2271     pending_empty_ = false;
2272     return true;
2273   }
2274   RegExpTree* atom;
2275   if (characters_ != nullptr) {
2276     DCHECK(last_added_ == ADD_CHAR);
2277     // Last atom was character.
2278     base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
2279     int num_chars = char_vector.length();
2280     if (num_chars > 1) {
2281       base::Vector<const base::uc16> prefix =
2282           char_vector.SubVector(0, num_chars - 1);
2283       text_.emplace_back(zone()->New<RegExpAtom>(prefix));
2284       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2285     }
2286     characters_ = nullptr;
2287     atom = zone()->New<RegExpAtom>(char_vector);
2288     FlushText();
2289   } else if (text_.size() > 0) {
2290     DCHECK(last_added_ == ADD_ATOM);
2291     atom = text_.back();
2292     text_.pop_back();
2293     FlushText();
2294   } else if (terms_.size() > 0) {
2295     DCHECK(last_added_ == ADD_ATOM);
2296     atom = terms_.back();
2297     terms_.pop_back();
2298     if (atom->IsLookaround()) {
2299       // With /u, lookarounds are not quantifiable.
2300       if (unicode()) return false;
2301       // Lookbehinds are not quantifiable.
2302       if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2303         return false;
2304       }
2305     }
2306     if (atom->max_match() == 0) {
2307       // Guaranteed to only match an empty string.
2308       LAST(ADD_TERM);
2309       if (min == 0) {
2310         return true;
2311       }
2312       terms_.emplace_back(atom);
2313       return true;
2314     }
2315   } else {
2316     // Only call immediately after adding an atom or character!
2317     UNREACHABLE();
2318   }
2319   terms_.emplace_back(
2320       zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));
2321   LAST(ADD_TERM);
2322   return true;
2323 }
2324 
2325 template class RegExpParserImpl<uint8_t>;
2326 template class RegExpParserImpl<base::uc16>;
2327 
2328 }  // namespace
2329 
2330 // static
ParseRegExpFromHeapString(Isolate * isolate,Zone * zone,Handle<String> input,RegExpFlags flags,RegExpCompileData * result)2331 bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
2332                                              Handle<String> input,
2333                                              RegExpFlags flags,
2334                                              RegExpCompileData* result) {
2335   DisallowGarbageCollection no_gc;
2336   uintptr_t stack_limit = isolate->stack_guard()->real_climit();
2337   String::FlatContent content = input->GetFlatContent(no_gc);
2338   if (content.IsOneByte()) {
2339     base::Vector<const uint8_t> v = content.ToOneByteVector();
2340     return RegExpParserImpl<uint8_t>{v.begin(),   v.length(), flags,
2341                                      stack_limit, zone,       no_gc}
2342         .Parse(result);
2343   } else {
2344     base::Vector<const base::uc16> v = content.ToUC16Vector();
2345     return RegExpParserImpl<base::uc16>{v.begin(),   v.length(), flags,
2346                                         stack_limit, zone,       no_gc}
2347         .Parse(result);
2348   }
2349 }
2350 
2351 // static
2352 template <class CharT>
VerifyRegExpSyntax(Zone * zone,uintptr_t stack_limit,const CharT * input,int input_length,RegExpFlags flags,RegExpCompileData * result,const DisallowGarbageCollection & no_gc)2353 bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
2354                                       const CharT* input, int input_length,
2355                                       RegExpFlags flags,
2356                                       RegExpCompileData* result,
2357                                       const DisallowGarbageCollection& no_gc) {
2358   return RegExpParserImpl<CharT>{input,       input_length, flags,
2359                                  stack_limit, zone,         no_gc}
2360       .Parse(result);
2361 }
2362 
2363 template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
2364     Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
2365     const DisallowGarbageCollection&);
2366 template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
2367     Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
2368     const DisallowGarbageCollection&);
2369 
2370 // static
VerifyRegExpSyntax(Isolate * isolate,Zone * zone,Handle<String> input,RegExpFlags flags,RegExpCompileData * result,const DisallowGarbageCollection &)2371 bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
2372                                       Handle<String> input, RegExpFlags flags,
2373                                       RegExpCompileData* result,
2374                                       const DisallowGarbageCollection&) {
2375   return ParseRegExpFromHeapString(isolate, zone, input, flags, result);
2376 }
2377 
2378 #undef LAST
2379 
2380 }  // namespace internal
2381 }  // namespace v8
2382