• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 #define V8_REGEXP_REGEXP_PARSER_H_
7 
8 #include "src/objects/js-regexp.h"
9 #include "src/objects/objects.h"
10 #include "src/regexp/regexp-ast.h"
11 #include "src/regexp/regexp-error.h"
12 #include "src/zone/zone.h"
13 
14 namespace v8 {
15 namespace internal {
16 
17 struct RegExpCompileData;
18 
19 // A BufferedZoneList is an automatically growing list, just like (and backed
20 // by) a ZoneList, that is optimized for the case of adding and removing
21 // a single element. The last element added is stored outside the backing list,
22 // and if no more than one element is ever added, the ZoneList isn't even
23 // allocated.
24 // Elements must not be nullptr pointers.
25 template <typename T, int initial_size>
26 class BufferedZoneList {
27  public:
BufferedZoneList()28   BufferedZoneList() : list_(nullptr), last_(nullptr) {}
29 
30   // Adds element at end of list. This element is buffered and can
31   // be read using last() or removed using RemoveLast until a new Add or until
32   // RemoveLast or GetList has been called.
Add(T * value,Zone * zone)33   void Add(T* value, Zone* zone) {
34     if (last_ != nullptr) {
35       if (list_ == nullptr) {
36         list_ = zone->New<ZoneList<T*>>(initial_size, zone);
37       }
38       list_->Add(last_, zone);
39     }
40     last_ = value;
41   }
42 
last()43   T* last() {
44     DCHECK(last_ != nullptr);
45     return last_;
46   }
47 
RemoveLast()48   T* RemoveLast() {
49     DCHECK(last_ != nullptr);
50     T* result = last_;
51     if ((list_ != nullptr) && (list_->length() > 0))
52       last_ = list_->RemoveLast();
53     else
54       last_ = nullptr;
55     return result;
56   }
57 
Get(int i)58   T* Get(int i) {
59     DCHECK((0 <= i) && (i < length()));
60     if (list_ == nullptr) {
61       DCHECK_EQ(0, i);
62       return last_;
63     } else {
64       if (i == list_->length()) {
65         DCHECK(last_ != nullptr);
66         return last_;
67       } else {
68         return list_->at(i);
69       }
70     }
71   }
72 
Clear()73   void Clear() {
74     list_ = nullptr;
75     last_ = nullptr;
76   }
77 
length()78   int length() {
79     int length = (list_ == nullptr) ? 0 : list_->length();
80     return length + ((last_ == nullptr) ? 0 : 1);
81   }
82 
GetList(Zone * zone)83   ZoneList<T*>* GetList(Zone* zone) {
84     if (list_ == nullptr) {
85       list_ = zone->New<ZoneList<T*>>(initial_size, zone);
86     }
87     if (last_ != nullptr) {
88       list_->Add(last_, zone);
89       last_ = nullptr;
90     }
91     return list_;
92   }
93 
94  private:
95   ZoneList<T*>* list_;
96   T* last_;
97 };
98 
99 
100 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
101 class RegExpBuilder : public ZoneObject {
102  public:
103   RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
104   void AddCharacter(uc16 character);
105   void AddUnicodeCharacter(uc32 character);
106   void AddEscapedUnicodeCharacter(uc32 character);
107   // "Adds" an empty expression. Does nothing except consume a
108   // following quantifier
109   void AddEmpty();
110   void AddCharacterClass(RegExpCharacterClass* cc);
111   void AddCharacterClassForDesugaring(uc32 c);
112   void AddAtom(RegExpTree* tree);
113   void AddTerm(RegExpTree* tree);
114   void AddAssertion(RegExpTree* tree);
115   void NewAlternative();  // '|'
116   bool AddQuantifierToAtom(int min, int max,
117                            RegExpQuantifier::QuantifierType type);
118   void FlushText();
119   RegExpTree* ToRegExp();
flags()120   JSRegExp::Flags flags() const { return flags_; }
set_flags(JSRegExp::Flags flags)121   void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
122 
ignore_case()123   bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
multiline()124   bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
dotall()125   bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
126 
127  private:
128   static const uc16 kNoPendingSurrogate = 0;
129   void AddLeadSurrogate(uc16 lead_surrogate);
130   void AddTrailSurrogate(uc16 trail_surrogate);
131   void FlushPendingSurrogate();
132   void FlushCharacters();
133   void FlushTerms();
134   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
135   bool NeedsDesugaringForIgnoreCase(uc32 c);
zone()136   Zone* zone() const { return zone_; }
unicode()137   bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
138 
139   Zone* zone_;
140   bool pending_empty_;
141   JSRegExp::Flags flags_;
142   ZoneList<uc16>* characters_;
143   uc16 pending_surrogate_;
144   BufferedZoneList<RegExpTree, 2> terms_;
145   BufferedZoneList<RegExpTree, 2> text_;
146   BufferedZoneList<RegExpTree, 2> alternatives_;
147 #ifdef DEBUG
148   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
149 #define LAST(x) last_added_ = x;
150 #else
151 #define LAST(x)
152 #endif
153 };
154 
155 class V8_EXPORT_PRIVATE RegExpParser {
156  public:
157   RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
158                Zone* zone);
159 
160   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
161                           JSRegExp::Flags flags, RegExpCompileData* result);
162 
163  private:
164   bool Parse(RegExpCompileData* result, const DisallowHeapAllocation&);
165 
166   RegExpTree* ParsePattern();
167   RegExpTree* ParseDisjunction();
168   RegExpTree* ParseGroup();
169 
170   // Parses a {...,...} quantifier and stores the range in the given
171   // out parameters.
172   bool ParseIntervalQuantifier(int* min_out, int* max_out);
173 
174   // Parses and returns a single escaped character.  The character
175   // must not be 'b' or 'B' since they are usually handle specially.
176   uc32 ParseClassCharacterEscape();
177 
178   // Checks whether the following is a length-digit hexadecimal number,
179   // and sets the value if it is.
180   bool ParseHexEscape(int length, uc32* value);
181   bool ParseUnicodeEscape(uc32* value);
182   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
183 
184   bool ParsePropertyClassName(ZoneVector<char>* name_1,
185                               ZoneVector<char>* name_2);
186   bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
187                              const ZoneVector<char>& name_1,
188                              const ZoneVector<char>& name_2);
189 
190   RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
191   RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
192 
193   uc32 ParseOctalLiteral();
194 
195   // Tries to parse the input as a back reference.  If successful it
196   // stores the result in the output parameter and returns true.  If
197   // it fails it will push back the characters read so the same characters
198   // can be reparsed.
199   bool ParseBackReferenceIndex(int* index_out);
200 
201   // Parse inside a class. Either add escaped class to the range, or return
202   // false and pass parsed single character through |char_out|.
203   void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
204                         bool add_unicode_case_equivalents, uc32* char_out,
205                         bool* is_class_escape);
206 
207   char ParseClassEscape();
208 
209   RegExpTree* ReportError(RegExpError error);
210   void Advance();
211   void Advance(int dist);
212   void Reset(int pos);
213 
214   // Reports whether the pattern might be used as a literal search string.
215   // Only use if the result of the parse is a single atom node.
216   bool simple();
contains_anchor()217   bool contains_anchor() { return contains_anchor_; }
set_contains_anchor()218   void set_contains_anchor() { contains_anchor_ = true; }
captures_started()219   int captures_started() { return captures_started_; }
position()220   int position() { return next_pos_ - 1; }
failed()221   bool failed() { return failed_; }
222   // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
223   // just read the initial flag value here.
unicode()224   bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
225 
226   static bool IsSyntaxCharacterOrSlash(uc32 c);
227 
228   static const uc32 kEndMarker = (1 << 21);
229 
230  private:
231   enum SubexpressionType {
232     INITIAL,
233     CAPTURE,  // All positive values represent captures.
234     POSITIVE_LOOKAROUND,
235     NEGATIVE_LOOKAROUND,
236     GROUPING
237   };
238 
239   class RegExpParserState : public ZoneObject {
240    public:
241     // Push a state on the stack.
RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<uc16> * capture_name,JSRegExp::Flags flags,Zone * zone)242     RegExpParserState(RegExpParserState* previous_state,
243                       SubexpressionType group_type,
244                       RegExpLookaround::Type lookaround_type,
245                       int disjunction_capture_index,
246                       const ZoneVector<uc16>* capture_name,
247                       JSRegExp::Flags flags, Zone* zone)
248         : previous_state_(previous_state),
249           builder_(zone->New<RegExpBuilder>(zone, flags)),
250           group_type_(group_type),
251           lookaround_type_(lookaround_type),
252           disjunction_capture_index_(disjunction_capture_index),
253           capture_name_(capture_name) {}
254     // Parser state of containing expression, if any.
previous_state()255     RegExpParserState* previous_state() const { return previous_state_; }
IsSubexpression()256     bool IsSubexpression() { return previous_state_ != nullptr; }
257     // RegExpBuilder building this regexp's AST.
builder()258     RegExpBuilder* builder() const { return builder_; }
259     // Type of regexp being parsed (parenthesized group or entire regexp).
group_type()260     SubexpressionType group_type() const { return group_type_; }
261     // Lookahead or Lookbehind.
lookaround_type()262     RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
263     // Index in captures array of first capture in this sub-expression, if any.
264     // Also the capture index of this sub-expression itself, if group_type
265     // is CAPTURE.
capture_index()266     int capture_index() const { return disjunction_capture_index_; }
267     // The name of the current sub-expression, if group_type is CAPTURE. Only
268     // used for named captures.
capture_name()269     const ZoneVector<uc16>* capture_name() const { return capture_name_; }
270 
IsNamedCapture()271     bool IsNamedCapture() const { return capture_name_ != nullptr; }
272 
273     // Check whether the parser is inside a capture group with the given index.
274     bool IsInsideCaptureGroup(int index);
275     // Check whether the parser is inside a capture group with the given name.
276     bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
277 
278    private:
279     // Linked list implementation of stack of states.
280     RegExpParserState* const previous_state_;
281     // Builder for the stored disjunction.
282     RegExpBuilder* const builder_;
283     // Stored disjunction type (capture, look-ahead or grouping), if any.
284     const SubexpressionType group_type_;
285     // Stored read direction.
286     const RegExpLookaround::Type lookaround_type_;
287     // Stored disjunction's capture index (if any).
288     const int disjunction_capture_index_;
289     // Stored capture name (if any).
290     const ZoneVector<uc16>* const capture_name_;
291   };
292 
293   // Return the 1-indexed RegExpCapture object, allocate if necessary.
294   RegExpCapture* GetCapture(int index);
295 
296   // Creates a new named capture at the specified index. Must be called exactly
297   // once for each named capture. Fails if a capture with the same name is
298   // encountered.
299   bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
300 
301   // Parses the name of a capture group (?<name>pattern). The name must adhere
302   // to IdentifierName in the ECMAScript standard.
303   const ZoneVector<uc16>* ParseCaptureGroupName();
304 
305   bool ParseNamedBackReference(RegExpBuilder* builder,
306                                RegExpParserState* state);
307   RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
308 
309   // After the initial parsing pass, patch corresponding RegExpCapture objects
310   // into all RegExpBackReferences. This is done after initial parsing in order
311   // to avoid complicating cases in which references comes before the capture.
312   void PatchNamedBackReferences();
313 
314   Handle<FixedArray> CreateCaptureNameMap();
315 
316   // Returns true iff the pattern contains named captures. May call
317   // ScanForCaptures to look ahead at the remaining pattern.
318   bool HasNamedCaptures();
319 
isolate()320   Isolate* isolate() { return isolate_; }
zone()321   Zone* zone() const { return zone_; }
322 
current()323   uc32 current() { return current_; }
has_more()324   bool has_more() { return has_more_; }
has_next()325   bool has_next() { return next_pos_ < in()->length(); }
326   uc32 Next();
327   template <bool update_position>
328   uc32 ReadNext();
in()329   FlatStringReader* in() { return in_; }
330   void ScanForCaptures();
331 
332   struct RegExpCaptureNameLess {
operatorRegExpCaptureNameLess333     bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
334       DCHECK_NOT_NULL(lhs);
335       DCHECK_NOT_NULL(rhs);
336       return *lhs->name() < *rhs->name();
337     }
338   };
339 
340   Isolate* isolate_;
341   Zone* zone_;
342   RegExpError error_ = RegExpError::kNone;
343   int error_pos_ = 0;
344   ZoneList<RegExpCapture*>* captures_;
345   ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
346   ZoneList<RegExpBackReference*>* named_back_references_;
347   FlatStringReader* in_;
348   uc32 current_;
349   // These are the flags specified outside the regexp syntax ie after the
350   // terminating '/' or in the second argument to the constructor.  The current
351   // flags are stored on the RegExpBuilder.
352   JSRegExp::Flags top_level_flags_;
353   int next_pos_;
354   int captures_started_;
355   int capture_count_;  // Only valid after we have scanned for captures.
356   bool has_more_;
357   bool simple_;
358   bool contains_anchor_;
359   bool is_scanned_for_captures_;
360   bool has_named_captures_;  // Only valid after we have scanned for captures.
361   bool failed_;
362 };
363 
364 }  // namespace internal
365 }  // namespace v8
366 
367 #endif  // V8_REGEXP_REGEXP_PARSER_H_
368