1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_PARSER_H_ 6 #define V8_REGEXP_REGEXP_PARSER_H_ 7 8 #include "src/objects.h" 9 #include "src/regexp/regexp-ast.h" 10 #include "src/zone.h" 11 12 namespace v8 { 13 namespace internal { 14 15 struct RegExpCompileData; 16 17 18 // A BufferedZoneList is an automatically growing list, just like (and backed 19 // by) a ZoneList, that is optimized for the case of adding and removing 20 // a single element. The last element added is stored outside the backing list, 21 // and if no more than one element is ever added, the ZoneList isn't even 22 // allocated. 23 // Elements must not be NULL pointers. 24 template <typename T, int initial_size> 25 class BufferedZoneList { 26 public: BufferedZoneList()27 BufferedZoneList() : list_(NULL), last_(NULL) {} 28 29 // Adds element at end of list. This element is buffered and can 30 // be read using last() or removed using RemoveLast until a new Add or until 31 // RemoveLast or GetList has been called. Add(T * value,Zone * zone)32 void Add(T* value, Zone* zone) { 33 if (last_ != NULL) { 34 if (list_ == NULL) { 35 list_ = new (zone) ZoneList<T*>(initial_size, zone); 36 } 37 list_->Add(last_, zone); 38 } 39 last_ = value; 40 } 41 last()42 T* last() { 43 DCHECK(last_ != NULL); 44 return last_; 45 } 46 RemoveLast()47 T* RemoveLast() { 48 DCHECK(last_ != NULL); 49 T* result = last_; 50 if ((list_ != NULL) && (list_->length() > 0)) 51 last_ = list_->RemoveLast(); 52 else 53 last_ = NULL; 54 return result; 55 } 56 Get(int i)57 T* Get(int i) { 58 DCHECK((0 <= i) && (i < length())); 59 if (list_ == NULL) { 60 DCHECK_EQ(0, i); 61 return last_; 62 } else { 63 if (i == list_->length()) { 64 DCHECK(last_ != NULL); 65 return last_; 66 } else { 67 return list_->at(i); 68 } 69 } 70 } 71 Clear()72 void Clear() { 73 list_ = NULL; 74 last_ = NULL; 75 } 76 length()77 int length() { 78 int length = (list_ == NULL) ? 0 : list_->length(); 79 return length + ((last_ == NULL) ? 0 : 1); 80 } 81 GetList(Zone * zone)82 ZoneList<T*>* GetList(Zone* zone) { 83 if (list_ == NULL) { 84 list_ = new (zone) ZoneList<T*>(initial_size, zone); 85 } 86 if (last_ != NULL) { 87 list_->Add(last_, zone); 88 last_ = NULL; 89 } 90 return list_; 91 } 92 93 private: 94 ZoneList<T*>* list_; 95 T* last_; 96 }; 97 98 99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 100 class RegExpBuilder : public ZoneObject { 101 public: 102 RegExpBuilder(Zone* zone, bool ignore_case, bool unicode); 103 void AddCharacter(uc16 character); 104 void AddUnicodeCharacter(uc32 character); 105 void AddEscapedUnicodeCharacter(uc32 character); 106 // "Adds" an empty expression. Does nothing except consume a 107 // following quantifier 108 void AddEmpty(); 109 void AddCharacterClass(RegExpCharacterClass* cc); 110 void AddCharacterClassForDesugaring(uc32 c); 111 void AddAtom(RegExpTree* tree); 112 void AddTerm(RegExpTree* tree); 113 void AddAssertion(RegExpTree* tree); 114 void NewAlternative(); // '|' 115 bool AddQuantifierToAtom(int min, int max, 116 RegExpQuantifier::QuantifierType type); 117 RegExpTree* ToRegExp(); 118 119 private: 120 static const uc16 kNoPendingSurrogate = 0; 121 void AddLeadSurrogate(uc16 lead_surrogate); 122 void AddTrailSurrogate(uc16 trail_surrogate); 123 void FlushPendingSurrogate(); 124 void FlushCharacters(); 125 void FlushText(); 126 void FlushTerms(); 127 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); 128 bool NeedsDesugaringForIgnoreCase(uc32 c); zone()129 Zone* zone() const { return zone_; } ignore_case()130 bool ignore_case() const { return ignore_case_; } unicode()131 bool unicode() const { return unicode_; } 132 133 Zone* zone_; 134 bool pending_empty_; 135 bool ignore_case_; 136 bool unicode_; 137 ZoneList<uc16>* characters_; 138 uc16 pending_surrogate_; 139 BufferedZoneList<RegExpTree, 2> terms_; 140 BufferedZoneList<RegExpTree, 2> text_; 141 BufferedZoneList<RegExpTree, 2> alternatives_; 142 #ifdef DEBUG 143 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; 144 #define LAST(x) last_added_ = x; 145 #else 146 #define LAST(x) 147 #endif 148 }; 149 150 151 class RegExpParser BASE_EMBEDDED { 152 public: 153 RegExpParser(FlatStringReader* in, Handle<String>* error, 154 JSRegExp::Flags flags, Isolate* isolate, Zone* zone); 155 156 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, 157 JSRegExp::Flags flags, RegExpCompileData* result); 158 159 RegExpTree* ParsePattern(); 160 RegExpTree* ParseDisjunction(); 161 RegExpTree* ParseGroup(); 162 RegExpTree* ParseCharacterClass(); 163 164 // Parses a {...,...} quantifier and stores the range in the given 165 // out parameters. 166 bool ParseIntervalQuantifier(int* min_out, int* max_out); 167 168 // Parses and returns a single escaped character. The character 169 // must not be 'b' or 'B' since they are usually handle specially. 170 uc32 ParseClassCharacterEscape(); 171 172 // Checks whether the following is a length-digit hexadecimal number, 173 // and sets the value if it is. 174 bool ParseHexEscape(int length, uc32* value); 175 bool ParseUnicodeEscape(uc32* value); 176 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); 177 bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate); 178 179 uc32 ParseOctalLiteral(); 180 181 // Tries to parse the input as a back reference. If successful it 182 // stores the result in the output parameter and returns true. If 183 // it fails it will push back the characters read so the same characters 184 // can be reparsed. 185 bool ParseBackReferenceIndex(int* index_out); 186 187 bool ParseClassProperty(ZoneList<CharacterRange>* result); 188 CharacterRange ParseClassAtom(uc16* char_class); 189 RegExpTree* ReportError(Vector<const char> message); 190 void Advance(); 191 void Advance(int dist); 192 void Reset(int pos); 193 194 // Reports whether the pattern might be used as a literal search string. 195 // Only use if the result of the parse is a single atom node. 196 bool simple(); contains_anchor()197 bool contains_anchor() { return contains_anchor_; } set_contains_anchor()198 void set_contains_anchor() { contains_anchor_ = true; } captures_started()199 int captures_started() { return captures_started_; } position()200 int position() { return next_pos_ - 1; } failed()201 bool failed() { return failed_; } ignore_case()202 bool ignore_case() const { return ignore_case_; } multiline()203 bool multiline() const { return multiline_; } unicode()204 bool unicode() const { return unicode_; } 205 206 static bool IsSyntaxCharacterOrSlash(uc32 c); 207 208 static const int kMaxCaptures = 1 << 16; 209 static const uc32 kEndMarker = (1 << 21); 210 211 private: 212 enum SubexpressionType { 213 INITIAL, 214 CAPTURE, // All positive values represent captures. 215 POSITIVE_LOOKAROUND, 216 NEGATIVE_LOOKAROUND, 217 GROUPING 218 }; 219 220 class RegExpParserState : public ZoneObject { 221 public: RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<uc16> * capture_name,bool ignore_case,bool unicode,Zone * zone)222 RegExpParserState(RegExpParserState* previous_state, 223 SubexpressionType group_type, 224 RegExpLookaround::Type lookaround_type, 225 int disjunction_capture_index, 226 const ZoneVector<uc16>* capture_name, bool ignore_case, 227 bool unicode, Zone* zone) 228 : previous_state_(previous_state), 229 builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)), 230 group_type_(group_type), 231 lookaround_type_(lookaround_type), 232 disjunction_capture_index_(disjunction_capture_index), 233 capture_name_(capture_name) {} 234 // Parser state of containing expression, if any. previous_state()235 RegExpParserState* previous_state() { return previous_state_; } IsSubexpression()236 bool IsSubexpression() { return previous_state_ != NULL; } 237 // RegExpBuilder building this regexp's AST. builder()238 RegExpBuilder* builder() { return builder_; } 239 // Type of regexp being parsed (parenthesized group or entire regexp). group_type()240 SubexpressionType group_type() { return group_type_; } 241 // Lookahead or Lookbehind. lookaround_type()242 RegExpLookaround::Type lookaround_type() { return lookaround_type_; } 243 // Index in captures array of first capture in this sub-expression, if any. 244 // Also the capture index of this sub-expression itself, if group_type 245 // is CAPTURE. capture_index()246 int capture_index() { return disjunction_capture_index_; } 247 // The name of the current sub-expression, if group_type is CAPTURE. Only 248 // used for named captures. capture_name()249 const ZoneVector<uc16>* capture_name() { return capture_name_; } 250 IsNamedCapture()251 bool IsNamedCapture() const { return capture_name_ != nullptr; } 252 253 // Check whether the parser is inside a capture group with the given index. 254 bool IsInsideCaptureGroup(int index); 255 // Check whether the parser is inside a capture group with the given name. 256 bool IsInsideCaptureGroup(const ZoneVector<uc16>* name); 257 258 private: 259 // Linked list implementation of stack of states. 260 RegExpParserState* previous_state_; 261 // Builder for the stored disjunction. 262 RegExpBuilder* builder_; 263 // Stored disjunction type (capture, look-ahead or grouping), if any. 264 SubexpressionType group_type_; 265 // Stored read direction. 266 RegExpLookaround::Type lookaround_type_; 267 // Stored disjunction's capture index (if any). 268 int disjunction_capture_index_; 269 // Stored capture name (if any). 270 const ZoneVector<uc16>* capture_name_; 271 }; 272 273 // Return the 1-indexed RegExpCapture object, allocate if necessary. 274 RegExpCapture* GetCapture(int index); 275 276 // Creates a new named capture at the specified index. Must be called exactly 277 // once for each named capture. Fails if a capture with the same name is 278 // encountered. 279 bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index); 280 281 // Parses the name of a capture group (?<name>pattern). The name must adhere 282 // to IdentifierName in the ECMAScript standard. 283 const ZoneVector<uc16>* ParseCaptureGroupName(); 284 285 bool ParseNamedBackReference(RegExpBuilder* builder, 286 RegExpParserState* state); 287 288 // After the initial parsing pass, patch corresponding RegExpCapture objects 289 // into all RegExpBackReferences. This is done after initial parsing in order 290 // to avoid complicating cases in which references comes before the capture. 291 void PatchNamedBackReferences(); 292 293 Handle<FixedArray> CreateCaptureNameMap(); 294 isolate()295 Isolate* isolate() { return isolate_; } zone()296 Zone* zone() const { return zone_; } 297 current()298 uc32 current() { return current_; } has_more()299 bool has_more() { return has_more_; } has_next()300 bool has_next() { return next_pos_ < in()->length(); } 301 uc32 Next(); 302 template <bool update_position> 303 uc32 ReadNext(); in()304 FlatStringReader* in() { return in_; } 305 void ScanForCaptures(); 306 307 Isolate* isolate_; 308 Zone* zone_; 309 Handle<String>* error_; 310 ZoneList<RegExpCapture*>* captures_; 311 ZoneList<RegExpCapture*>* named_captures_; 312 ZoneList<RegExpBackReference*>* named_back_references_; 313 FlatStringReader* in_; 314 uc32 current_; 315 bool ignore_case_; 316 bool multiline_; 317 bool unicode_; 318 int next_pos_; 319 int captures_started_; 320 // The capture count is only valid after we have scanned for captures. 321 int capture_count_; 322 bool has_more_; 323 bool simple_; 324 bool contains_anchor_; 325 bool is_scanned_for_captures_; 326 bool failed_; 327 }; 328 329 } // namespace internal 330 } // namespace v8 331 332 #endif // V8_REGEXP_REGEXP_PARSER_H_ 333