1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_PARSER_H_ 6 #define V8_REGEXP_REGEXP_PARSER_H_ 7 8 #include "src/objects/js-regexp.h" 9 #include "src/objects/objects.h" 10 #include "src/regexp/regexp-ast.h" 11 #include "src/regexp/regexp-error.h" 12 #include "src/zone/zone.h" 13 14 namespace v8 { 15 namespace internal { 16 17 struct RegExpCompileData; 18 19 // A BufferedZoneList is an automatically growing list, just like (and backed 20 // by) a ZoneList, that is optimized for the case of adding and removing 21 // a single element. The last element added is stored outside the backing list, 22 // and if no more than one element is ever added, the ZoneList isn't even 23 // allocated. 24 // Elements must not be nullptr pointers. 25 template <typename T, int initial_size> 26 class BufferedZoneList { 27 public: BufferedZoneList()28 BufferedZoneList() : list_(nullptr), last_(nullptr) {} 29 30 // Adds element at end of list. This element is buffered and can 31 // be read using last() or removed using RemoveLast until a new Add or until 32 // RemoveLast or GetList has been called. Add(T * value,Zone * zone)33 void Add(T* value, Zone* zone) { 34 if (last_ != nullptr) { 35 if (list_ == nullptr) { 36 list_ = zone->New<ZoneList<T*>>(initial_size, zone); 37 } 38 list_->Add(last_, zone); 39 } 40 last_ = value; 41 } 42 last()43 T* last() { 44 DCHECK(last_ != nullptr); 45 return last_; 46 } 47 RemoveLast()48 T* RemoveLast() { 49 DCHECK(last_ != nullptr); 50 T* result = last_; 51 if ((list_ != nullptr) && (list_->length() > 0)) 52 last_ = list_->RemoveLast(); 53 else 54 last_ = nullptr; 55 return result; 56 } 57 Get(int i)58 T* Get(int i) { 59 DCHECK((0 <= i) && (i < length())); 60 if (list_ == nullptr) { 61 DCHECK_EQ(0, i); 62 return last_; 63 } else { 64 if (i == list_->length()) { 65 DCHECK(last_ != nullptr); 66 return last_; 67 } else { 68 return list_->at(i); 69 } 70 } 71 } 72 Clear()73 void Clear() { 74 list_ = nullptr; 75 last_ = nullptr; 76 } 77 length()78 int length() { 79 int length = (list_ == nullptr) ? 0 : list_->length(); 80 return length + ((last_ == nullptr) ? 0 : 1); 81 } 82 GetList(Zone * zone)83 ZoneList<T*>* GetList(Zone* zone) { 84 if (list_ == nullptr) { 85 list_ = zone->New<ZoneList<T*>>(initial_size, zone); 86 } 87 if (last_ != nullptr) { 88 list_->Add(last_, zone); 89 last_ = nullptr; 90 } 91 return list_; 92 } 93 94 private: 95 ZoneList<T*>* list_; 96 T* last_; 97 }; 98 99 100 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 101 class RegExpBuilder : public ZoneObject { 102 public: 103 RegExpBuilder(Zone* zone, JSRegExp::Flags flags); 104 void AddCharacter(uc16 character); 105 void AddUnicodeCharacter(uc32 character); 106 void AddEscapedUnicodeCharacter(uc32 character); 107 // "Adds" an empty expression. Does nothing except consume a 108 // following quantifier 109 void AddEmpty(); 110 void AddCharacterClass(RegExpCharacterClass* cc); 111 void AddCharacterClassForDesugaring(uc32 c); 112 void AddAtom(RegExpTree* tree); 113 void AddTerm(RegExpTree* tree); 114 void AddAssertion(RegExpTree* tree); 115 void NewAlternative(); // '|' 116 bool AddQuantifierToAtom(int min, int max, 117 RegExpQuantifier::QuantifierType type); 118 void FlushText(); 119 RegExpTree* ToRegExp(); flags()120 JSRegExp::Flags flags() const { return flags_; } set_flags(JSRegExp::Flags flags)121 void set_flags(JSRegExp::Flags flags) { flags_ = flags; } 122 ignore_case()123 bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } multiline()124 bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } dotall()125 bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } 126 127 private: 128 static const uc16 kNoPendingSurrogate = 0; 129 void AddLeadSurrogate(uc16 lead_surrogate); 130 void AddTrailSurrogate(uc16 trail_surrogate); 131 void FlushPendingSurrogate(); 132 void FlushCharacters(); 133 void FlushTerms(); 134 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); 135 bool NeedsDesugaringForIgnoreCase(uc32 c); zone()136 Zone* zone() const { return zone_; } unicode()137 bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } 138 139 Zone* zone_; 140 bool pending_empty_; 141 JSRegExp::Flags flags_; 142 ZoneList<uc16>* characters_; 143 uc16 pending_surrogate_; 144 BufferedZoneList<RegExpTree, 2> terms_; 145 BufferedZoneList<RegExpTree, 2> text_; 146 BufferedZoneList<RegExpTree, 2> alternatives_; 147 #ifdef DEBUG 148 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; 149 #define LAST(x) last_added_ = x; 150 #else 151 #define LAST(x) 152 #endif 153 }; 154 155 class V8_EXPORT_PRIVATE RegExpParser { 156 public: 157 RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate, 158 Zone* zone); 159 160 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, 161 JSRegExp::Flags flags, RegExpCompileData* result); 162 163 private: 164 bool Parse(RegExpCompileData* result, const DisallowHeapAllocation&); 165 166 RegExpTree* ParsePattern(); 167 RegExpTree* ParseDisjunction(); 168 RegExpTree* ParseGroup(); 169 170 // Parses a {...,...} quantifier and stores the range in the given 171 // out parameters. 172 bool ParseIntervalQuantifier(int* min_out, int* max_out); 173 174 // Parses and returns a single escaped character. The character 175 // must not be 'b' or 'B' since they are usually handle specially. 176 uc32 ParseClassCharacterEscape(); 177 178 // Checks whether the following is a length-digit hexadecimal number, 179 // and sets the value if it is. 180 bool ParseHexEscape(int length, uc32* value); 181 bool ParseUnicodeEscape(uc32* value); 182 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); 183 184 bool ParsePropertyClassName(ZoneVector<char>* name_1, 185 ZoneVector<char>* name_2); 186 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, 187 const ZoneVector<char>& name_1, 188 const ZoneVector<char>& name_2); 189 190 RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1); 191 RegExpTree* ParseCharacterClass(const RegExpBuilder* state); 192 193 uc32 ParseOctalLiteral(); 194 195 // Tries to parse the input as a back reference. If successful it 196 // stores the result in the output parameter and returns true. If 197 // it fails it will push back the characters read so the same characters 198 // can be reparsed. 199 bool ParseBackReferenceIndex(int* index_out); 200 201 // Parse inside a class. Either add escaped class to the range, or return 202 // false and pass parsed single character through |char_out|. 203 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, 204 bool add_unicode_case_equivalents, uc32* char_out, 205 bool* is_class_escape); 206 207 char ParseClassEscape(); 208 209 RegExpTree* ReportError(RegExpError error); 210 void Advance(); 211 void Advance(int dist); 212 void Reset(int pos); 213 214 // Reports whether the pattern might be used as a literal search string. 215 // Only use if the result of the parse is a single atom node. 216 bool simple(); contains_anchor()217 bool contains_anchor() { return contains_anchor_; } set_contains_anchor()218 void set_contains_anchor() { contains_anchor_ = true; } captures_started()219 int captures_started() { return captures_started_; } position()220 int position() { return next_pos_ - 1; } failed()221 bool failed() { return failed_; } 222 // The Unicode flag can't be changed using in-regexp syntax, so it's OK to 223 // just read the initial flag value here. unicode()224 bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } 225 226 static bool IsSyntaxCharacterOrSlash(uc32 c); 227 228 static const uc32 kEndMarker = (1 << 21); 229 230 private: 231 enum SubexpressionType { 232 INITIAL, 233 CAPTURE, // All positive values represent captures. 234 POSITIVE_LOOKAROUND, 235 NEGATIVE_LOOKAROUND, 236 GROUPING 237 }; 238 239 class RegExpParserState : public ZoneObject { 240 public: 241 // Push a state on the stack. RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<uc16> * capture_name,JSRegExp::Flags flags,Zone * zone)242 RegExpParserState(RegExpParserState* previous_state, 243 SubexpressionType group_type, 244 RegExpLookaround::Type lookaround_type, 245 int disjunction_capture_index, 246 const ZoneVector<uc16>* capture_name, 247 JSRegExp::Flags flags, Zone* zone) 248 : previous_state_(previous_state), 249 builder_(zone->New<RegExpBuilder>(zone, flags)), 250 group_type_(group_type), 251 lookaround_type_(lookaround_type), 252 disjunction_capture_index_(disjunction_capture_index), 253 capture_name_(capture_name) {} 254 // Parser state of containing expression, if any. previous_state()255 RegExpParserState* previous_state() const { return previous_state_; } IsSubexpression()256 bool IsSubexpression() { return previous_state_ != nullptr; } 257 // RegExpBuilder building this regexp's AST. builder()258 RegExpBuilder* builder() const { return builder_; } 259 // Type of regexp being parsed (parenthesized group or entire regexp). group_type()260 SubexpressionType group_type() const { return group_type_; } 261 // Lookahead or Lookbehind. lookaround_type()262 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } 263 // Index in captures array of first capture in this sub-expression, if any. 264 // Also the capture index of this sub-expression itself, if group_type 265 // is CAPTURE. capture_index()266 int capture_index() const { return disjunction_capture_index_; } 267 // The name of the current sub-expression, if group_type is CAPTURE. Only 268 // used for named captures. capture_name()269 const ZoneVector<uc16>* capture_name() const { return capture_name_; } 270 IsNamedCapture()271 bool IsNamedCapture() const { return capture_name_ != nullptr; } 272 273 // Check whether the parser is inside a capture group with the given index. 274 bool IsInsideCaptureGroup(int index); 275 // Check whether the parser is inside a capture group with the given name. 276 bool IsInsideCaptureGroup(const ZoneVector<uc16>* name); 277 278 private: 279 // Linked list implementation of stack of states. 280 RegExpParserState* const previous_state_; 281 // Builder for the stored disjunction. 282 RegExpBuilder* const builder_; 283 // Stored disjunction type (capture, look-ahead or grouping), if any. 284 const SubexpressionType group_type_; 285 // Stored read direction. 286 const RegExpLookaround::Type lookaround_type_; 287 // Stored disjunction's capture index (if any). 288 const int disjunction_capture_index_; 289 // Stored capture name (if any). 290 const ZoneVector<uc16>* const capture_name_; 291 }; 292 293 // Return the 1-indexed RegExpCapture object, allocate if necessary. 294 RegExpCapture* GetCapture(int index); 295 296 // Creates a new named capture at the specified index. Must be called exactly 297 // once for each named capture. Fails if a capture with the same name is 298 // encountered. 299 bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index); 300 301 // Parses the name of a capture group (?<name>pattern). The name must adhere 302 // to IdentifierName in the ECMAScript standard. 303 const ZoneVector<uc16>* ParseCaptureGroupName(); 304 305 bool ParseNamedBackReference(RegExpBuilder* builder, 306 RegExpParserState* state); 307 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); 308 309 // After the initial parsing pass, patch corresponding RegExpCapture objects 310 // into all RegExpBackReferences. This is done after initial parsing in order 311 // to avoid complicating cases in which references comes before the capture. 312 void PatchNamedBackReferences(); 313 314 Handle<FixedArray> CreateCaptureNameMap(); 315 316 // Returns true iff the pattern contains named captures. May call 317 // ScanForCaptures to look ahead at the remaining pattern. 318 bool HasNamedCaptures(); 319 isolate()320 Isolate* isolate() { return isolate_; } zone()321 Zone* zone() const { return zone_; } 322 current()323 uc32 current() { return current_; } has_more()324 bool has_more() { return has_more_; } has_next()325 bool has_next() { return next_pos_ < in()->length(); } 326 uc32 Next(); 327 template <bool update_position> 328 uc32 ReadNext(); in()329 FlatStringReader* in() { return in_; } 330 void ScanForCaptures(); 331 332 struct RegExpCaptureNameLess { operatorRegExpCaptureNameLess333 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { 334 DCHECK_NOT_NULL(lhs); 335 DCHECK_NOT_NULL(rhs); 336 return *lhs->name() < *rhs->name(); 337 } 338 }; 339 340 Isolate* isolate_; 341 Zone* zone_; 342 RegExpError error_ = RegExpError::kNone; 343 int error_pos_ = 0; 344 ZoneList<RegExpCapture*>* captures_; 345 ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; 346 ZoneList<RegExpBackReference*>* named_back_references_; 347 FlatStringReader* in_; 348 uc32 current_; 349 // These are the flags specified outside the regexp syntax ie after the 350 // terminating '/' or in the second argument to the constructor. The current 351 // flags are stored on the RegExpBuilder. 352 JSRegExp::Flags top_level_flags_; 353 int next_pos_; 354 int captures_started_; 355 int capture_count_; // Only valid after we have scanned for captures. 356 bool has_more_; 357 bool simple_; 358 bool contains_anchor_; 359 bool is_scanned_for_captures_; 360 bool has_named_captures_; // Only valid after we have scanned for captures. 361 bool failed_; 362 }; 363 364 } // namespace internal 365 } // namespace v8 366 367 #endif // V8_REGEXP_REGEXP_PARSER_H_ 368