1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_PARSER_H_ 6 #define V8_REGEXP_REGEXP_PARSER_H_ 7 8 #include "src/objects.h" 9 #include "src/objects/js-regexp.h" 10 #include "src/regexp/regexp-ast.h" 11 #include "src/zone/zone.h" 12 13 namespace v8 { 14 namespace internal { 15 16 struct RegExpCompileData; 17 18 // A BufferedZoneList is an automatically growing list, just like (and backed 19 // by) a ZoneList, that is optimized for the case of adding and removing 20 // a single element. The last element added is stored outside the backing list, 21 // and if no more than one element is ever added, the ZoneList isn't even 22 // allocated. 23 // Elements must not be nullptr pointers. 24 template <typename T, int initial_size> 25 class BufferedZoneList { 26 public: BufferedZoneList()27 BufferedZoneList() : list_(nullptr), last_(nullptr) {} 28 29 // Adds element at end of list. This element is buffered and can 30 // be read using last() or removed using RemoveLast until a new Add or until 31 // RemoveLast or GetList has been called. Add(T * value,Zone * zone)32 void Add(T* value, Zone* zone) { 33 if (last_ != nullptr) { 34 if (list_ == nullptr) { 35 list_ = new (zone) ZoneList<T*>(initial_size, zone); 36 } 37 list_->Add(last_, zone); 38 } 39 last_ = value; 40 } 41 last()42 T* last() { 43 DCHECK(last_ != nullptr); 44 return last_; 45 } 46 RemoveLast()47 T* RemoveLast() { 48 DCHECK(last_ != nullptr); 49 T* result = last_; 50 if ((list_ != nullptr) && (list_->length() > 0)) 51 last_ = list_->RemoveLast(); 52 else 53 last_ = nullptr; 54 return result; 55 } 56 Get(int i)57 T* Get(int i) { 58 DCHECK((0 <= i) && (i < length())); 59 if (list_ == nullptr) { 60 DCHECK_EQ(0, i); 61 return last_; 62 } else { 63 if (i == list_->length()) { 64 DCHECK(last_ != nullptr); 65 return last_; 66 } else { 67 return list_->at(i); 68 } 69 } 70 } 71 Clear()72 void Clear() { 73 list_ = nullptr; 74 last_ = nullptr; 75 } 76 length()77 int length() { 78 int length = (list_ == nullptr) ? 0 : list_->length(); 79 return length + ((last_ == nullptr) ? 0 : 1); 80 } 81 GetList(Zone * zone)82 ZoneList<T*>* GetList(Zone* zone) { 83 if (list_ == nullptr) { 84 list_ = new (zone) ZoneList<T*>(initial_size, zone); 85 } 86 if (last_ != nullptr) { 87 list_->Add(last_, zone); 88 last_ = nullptr; 89 } 90 return list_; 91 } 92 93 private: 94 ZoneList<T*>* list_; 95 T* last_; 96 }; 97 98 99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 100 class RegExpBuilder : public ZoneObject { 101 public: 102 RegExpBuilder(Zone* zone, JSRegExp::Flags flags); 103 void AddCharacter(uc16 character); 104 void AddUnicodeCharacter(uc32 character); 105 void AddEscapedUnicodeCharacter(uc32 character); 106 // "Adds" an empty expression. Does nothing except consume a 107 // following quantifier 108 void AddEmpty(); 109 void AddCharacterClass(RegExpCharacterClass* cc); 110 void AddCharacterClassForDesugaring(uc32 c); 111 void AddAtom(RegExpTree* tree); 112 void AddTerm(RegExpTree* tree); 113 void AddAssertion(RegExpTree* tree); 114 void NewAlternative(); // '|' 115 bool AddQuantifierToAtom(int min, int max, 116 RegExpQuantifier::QuantifierType type); 117 void FlushText(); 118 RegExpTree* ToRegExp(); flags()119 JSRegExp::Flags flags() const { return flags_; } set_flags(JSRegExp::Flags flags)120 void set_flags(JSRegExp::Flags flags) { flags_ = flags; } 121 ignore_case()122 bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } multiline()123 bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } dotall()124 bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } 125 126 private: 127 static const uc16 kNoPendingSurrogate = 0; 128 void AddLeadSurrogate(uc16 lead_surrogate); 129 void AddTrailSurrogate(uc16 trail_surrogate); 130 void FlushPendingSurrogate(); 131 void FlushCharacters(); 132 void FlushTerms(); 133 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); 134 bool NeedsDesugaringForIgnoreCase(uc32 c); zone()135 Zone* zone() const { return zone_; } unicode()136 bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } 137 138 Zone* zone_; 139 bool pending_empty_; 140 JSRegExp::Flags flags_; 141 ZoneList<uc16>* characters_; 142 uc16 pending_surrogate_; 143 BufferedZoneList<RegExpTree, 2> terms_; 144 BufferedZoneList<RegExpTree, 2> text_; 145 BufferedZoneList<RegExpTree, 2> alternatives_; 146 #ifdef DEBUG 147 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; 148 #define LAST(x) last_added_ = x; 149 #else 150 #define LAST(x) 151 #endif 152 }; 153 154 155 class RegExpParser BASE_EMBEDDED { 156 public: 157 RegExpParser(FlatStringReader* in, Handle<String>* error, 158 JSRegExp::Flags flags, Isolate* isolate, Zone* zone); 159 160 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, 161 JSRegExp::Flags flags, RegExpCompileData* result); 162 163 RegExpTree* ParsePattern(); 164 RegExpTree* ParseDisjunction(); 165 RegExpTree* ParseGroup(); 166 167 // Parses a {...,...} quantifier and stores the range in the given 168 // out parameters. 169 bool ParseIntervalQuantifier(int* min_out, int* max_out); 170 171 // Parses and returns a single escaped character. The character 172 // must not be 'b' or 'B' since they are usually handle specially. 173 uc32 ParseClassCharacterEscape(); 174 175 // Checks whether the following is a length-digit hexadecimal number, 176 // and sets the value if it is. 177 bool ParseHexEscape(int length, uc32* value); 178 bool ParseUnicodeEscape(uc32* value); 179 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); 180 bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate); 181 RegExpTree* ParseCharacterClass(const RegExpBuilder* state); 182 183 uc32 ParseOctalLiteral(); 184 185 // Tries to parse the input as a back reference. If successful it 186 // stores the result in the output parameter and returns true. If 187 // it fails it will push back the characters read so the same characters 188 // can be reparsed. 189 bool ParseBackReferenceIndex(int* index_out); 190 191 // Parse inside a class. Either add escaped class to the range, or return 192 // false and pass parsed single character through |char_out|. 193 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, 194 bool add_unicode_case_equivalents, uc32* char_out, 195 bool* is_class_escape); 196 197 char ParseClassEscape(); 198 199 RegExpTree* ReportError(Vector<const char> message); 200 void Advance(); 201 void Advance(int dist); 202 void Reset(int pos); 203 204 // Reports whether the pattern might be used as a literal search string. 205 // Only use if the result of the parse is a single atom node. 206 bool simple(); contains_anchor()207 bool contains_anchor() { return contains_anchor_; } set_contains_anchor()208 void set_contains_anchor() { contains_anchor_ = true; } captures_started()209 int captures_started() { return captures_started_; } position()210 int position() { return next_pos_ - 1; } failed()211 bool failed() { return failed_; } 212 // The Unicode flag can't be changed using in-regexp syntax, so it's OK to 213 // just read the initial flag value here. unicode()214 bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } 215 216 static bool IsSyntaxCharacterOrSlash(uc32 c); 217 218 static const int kMaxCaptures = 1 << 16; 219 static const uc32 kEndMarker = (1 << 21); 220 221 private: 222 enum SubexpressionType { 223 INITIAL, 224 CAPTURE, // All positive values represent captures. 225 POSITIVE_LOOKAROUND, 226 NEGATIVE_LOOKAROUND, 227 GROUPING 228 }; 229 230 class RegExpParserState : public ZoneObject { 231 public: 232 // Push a state on the stack. RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<uc16> * capture_name,JSRegExp::Flags flags,Zone * zone)233 RegExpParserState(RegExpParserState* previous_state, 234 SubexpressionType group_type, 235 RegExpLookaround::Type lookaround_type, 236 int disjunction_capture_index, 237 const ZoneVector<uc16>* capture_name, 238 JSRegExp::Flags flags, Zone* zone) 239 : previous_state_(previous_state), 240 builder_(new (zone) RegExpBuilder(zone, flags)), 241 group_type_(group_type), 242 lookaround_type_(lookaround_type), 243 disjunction_capture_index_(disjunction_capture_index), 244 capture_name_(capture_name) {} 245 // Parser state of containing expression, if any. previous_state()246 RegExpParserState* previous_state() const { return previous_state_; } IsSubexpression()247 bool IsSubexpression() { return previous_state_ != nullptr; } 248 // RegExpBuilder building this regexp's AST. builder()249 RegExpBuilder* builder() const { return builder_; } 250 // Type of regexp being parsed (parenthesized group or entire regexp). group_type()251 SubexpressionType group_type() const { return group_type_; } 252 // Lookahead or Lookbehind. lookaround_type()253 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } 254 // Index in captures array of first capture in this sub-expression, if any. 255 // Also the capture index of this sub-expression itself, if group_type 256 // is CAPTURE. capture_index()257 int capture_index() const { return disjunction_capture_index_; } 258 // The name of the current sub-expression, if group_type is CAPTURE. Only 259 // used for named captures. capture_name()260 const ZoneVector<uc16>* capture_name() const { return capture_name_; } 261 IsNamedCapture()262 bool IsNamedCapture() const { return capture_name_ != nullptr; } 263 264 // Check whether the parser is inside a capture group with the given index. 265 bool IsInsideCaptureGroup(int index); 266 // Check whether the parser is inside a capture group with the given name. 267 bool IsInsideCaptureGroup(const ZoneVector<uc16>* name); 268 269 private: 270 // Linked list implementation of stack of states. 271 RegExpParserState* const previous_state_; 272 // Builder for the stored disjunction. 273 RegExpBuilder* const builder_; 274 // Stored disjunction type (capture, look-ahead or grouping), if any. 275 const SubexpressionType group_type_; 276 // Stored read direction. 277 const RegExpLookaround::Type lookaround_type_; 278 // Stored disjunction's capture index (if any). 279 const int disjunction_capture_index_; 280 // Stored capture name (if any). 281 const ZoneVector<uc16>* const capture_name_; 282 }; 283 284 // Return the 1-indexed RegExpCapture object, allocate if necessary. 285 RegExpCapture* GetCapture(int index); 286 287 // Creates a new named capture at the specified index. Must be called exactly 288 // once for each named capture. Fails if a capture with the same name is 289 // encountered. 290 bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index); 291 292 // Parses the name of a capture group (?<name>pattern). The name must adhere 293 // to IdentifierName in the ECMAScript standard. 294 const ZoneVector<uc16>* ParseCaptureGroupName(); 295 296 bool ParseNamedBackReference(RegExpBuilder* builder, 297 RegExpParserState* state); 298 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); 299 300 // After the initial parsing pass, patch corresponding RegExpCapture objects 301 // into all RegExpBackReferences. This is done after initial parsing in order 302 // to avoid complicating cases in which references comes before the capture. 303 void PatchNamedBackReferences(); 304 305 Handle<FixedArray> CreateCaptureNameMap(); 306 307 // Returns true iff the pattern contains named captures. May call 308 // ScanForCaptures to look ahead at the remaining pattern. 309 bool HasNamedCaptures(); 310 isolate()311 Isolate* isolate() { return isolate_; } zone()312 Zone* zone() const { return zone_; } 313 current()314 uc32 current() { return current_; } has_more()315 bool has_more() { return has_more_; } has_next()316 bool has_next() { return next_pos_ < in()->length(); } 317 uc32 Next(); 318 template <bool update_position> 319 uc32 ReadNext(); in()320 FlatStringReader* in() { return in_; } 321 void ScanForCaptures(); 322 323 Isolate* isolate_; 324 Zone* zone_; 325 Handle<String>* error_; 326 ZoneList<RegExpCapture*>* captures_; 327 ZoneList<RegExpCapture*>* named_captures_; 328 ZoneList<RegExpBackReference*>* named_back_references_; 329 FlatStringReader* in_; 330 uc32 current_; 331 // These are the flags specified outside the regexp syntax ie after the 332 // terminating '/' or in the second argument to the constructor. The current 333 // flags are stored on the RegExpBuilder. 334 JSRegExp::Flags top_level_flags_; 335 int next_pos_; 336 int captures_started_; 337 int capture_count_; // Only valid after we have scanned for captures. 338 bool has_more_; 339 bool simple_; 340 bool contains_anchor_; 341 bool is_scanned_for_captures_; 342 bool has_named_captures_; // Only valid after we have scanned for captures. 343 bool failed_; 344 }; 345 346 } // namespace internal 347 } // namespace v8 348 349 #endif // V8_REGEXP_REGEXP_PARSER_H_ 350