1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_AST_H_ 6 #define V8_REGEXP_REGEXP_AST_H_ 7 8 #include "src/base/strings.h" 9 #include "src/regexp/regexp-flags.h" 10 #include "src/zone/zone-containers.h" 11 #include "src/zone/zone-list.h" 12 #include "src/zone/zone.h" 13 14 namespace v8 { 15 namespace internal { 16 17 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 18 VISIT(Disjunction) \ 19 VISIT(Alternative) \ 20 VISIT(Assertion) \ 21 VISIT(CharacterClass) \ 22 VISIT(Atom) \ 23 VISIT(Quantifier) \ 24 VISIT(Capture) \ 25 VISIT(Group) \ 26 VISIT(Lookaround) \ 27 VISIT(BackReference) \ 28 VISIT(Empty) \ 29 VISIT(Text) 30 31 #define FORWARD_DECLARE(Name) class RegExp##Name; 32 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 33 #undef FORWARD_DECLARE 34 35 class RegExpCompiler; 36 class RegExpNode; 37 class RegExpTree; 38 39 class RegExpVisitor { 40 public: 41 virtual ~RegExpVisitor() = default; 42 #define MAKE_CASE(Name) \ 43 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; 44 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 45 #undef MAKE_CASE 46 }; 47 48 // A simple closed interval. 49 class Interval { 50 public: Interval()51 Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). Interval(int from,int to)52 Interval(int from, int to) : from_(from), to_(to) {} Union(Interval that)53 Interval Union(Interval that) { 54 if (that.from_ == kNone) return *this; 55 if (from_ == kNone) return that; 56 return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); 57 } 58 Empty()59 static Interval Empty() { return Interval(); } 60 Contains(int value)61 bool Contains(int value) const { return (from_ <= value) && (value <= to_); } is_empty()62 bool is_empty() const { return from_ == kNone; } from()63 int from() const { return from_; } to()64 int to() const { return to_; } size()65 int size() const { return to_ - from_ + 1; } 66 67 static constexpr int kNone = -1; 68 69 private: 70 int from_; 71 int to_; 72 }; 73 74 // Named standard character sets. 75 enum class StandardCharacterSet : char { 76 kWhitespace = 's', // Like /\s/. 77 kNotWhitespace = 'S', // Like /\S/. 78 kWord = 'w', // Like /\w/. 79 kNotWord = 'W', // Like /\W/. 80 kDigit = 'd', // Like /\d/. 81 kNotDigit = 'D', // Like /\D/. 82 kLineTerminator = 'n', // The inverse of /./. 83 kNotLineTerminator = '.', // Like /./. 84 kEverything = '*', // Matches every character, like /./s. 85 }; 86 87 // Represents code points (with values up to 0x10FFFF) in the range from from_ 88 // to to_, both ends are inclusive. 89 class CharacterRange { 90 public: 91 CharacterRange() = default; 92 // For compatibility with the CHECK_OK macro. CharacterRange(void * null)93 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 94 Singleton(base::uc32 value)95 static inline CharacterRange Singleton(base::uc32 value) { 96 return CharacterRange(value, value); 97 } Range(base::uc32 from,base::uc32 to)98 static inline CharacterRange Range(base::uc32 from, base::uc32 to) { 99 DCHECK(0 <= from && to <= kMaxCodePoint); 100 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); 101 return CharacterRange(from, to); 102 } Everything()103 static inline CharacterRange Everything() { 104 return CharacterRange(0, kMaxCodePoint); 105 } 106 List(Zone * zone,CharacterRange range)107 static inline ZoneList<CharacterRange>* List(Zone* zone, 108 CharacterRange range) { 109 ZoneList<CharacterRange>* list = 110 zone->New<ZoneList<CharacterRange>>(1, zone); 111 list->Add(range, zone); 112 return list; 113 } 114 115 // Add class escapes. Add case equivalent closure for \w and \W if necessary. 116 V8_EXPORT_PRIVATE static void AddClassEscape( 117 StandardCharacterSet standard_character_set, 118 ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents, 119 Zone* zone); 120 V8_EXPORT_PRIVATE static void AddCaseEquivalents( 121 Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, 122 bool is_one_byte); 123 Contains(base::uc32 i)124 bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } from()125 base::uc32 from() const { return from_; } to()126 base::uc32 to() const { return to_; } IsEverything(base::uc32 max)127 bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; } IsSingleton()128 bool IsSingleton() const { return from_ == to_; } 129 // Whether a range list is in canonical form: Ranges ordered by from value, 130 // and ranges non-overlapping and non-adjacent. 131 V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges); 132 // Convert range list to canonical form. The characters covered by the ranges 133 // will still be the same, but no character is in more than one range, and 134 // adjacent ranges are merged. The resulting list may be shorter than the 135 // original, but cannot be longer. 136 static void Canonicalize(ZoneList<CharacterRange>* ranges); 137 // Negate the contents of a character range in canonical form. 138 static void Negate(ZoneList<CharacterRange>* src, 139 ZoneList<CharacterRange>* dst, Zone* zone); 140 141 // Remove all ranges outside the one-byte range. 142 static void ClampToOneByte(ZoneList<CharacterRange>* ranges); 143 144 private: CharacterRange(base::uc32 from,base::uc32 to)145 CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {} 146 147 static constexpr int kMaxCodePoint = 0x10ffff; 148 149 base::uc32 from_ = 0; 150 base::uc32 to_ = 0; 151 }; 152 153 #define DECL_BOILERPLATE(Name) \ 154 void* Accept(RegExpVisitor* visitor, void* data) override; \ 155 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \ 156 override; \ 157 RegExp##Name* As##Name() override; \ 158 bool Is##Name() override 159 160 class RegExpTree : public ZoneObject { 161 public: 162 static const int kInfinity = kMaxInt; 163 virtual ~RegExpTree() = default; 164 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; 165 virtual RegExpNode* ToNode(RegExpCompiler* compiler, 166 RegExpNode* on_success) = 0; IsTextElement()167 virtual bool IsTextElement() { return false; } IsAnchoredAtStart()168 virtual bool IsAnchoredAtStart() { return false; } IsAnchoredAtEnd()169 virtual bool IsAnchoredAtEnd() { return false; } 170 virtual int min_match() = 0; 171 virtual int max_match() = 0; 172 // Returns the interval of registers used for captures within this 173 // expression. CaptureRegisters()174 virtual Interval CaptureRegisters() { return Interval::Empty(); } 175 virtual void AppendToText(RegExpText* text, Zone* zone); 176 V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); 177 #define MAKE_ASTYPE(Name) \ 178 virtual RegExp##Name* As##Name(); \ 179 virtual bool Is##Name(); 180 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) 181 #undef MAKE_ASTYPE 182 }; 183 184 185 class RegExpDisjunction final : public RegExpTree { 186 public: 187 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); 188 189 DECL_BOILERPLATE(Disjunction); 190 191 Interval CaptureRegisters() override; 192 bool IsAnchoredAtStart() override; 193 bool IsAnchoredAtEnd() override; min_match()194 int min_match() override { return min_match_; } max_match()195 int max_match() override { return max_match_; } alternatives()196 ZoneList<RegExpTree*>* alternatives() const { return alternatives_; } 197 198 private: 199 bool SortConsecutiveAtoms(RegExpCompiler* compiler); 200 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); 201 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); 202 ZoneList<RegExpTree*>* alternatives_; 203 int min_match_; 204 int max_match_; 205 }; 206 207 208 class RegExpAlternative final : public RegExpTree { 209 public: 210 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); 211 212 DECL_BOILERPLATE(Alternative); 213 214 Interval CaptureRegisters() override; 215 bool IsAnchoredAtStart() override; 216 bool IsAnchoredAtEnd() override; min_match()217 int min_match() override { return min_match_; } max_match()218 int max_match() override { return max_match_; } nodes()219 ZoneList<RegExpTree*>* nodes() const { return nodes_; } 220 221 private: 222 ZoneList<RegExpTree*>* nodes_; 223 int min_match_; 224 int max_match_; 225 }; 226 227 228 class RegExpAssertion final : public RegExpTree { 229 public: 230 enum class Type { 231 START_OF_LINE = 0, 232 START_OF_INPUT = 1, 233 END_OF_LINE = 2, 234 END_OF_INPUT = 3, 235 BOUNDARY = 4, 236 NON_BOUNDARY = 5, 237 LAST_ASSERTION_TYPE = NON_BOUNDARY, 238 }; RegExpAssertion(Type type)239 explicit RegExpAssertion(Type type) : assertion_type_(type) {} 240 241 DECL_BOILERPLATE(Assertion); 242 243 bool IsAnchoredAtStart() override; 244 bool IsAnchoredAtEnd() override; min_match()245 int min_match() override { return 0; } max_match()246 int max_match() override { return 0; } assertion_type()247 Type assertion_type() const { return assertion_type_; } 248 249 private: 250 const Type assertion_type_; 251 }; 252 253 class CharacterSet final { 254 public: CharacterSet(StandardCharacterSet standard_set_type)255 explicit CharacterSet(StandardCharacterSet standard_set_type) 256 : standard_set_type_(standard_set_type) {} CharacterSet(ZoneList<CharacterRange> * ranges)257 explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {} 258 259 ZoneList<CharacterRange>* ranges(Zone* zone); standard_set_type()260 StandardCharacterSet standard_set_type() const { 261 return standard_set_type_.value(); 262 } set_standard_set_type(StandardCharacterSet standard_set_type)263 void set_standard_set_type(StandardCharacterSet standard_set_type) { 264 standard_set_type_ = standard_set_type; 265 } is_standard()266 bool is_standard() const { return standard_set_type_.has_value(); } 267 V8_EXPORT_PRIVATE void Canonicalize(); 268 269 private: 270 ZoneList<CharacterRange>* ranges_ = nullptr; 271 base::Optional<StandardCharacterSet> standard_set_type_; 272 }; 273 274 class RegExpCharacterClass final : public RegExpTree { 275 public: 276 // NEGATED: The character class is negated and should match everything but 277 // the specified ranges. 278 // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split 279 // surrogate and should not be unicode-desugared (crbug.com/641091). 280 enum Flag { 281 NEGATED = 1 << 0, 282 CONTAINS_SPLIT_SURROGATE = 1 << 1, 283 }; 284 using CharacterClassFlags = base::Flags<Flag>; 285 286 RegExpCharacterClass( 287 Zone* zone, ZoneList<CharacterRange>* ranges, 288 CharacterClassFlags character_class_flags = CharacterClassFlags()) set_(ranges)289 : set_(ranges), character_class_flags_(character_class_flags) { 290 // Convert the empty set of ranges to the negated Everything() range. 291 if (ranges->is_empty()) { 292 ranges->Add(CharacterRange::Everything(), zone); 293 character_class_flags_ ^= NEGATED; 294 } 295 } RegExpCharacterClass(StandardCharacterSet standard_set_type)296 explicit RegExpCharacterClass(StandardCharacterSet standard_set_type) 297 : set_(standard_set_type), character_class_flags_() {} 298 299 DECL_BOILERPLATE(CharacterClass); 300 IsTextElement()301 bool IsTextElement() override { return true; } min_match()302 int min_match() override { return 1; } 303 // The character class may match two code units for unicode regexps. 304 // TODO(yangguo): we should split this class for usage in TextElement, and 305 // make max_match() dependent on the character class content. max_match()306 int max_match() override { return 2; } 307 308 void AppendToText(RegExpText* text, Zone* zone) override; 309 310 // TODO(lrn): Remove need for complex version if is_standard that 311 // recognizes a mangled standard set and just do { return set_.is_special(); } 312 bool is_standard(Zone* zone); 313 // Returns a value representing the standard character set if is_standard() 314 // returns true. standard_type()315 StandardCharacterSet standard_type() const { 316 return set_.standard_set_type(); 317 } 318 character_set()319 CharacterSet character_set() const { return set_; } ranges(Zone * zone)320 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } 321 is_negated()322 bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } contains_split_surrogate()323 bool contains_split_surrogate() const { 324 return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; 325 } 326 327 private: 328 CharacterSet set_; 329 CharacterClassFlags character_class_flags_; 330 }; 331 332 class RegExpAtom final : public RegExpTree { 333 public: RegExpAtom(base::Vector<const base::uc16> data)334 explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {} 335 336 DECL_BOILERPLATE(Atom); 337 IsTextElement()338 bool IsTextElement() override { return true; } min_match()339 int min_match() override { return data_.length(); } max_match()340 int max_match() override { return data_.length(); } 341 void AppendToText(RegExpText* text, Zone* zone) override; 342 data()343 base::Vector<const base::uc16> data() const { return data_; } length()344 int length() const { return data_.length(); } 345 346 private: 347 base::Vector<const base::uc16> data_; 348 }; 349 350 class TextElement final { 351 public: 352 enum TextType { ATOM, CHAR_CLASS }; 353 354 static TextElement Atom(RegExpAtom* atom); 355 static TextElement CharClass(RegExpCharacterClass* char_class); 356 cp_offset()357 int cp_offset() const { return cp_offset_; } set_cp_offset(int cp_offset)358 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 359 int length() const; 360 text_type()361 TextType text_type() const { return text_type_; } 362 tree()363 RegExpTree* tree() const { return tree_; } 364 atom()365 RegExpAtom* atom() const { 366 DCHECK(text_type() == ATOM); 367 return reinterpret_cast<RegExpAtom*>(tree()); 368 } 369 char_class()370 RegExpCharacterClass* char_class() const { 371 DCHECK(text_type() == CHAR_CLASS); 372 return reinterpret_cast<RegExpCharacterClass*>(tree()); 373 } 374 375 private: TextElement(TextType text_type,RegExpTree * tree)376 TextElement(TextType text_type, RegExpTree* tree) 377 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} 378 379 int cp_offset_; 380 TextType text_type_; 381 RegExpTree* tree_; 382 }; 383 384 class RegExpText final : public RegExpTree { 385 public: RegExpText(Zone * zone)386 explicit RegExpText(Zone* zone) : elements_(2, zone) {} 387 388 DECL_BOILERPLATE(Text); 389 IsTextElement()390 bool IsTextElement() override { return true; } min_match()391 int min_match() override { return length_; } max_match()392 int max_match() override { return length_; } 393 void AppendToText(RegExpText* text, Zone* zone) override; AddElement(TextElement elm,Zone * zone)394 void AddElement(TextElement elm, Zone* zone) { 395 elements_.Add(elm, zone); 396 length_ += elm.length(); 397 } elements()398 ZoneList<TextElement>* elements() { return &elements_; } 399 400 private: 401 ZoneList<TextElement> elements_; 402 int length_ = 0; 403 }; 404 405 406 class RegExpQuantifier final : public RegExpTree { 407 public: 408 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; RegExpQuantifier(int min,int max,QuantifierType type,RegExpTree * body)409 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) 410 : body_(body), 411 min_(min), 412 max_(max), 413 quantifier_type_(type) { 414 if (min > 0 && body->min_match() > kInfinity / min) { 415 min_match_ = kInfinity; 416 } else { 417 min_match_ = min * body->min_match(); 418 } 419 if (max > 0 && body->max_match() > kInfinity / max) { 420 max_match_ = kInfinity; 421 } else { 422 max_match_ = max * body->max_match(); 423 } 424 } 425 426 DECL_BOILERPLATE(Quantifier); 427 428 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, 429 RegExpCompiler* compiler, RegExpNode* on_success, 430 bool not_at_start = false); 431 Interval CaptureRegisters() override; min_match()432 int min_match() override { return min_match_; } max_match()433 int max_match() override { return max_match_; } min()434 int min() const { return min_; } max()435 int max() const { return max_; } quantifier_type()436 QuantifierType quantifier_type() const { return quantifier_type_; } is_possessive()437 bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } is_non_greedy()438 bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } is_greedy()439 bool is_greedy() const { return quantifier_type_ == GREEDY; } body()440 RegExpTree* body() const { return body_; } 441 442 private: 443 RegExpTree* body_; 444 int min_; 445 int max_; 446 int min_match_; 447 int max_match_; 448 QuantifierType quantifier_type_; 449 }; 450 451 452 class RegExpCapture final : public RegExpTree { 453 public: RegExpCapture(int index)454 explicit RegExpCapture(int index) 455 : body_(nullptr), 456 index_(index), 457 min_match_(0), 458 max_match_(0), 459 name_(nullptr) {} 460 461 DECL_BOILERPLATE(Capture); 462 463 static RegExpNode* ToNode(RegExpTree* body, int index, 464 RegExpCompiler* compiler, RegExpNode* on_success); 465 bool IsAnchoredAtStart() override; 466 bool IsAnchoredAtEnd() override; 467 Interval CaptureRegisters() override; min_match()468 int min_match() override { return min_match_; } max_match()469 int max_match() override { return max_match_; } body()470 RegExpTree* body() { return body_; } set_body(RegExpTree * body)471 void set_body(RegExpTree* body) { 472 body_ = body; 473 min_match_ = body->min_match(); 474 max_match_ = body->max_match(); 475 } index()476 int index() const { return index_; } name()477 const ZoneVector<base::uc16>* name() const { return name_; } set_name(const ZoneVector<base::uc16> * name)478 void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } StartRegister(int index)479 static int StartRegister(int index) { return index * 2; } EndRegister(int index)480 static int EndRegister(int index) { return index * 2 + 1; } 481 482 private: 483 RegExpTree* body_ = nullptr; 484 int index_; 485 int min_match_ = 0; 486 int max_match_ = 0; 487 const ZoneVector<base::uc16>* name_ = nullptr; 488 }; 489 490 class RegExpGroup final : public RegExpTree { 491 public: RegExpGroup(RegExpTree * body)492 explicit RegExpGroup(RegExpTree* body) 493 : body_(body), 494 min_match_(body->min_match()), 495 max_match_(body->max_match()) {} 496 497 DECL_BOILERPLATE(Group); 498 IsAnchoredAtStart()499 bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } IsAnchoredAtEnd()500 bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } min_match()501 int min_match() override { return min_match_; } max_match()502 int max_match() override { return max_match_; } CaptureRegisters()503 Interval CaptureRegisters() override { return body_->CaptureRegisters(); } body()504 RegExpTree* body() const { return body_; } 505 506 private: 507 RegExpTree* body_; 508 int min_match_; 509 int max_match_; 510 }; 511 512 class RegExpLookaround final : public RegExpTree { 513 public: 514 enum Type { LOOKAHEAD, LOOKBEHIND }; 515 RegExpLookaround(RegExpTree * body,bool is_positive,int capture_count,int capture_from,Type type)516 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, 517 int capture_from, Type type) 518 : body_(body), 519 is_positive_(is_positive), 520 capture_count_(capture_count), 521 capture_from_(capture_from), 522 type_(type) {} 523 524 DECL_BOILERPLATE(Lookaround); 525 526 Interval CaptureRegisters() override; 527 bool IsAnchoredAtStart() override; min_match()528 int min_match() override { return 0; } max_match()529 int max_match() override { return 0; } body()530 RegExpTree* body() const { return body_; } is_positive()531 bool is_positive() const { return is_positive_; } capture_count()532 int capture_count() const { return capture_count_; } capture_from()533 int capture_from() const { return capture_from_; } type()534 Type type() const { return type_; } 535 536 class Builder { 537 public: 538 Builder(bool is_positive, RegExpNode* on_success, 539 int stack_pointer_register, int position_register, 540 int capture_register_count = 0, int capture_register_start = 0); on_match_success()541 RegExpNode* on_match_success() const { return on_match_success_; } 542 RegExpNode* ForMatch(RegExpNode* match); 543 544 private: 545 bool is_positive_; 546 RegExpNode* on_match_success_; 547 RegExpNode* on_success_; 548 int stack_pointer_register_; 549 int position_register_; 550 }; 551 552 private: 553 RegExpTree* body_; 554 bool is_positive_; 555 int capture_count_; 556 int capture_from_; 557 Type type_; 558 }; 559 560 561 class RegExpBackReference final : public RegExpTree { 562 public: RegExpBackReference(RegExpFlags flags)563 explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {} RegExpBackReference(RegExpCapture * capture,RegExpFlags flags)564 RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) 565 : capture_(capture), flags_(flags) {} 566 567 DECL_BOILERPLATE(BackReference); 568 min_match()569 int min_match() override { return 0; } 570 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite 571 // recursion, we give up. Ignorance is bliss. max_match()572 int max_match() override { return kInfinity; } index()573 int index() const { return capture_->index(); } capture()574 RegExpCapture* capture() const { return capture_; } set_capture(RegExpCapture * capture)575 void set_capture(RegExpCapture* capture) { capture_ = capture; } name()576 const ZoneVector<base::uc16>* name() const { return name_; } set_name(const ZoneVector<base::uc16> * name)577 void set_name(const ZoneVector<base::uc16>* name) { name_ = name; } 578 579 private: 580 RegExpCapture* capture_ = nullptr; 581 const ZoneVector<base::uc16>* name_ = nullptr; 582 const RegExpFlags flags_; 583 }; 584 585 586 class RegExpEmpty final : public RegExpTree { 587 public: 588 DECL_BOILERPLATE(Empty); min_match()589 int min_match() override { return 0; } max_match()590 int max_match() override { return 0; } 591 }; 592 593 } // namespace internal 594 } // namespace v8 595 596 #undef DECL_BOILERPLATE 597 598 #endif // V8_REGEXP_REGEXP_AST_H_ 599