1 // Copyright 2019 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_NODES_H_ 6 #define V8_REGEXP_REGEXP_NODES_H_ 7 8 #include "src/codegen/label.h" 9 #include "src/regexp/regexp-macro-assembler.h" 10 #include "src/zone/zone.h" 11 12 namespace v8 { 13 namespace internal { 14 15 class AlternativeGenerationList; 16 class BoyerMooreLookahead; 17 class GreedyLoopState; 18 class NodeVisitor; 19 class QuickCheckDetails; 20 class RegExpCompiler; 21 class Trace; 22 struct PreloadState; 23 class ChoiceNode; 24 25 #define FOR_EACH_NODE_TYPE(VISIT) \ 26 VISIT(End) \ 27 VISIT(Action) \ 28 VISIT(Choice) \ 29 VISIT(LoopChoice) \ 30 VISIT(NegativeLookaroundChoice) \ 31 VISIT(BackReference) \ 32 VISIT(Assertion) \ 33 VISIT(Text) 34 35 struct NodeInfo final { NodeInfofinal36 NodeInfo() 37 : being_analyzed(false), 38 been_analyzed(false), 39 follows_word_interest(false), 40 follows_newline_interest(false), 41 follows_start_interest(false), 42 at_end(false), 43 visited(false), 44 replacement_calculated(false) {} 45 46 // Returns true if the interests and assumptions of this node 47 // matches the given one. Matchesfinal48 bool Matches(NodeInfo* that) { 49 return (at_end == that->at_end) && 50 (follows_word_interest == that->follows_word_interest) && 51 (follows_newline_interest == that->follows_newline_interest) && 52 (follows_start_interest == that->follows_start_interest); 53 } 54 55 // Updates the interests of this node given the interests of the 56 // node preceding it. AddFromPrecedingfinal57 void AddFromPreceding(NodeInfo* that) { 58 at_end |= that->at_end; 59 follows_word_interest |= that->follows_word_interest; 60 follows_newline_interest |= that->follows_newline_interest; 61 follows_start_interest |= that->follows_start_interest; 62 } 63 HasLookbehindfinal64 bool HasLookbehind() { 65 return follows_word_interest || follows_newline_interest || 66 follows_start_interest; 67 } 68 69 // Sets the interests of this node to include the interests of the 70 // following node. AddFromFollowingfinal71 void AddFromFollowing(NodeInfo* that) { 72 follows_word_interest |= that->follows_word_interest; 73 follows_newline_interest |= that->follows_newline_interest; 74 follows_start_interest |= that->follows_start_interest; 75 } 76 ResetCompilationStatefinal77 void ResetCompilationState() { 78 being_analyzed = false; 79 been_analyzed = false; 80 } 81 82 bool being_analyzed : 1; 83 bool been_analyzed : 1; 84 85 // These bits are set of this node has to know what the preceding 86 // character was. 87 bool follows_word_interest : 1; 88 bool follows_newline_interest : 1; 89 bool follows_start_interest : 1; 90 91 bool at_end : 1; 92 bool visited : 1; 93 bool replacement_calculated : 1; 94 }; 95 96 struct EatsAtLeastInfo final { EatsAtLeastInfofinal97 EatsAtLeastInfo() : EatsAtLeastInfo(0) {} EatsAtLeastInfofinal98 explicit EatsAtLeastInfo(uint8_t eats) 99 : eats_at_least_from_possibly_start(eats), 100 eats_at_least_from_not_start(eats) {} SetMinfinal101 void SetMin(const EatsAtLeastInfo& other) { 102 if (other.eats_at_least_from_possibly_start < 103 eats_at_least_from_possibly_start) { 104 eats_at_least_from_possibly_start = 105 other.eats_at_least_from_possibly_start; 106 } 107 if (other.eats_at_least_from_not_start < eats_at_least_from_not_start) { 108 eats_at_least_from_not_start = other.eats_at_least_from_not_start; 109 } 110 } 111 IsZerofinal112 bool IsZero() const { 113 return eats_at_least_from_possibly_start == 0 && 114 eats_at_least_from_not_start == 0; 115 } 116 117 // Any successful match starting from the current node will consume at least 118 // this many characters. This does not necessarily mean that there is a 119 // possible match with exactly this many characters, but we generally try to 120 // get this number as high as possible to allow for early exit on failure. 121 uint8_t eats_at_least_from_possibly_start; 122 123 // Like eats_at_least_from_possibly_start, but with the additional assumption 124 // that start-of-string assertions (^) can't match. This value is greater than 125 // or equal to eats_at_least_from_possibly_start. 126 uint8_t eats_at_least_from_not_start; 127 }; 128 129 class RegExpNode : public ZoneObject { 130 public: RegExpNode(Zone * zone)131 explicit RegExpNode(Zone* zone) 132 : replacement_(nullptr), 133 on_work_list_(false), 134 trace_count_(0), 135 zone_(zone) { 136 bm_info_[0] = bm_info_[1] = nullptr; 137 } 138 virtual ~RegExpNode(); 139 virtual void Accept(NodeVisitor* visitor) = 0; 140 // Generates a goto to this node or actually generates the code at this point. 141 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; 142 // How many characters must this node consume at a minimum in order to 143 // succeed. The not_at_start argument is used to indicate that we know we are 144 // not at the start of the input. In this case anchored branches will always 145 // fail and can be ignored when determining how many characters are consumed 146 // on success. If this node has not been analyzed yet, EatsAtLeast returns 0. 147 int EatsAtLeast(bool not_at_start); 148 // Returns how many characters this node must consume in order to succeed, 149 // given that this is a LoopChoiceNode whose counter register is in a 150 // newly-initialized state at the current position in the generated code. For 151 // example, consider /a{6,8}/. Absent any extra information, the 152 // LoopChoiceNode for the repetition must report that it consumes at least 153 // zero characters, because it may have already looped several times. However, 154 // with a newly-initialized counter, it can report that it consumes at least 155 // six characters. 156 virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry(); 157 // Emits some quick code that checks whether the preloaded characters match. 158 // Falls through on certain failure, jumps to the label on possible success. 159 // If the node cannot make a quick check it does nothing and returns false. 160 bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace, 161 Trace* trace, bool preload_has_checked_bounds, 162 Label* on_possible_success, 163 QuickCheckDetails* details_return, 164 bool fall_through_on_failure, ChoiceNode* predecessor); 165 // For a given number of characters this returns a mask and a value. The 166 // next n characters are anded with the mask and compared with the value. 167 // A comparison failure indicates the node cannot match the next n characters. 168 // A comparison success indicates the node may match. 169 virtual void GetQuickCheckDetails(QuickCheckDetails* details, 170 RegExpCompiler* compiler, 171 int characters_filled_in, 172 bool not_at_start) = 0; 173 // Fills in quick check details for this node, given that this is a 174 // LoopChoiceNode whose counter register is in a newly-initialized state at 175 // the current position in the generated code. For example, consider /a{6,8}/. 176 // Absent any extra information, the LoopChoiceNode for the repetition cannot 177 // generate any useful quick check because a match might be the (empty) 178 // continuation node. However, with a newly-initialized counter, it can 179 // generate a quick check for several 'a' characters at once. 180 virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 181 RegExpCompiler* compiler, 182 int characters_filled_in, 183 bool not_at_start); 184 static const int kNodeIsTooComplexForGreedyLoops = kMinInt; GreedyLoopTextLength()185 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } 186 // Only returns the successor for a text node of length 1 that matches any 187 // character and that has no guards on it. GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)188 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( 189 RegExpCompiler* compiler) { 190 return nullptr; 191 } 192 193 // Collects information on the possible code units (mod 128) that can match if 194 // we look forward. This is used for a Boyer-Moore-like string searching 195 // implementation. TODO(erikcorry): This should share more code with 196 // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit 197 // the number of nodes we are willing to look at in order to create this data. 198 static const int kRecursionBudget = 200; 199 bool KeepRecursing(RegExpCompiler* compiler); FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)200 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget, 201 BoyerMooreLookahead* bm, bool not_at_start) { 202 UNREACHABLE(); 203 } 204 205 // If we know that the input is one-byte then there are some nodes that can 206 // never match. This method returns a node that can be substituted for 207 // itself, or nullptr if the node can never match. FilterOneByte(int depth,RegExpFlags flags)208 virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) { 209 return this; 210 } 211 // Helper for FilterOneByte. replacement()212 RegExpNode* replacement() { 213 DCHECK(info()->replacement_calculated); 214 return replacement_; 215 } set_replacement(RegExpNode * replacement)216 RegExpNode* set_replacement(RegExpNode* replacement) { 217 info()->replacement_calculated = true; 218 replacement_ = replacement; 219 return replacement; // For convenience. 220 } 221 222 // We want to avoid recalculating the lookahead info, so we store it on the 223 // node. Only info that is for this node is stored. We can tell that the 224 // info is for this node when offset == 0, so the information is calculated 225 // relative to this node. SaveBMInfo(BoyerMooreLookahead * bm,bool not_at_start,int offset)226 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { 227 if (offset == 0) set_bm_info(not_at_start, bm); 228 } 229 label()230 Label* label() { return &label_; } 231 // If non-generic code is generated for a node (i.e. the node is not at the 232 // start of the trace) then it cannot be reused. This variable sets a limit 233 // on how often we allow that to happen before we insist on starting a new 234 // trace and generating generic code for a node that can be reused by flushing 235 // the deferred actions in the current trace and generating a goto. 236 static const int kMaxCopiesCodeGenerated = 10; 237 on_work_list()238 bool on_work_list() { return on_work_list_; } set_on_work_list(bool value)239 void set_on_work_list(bool value) { on_work_list_ = value; } 240 info()241 NodeInfo* info() { return &info_; } eats_at_least_info()242 const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; } set_eats_at_least_info(const EatsAtLeastInfo & eats_at_least)243 void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) { 244 eats_at_least_ = eats_at_least; 245 } 246 247 // TODO(v8:10441): This is a hacky way to avoid exponential code size growth 248 // for very large choice nodes that can be generated by unicode property 249 // escapes. In order to avoid inlining (i.e. trace recursion), we pretend to 250 // have generated the maximum count of code copies already. 251 // We should instead fix this properly, e.g. by using the code size budget 252 // (flush_budget) or by generating property escape matches as calls to a C 253 // function. SetDoNotInline()254 void SetDoNotInline() { trace_count_ = kMaxCopiesCodeGenerated; } 255 bm_info(bool not_at_start)256 BoyerMooreLookahead* bm_info(bool not_at_start) { 257 return bm_info_[not_at_start ? 1 : 0]; 258 } 259 zone()260 Zone* zone() const { return zone_; } 261 262 protected: 263 enum LimitResult { DONE, CONTINUE }; 264 RegExpNode* replacement_; 265 266 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); 267 set_bm_info(bool not_at_start,BoyerMooreLookahead * bm)268 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { 269 bm_info_[not_at_start ? 1 : 0] = bm; 270 } 271 272 private: 273 static const int kFirstCharBudget = 10; 274 Label label_; 275 bool on_work_list_; 276 NodeInfo info_; 277 278 // Saved values for EatsAtLeast results, to avoid recomputation. Filled in 279 // during analysis (valid if info_.been_analyzed is true). 280 EatsAtLeastInfo eats_at_least_; 281 282 // This variable keeps track of how many times code has been generated for 283 // this node (in different traces). We don't keep track of where the 284 // generated code is located unless the code is generated at the start of 285 // a trace, in which case it is generic and can be reused by flushing the 286 // deferred operations in the current trace and generating a goto. 287 int trace_count_; 288 BoyerMooreLookahead* bm_info_[2]; 289 290 Zone* zone_; 291 }; 292 293 class SeqRegExpNode : public RegExpNode { 294 public: SeqRegExpNode(RegExpNode * on_success)295 explicit SeqRegExpNode(RegExpNode* on_success) 296 : RegExpNode(on_success->zone()), on_success_(on_success) {} on_success()297 RegExpNode* on_success() { return on_success_; } set_on_success(RegExpNode * node)298 void set_on_success(RegExpNode* node) { on_success_ = node; } 299 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)300 void FillInBMInfo(Isolate* isolate, int offset, int budget, 301 BoyerMooreLookahead* bm, bool not_at_start) override { 302 on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 303 if (offset == 0) set_bm_info(not_at_start, bm); 304 } 305 306 protected: 307 RegExpNode* FilterSuccessor(int depth, RegExpFlags flags); 308 309 private: 310 RegExpNode* on_success_; 311 }; 312 313 class ActionNode : public SeqRegExpNode { 314 public: 315 enum ActionType { 316 SET_REGISTER_FOR_LOOP, 317 INCREMENT_REGISTER, 318 STORE_POSITION, 319 BEGIN_POSITIVE_SUBMATCH, 320 BEGIN_NEGATIVE_SUBMATCH, 321 POSITIVE_SUBMATCH_SUCCESS, 322 EMPTY_MATCH_CHECK, 323 CLEAR_CAPTURES 324 }; 325 static ActionNode* SetRegisterForLoop(int reg, int val, 326 RegExpNode* on_success); 327 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); 328 static ActionNode* StorePosition(int reg, bool is_capture, 329 RegExpNode* on_success); 330 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); 331 static ActionNode* BeginPositiveSubmatch(int stack_pointer_reg, 332 int position_reg, 333 RegExpNode* on_success); 334 static ActionNode* BeginNegativeSubmatch(int stack_pointer_reg, 335 int position_reg, 336 RegExpNode* on_success); 337 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, 338 int restore_reg, 339 int clear_capture_count, 340 int clear_capture_from, 341 RegExpNode* on_success); 342 static ActionNode* EmptyMatchCheck(int start_register, 343 int repetition_register, 344 int repetition_limit, 345 RegExpNode* on_success); 346 void Accept(NodeVisitor* visitor) override; 347 void Emit(RegExpCompiler* compiler, Trace* trace) override; 348 void GetQuickCheckDetails(QuickCheckDetails* details, 349 RegExpCompiler* compiler, int filled_in, 350 bool not_at_start) override; 351 void FillInBMInfo(Isolate* isolate, int offset, int budget, 352 BoyerMooreLookahead* bm, bool not_at_start) override; action_type()353 ActionType action_type() { return action_type_; } 354 // TODO(erikcorry): We should allow some action nodes in greedy loops. GreedyLoopTextLength()355 int GreedyLoopTextLength() override { 356 return kNodeIsTooComplexForGreedyLoops; 357 } 358 359 private: 360 union { 361 struct { 362 int reg; 363 int value; 364 } u_store_register; 365 struct { 366 int reg; 367 } u_increment_register; 368 struct { 369 int reg; 370 bool is_capture; 371 } u_position_register; 372 struct { 373 int stack_pointer_register; 374 int current_position_register; 375 int clear_register_count; 376 int clear_register_from; 377 } u_submatch; 378 struct { 379 int start_register; 380 int repetition_register; 381 int repetition_limit; 382 } u_empty_match_check; 383 struct { 384 int range_from; 385 int range_to; 386 } u_clear_captures; 387 } data_; ActionNode(ActionType action_type,RegExpNode * on_success)388 ActionNode(ActionType action_type, RegExpNode* on_success) 389 : SeqRegExpNode(on_success), action_type_(action_type) {} 390 ActionType action_type_; 391 friend class DotPrinterImpl; 392 friend Zone; 393 }; 394 395 class TextNode : public SeqRegExpNode { 396 public: TextNode(ZoneList<TextElement> * elms,bool read_backward,RegExpNode * on_success)397 TextNode(ZoneList<TextElement>* elms, bool read_backward, 398 RegExpNode* on_success) 399 : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {} TextNode(RegExpCharacterClass * that,bool read_backward,RegExpNode * on_success)400 TextNode(RegExpCharacterClass* that, bool read_backward, 401 RegExpNode* on_success) 402 : SeqRegExpNode(on_success), 403 elms_(zone()->New<ZoneList<TextElement>>(1, zone())), 404 read_backward_(read_backward) { 405 elms_->Add(TextElement::CharClass(that), zone()); 406 } 407 // Create TextNode for a single character class for the given ranges. 408 static TextNode* CreateForCharacterRanges(Zone* zone, 409 ZoneList<CharacterRange>* ranges, 410 bool read_backward, 411 RegExpNode* on_success); 412 // Create TextNode for a surrogate pair (i.e. match a sequence of two uc16 413 // code unit ranges). 414 static TextNode* CreateForSurrogatePair( 415 Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges, 416 bool read_backward, RegExpNode* on_success); 417 static TextNode* CreateForSurrogatePair(Zone* zone, 418 ZoneList<CharacterRange>* lead_ranges, 419 CharacterRange trail, 420 bool read_backward, 421 RegExpNode* on_success); 422 void Accept(NodeVisitor* visitor) override; 423 void Emit(RegExpCompiler* compiler, Trace* trace) override; 424 void GetQuickCheckDetails(QuickCheckDetails* details, 425 RegExpCompiler* compiler, int characters_filled_in, 426 bool not_at_start) override; elements()427 ZoneList<TextElement>* elements() { return elms_; } read_backward()428 bool read_backward() { return read_backward_; } 429 void MakeCaseIndependent(Isolate* isolate, bool is_one_byte, 430 RegExpFlags flags); 431 int GreedyLoopTextLength() override; 432 RegExpNode* GetSuccessorOfOmnivorousTextNode( 433 RegExpCompiler* compiler) override; 434 void FillInBMInfo(Isolate* isolate, int offset, int budget, 435 BoyerMooreLookahead* bm, bool not_at_start) override; 436 void CalculateOffsets(); 437 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; 438 int Length(); 439 440 private: 441 enum TextEmitPassType { 442 NON_LATIN1_MATCH, // Check for characters that can't match. 443 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. 444 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. 445 CASE_CHARACTER_MATCH, // Case-independent single character check. 446 CHARACTER_CLASS_MATCH // Character class. 447 }; 448 static bool SkipPass(TextEmitPassType pass, bool ignore_case); 449 static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; 450 static const int kLastPass = CHARACTER_CLASS_MATCH; 451 void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, 452 bool preloaded, Trace* trace, bool first_element_checked, 453 int* checked_up_to); 454 ZoneList<TextElement>* elms_; 455 bool read_backward_; 456 }; 457 458 class AssertionNode : public SeqRegExpNode { 459 public: 460 enum AssertionType { 461 AT_END, 462 AT_START, 463 AT_BOUNDARY, 464 AT_NON_BOUNDARY, 465 AFTER_NEWLINE 466 }; AtEnd(RegExpNode * on_success)467 static AssertionNode* AtEnd(RegExpNode* on_success) { 468 return on_success->zone()->New<AssertionNode>(AT_END, on_success); 469 } AtStart(RegExpNode * on_success)470 static AssertionNode* AtStart(RegExpNode* on_success) { 471 return on_success->zone()->New<AssertionNode>(AT_START, on_success); 472 } AtBoundary(RegExpNode * on_success)473 static AssertionNode* AtBoundary(RegExpNode* on_success) { 474 return on_success->zone()->New<AssertionNode>(AT_BOUNDARY, on_success); 475 } AtNonBoundary(RegExpNode * on_success)476 static AssertionNode* AtNonBoundary(RegExpNode* on_success) { 477 return on_success->zone()->New<AssertionNode>(AT_NON_BOUNDARY, on_success); 478 } AfterNewline(RegExpNode * on_success)479 static AssertionNode* AfterNewline(RegExpNode* on_success) { 480 return on_success->zone()->New<AssertionNode>(AFTER_NEWLINE, on_success); 481 } 482 void Accept(NodeVisitor* visitor) override; 483 void Emit(RegExpCompiler* compiler, Trace* trace) override; 484 void GetQuickCheckDetails(QuickCheckDetails* details, 485 RegExpCompiler* compiler, int filled_in, 486 bool not_at_start) override; 487 void FillInBMInfo(Isolate* isolate, int offset, int budget, 488 BoyerMooreLookahead* bm, bool not_at_start) override; assertion_type()489 AssertionType assertion_type() { return assertion_type_; } 490 491 private: 492 friend Zone; 493 494 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); 495 enum IfPrevious { kIsNonWord, kIsWord }; 496 void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace, 497 IfPrevious backtrack_if_previous); AssertionNode(AssertionType t,RegExpNode * on_success)498 AssertionNode(AssertionType t, RegExpNode* on_success) 499 : SeqRegExpNode(on_success), assertion_type_(t) {} 500 AssertionType assertion_type_; 501 }; 502 503 class BackReferenceNode : public SeqRegExpNode { 504 public: BackReferenceNode(int start_reg,int end_reg,RegExpFlags flags,bool read_backward,RegExpNode * on_success)505 BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags, 506 bool read_backward, RegExpNode* on_success) 507 : SeqRegExpNode(on_success), 508 start_reg_(start_reg), 509 end_reg_(end_reg), 510 flags_(flags), 511 read_backward_(read_backward) {} 512 void Accept(NodeVisitor* visitor) override; start_register()513 int start_register() { return start_reg_; } end_register()514 int end_register() { return end_reg_; } read_backward()515 bool read_backward() { return read_backward_; } 516 void Emit(RegExpCompiler* compiler, Trace* trace) override; GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)517 void GetQuickCheckDetails(QuickCheckDetails* details, 518 RegExpCompiler* compiler, int characters_filled_in, 519 bool not_at_start) override { 520 return; 521 } 522 void FillInBMInfo(Isolate* isolate, int offset, int budget, 523 BoyerMooreLookahead* bm, bool not_at_start) override; 524 525 private: 526 int start_reg_; 527 int end_reg_; 528 RegExpFlags flags_; 529 bool read_backward_; 530 }; 531 532 class EndNode : public RegExpNode { 533 public: 534 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; EndNode(Action action,Zone * zone)535 EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {} 536 void Accept(NodeVisitor* visitor) override; 537 void Emit(RegExpCompiler* compiler, Trace* trace) override; GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)538 void GetQuickCheckDetails(QuickCheckDetails* details, 539 RegExpCompiler* compiler, int characters_filled_in, 540 bool not_at_start) override { 541 // Returning 0 from EatsAtLeast should ensure we never get here. 542 UNREACHABLE(); 543 } FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)544 void FillInBMInfo(Isolate* isolate, int offset, int budget, 545 BoyerMooreLookahead* bm, bool not_at_start) override { 546 // Returning 0 from EatsAtLeast should ensure we never get here. 547 UNREACHABLE(); 548 } 549 550 private: 551 Action action_; 552 }; 553 554 class NegativeSubmatchSuccess : public EndNode { 555 public: NegativeSubmatchSuccess(int stack_pointer_reg,int position_reg,int clear_capture_count,int clear_capture_start,Zone * zone)556 NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, 557 int clear_capture_count, int clear_capture_start, 558 Zone* zone) 559 : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), 560 stack_pointer_register_(stack_pointer_reg), 561 current_position_register_(position_reg), 562 clear_capture_count_(clear_capture_count), 563 clear_capture_start_(clear_capture_start) {} 564 void Emit(RegExpCompiler* compiler, Trace* trace) override; 565 566 private: 567 int stack_pointer_register_; 568 int current_position_register_; 569 int clear_capture_count_; 570 int clear_capture_start_; 571 }; 572 573 class Guard : public ZoneObject { 574 public: 575 enum Relation { LT, GEQ }; Guard(int reg,Relation op,int value)576 Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {} reg()577 int reg() { return reg_; } op()578 Relation op() { return op_; } value()579 int value() { return value_; } 580 581 private: 582 int reg_; 583 Relation op_; 584 int value_; 585 }; 586 587 class GuardedAlternative { 588 public: GuardedAlternative(RegExpNode * node)589 explicit GuardedAlternative(RegExpNode* node) 590 : node_(node), guards_(nullptr) {} 591 void AddGuard(Guard* guard, Zone* zone); node()592 RegExpNode* node() { return node_; } set_node(RegExpNode * node)593 void set_node(RegExpNode* node) { node_ = node; } guards()594 ZoneList<Guard*>* guards() { return guards_; } 595 596 private: 597 RegExpNode* node_; 598 ZoneList<Guard*>* guards_; 599 }; 600 601 class AlternativeGeneration; 602 603 class ChoiceNode : public RegExpNode { 604 public: ChoiceNode(int expected_size,Zone * zone)605 explicit ChoiceNode(int expected_size, Zone* zone) 606 : RegExpNode(zone), 607 alternatives_( 608 zone->New<ZoneList<GuardedAlternative>>(expected_size, zone)), 609 not_at_start_(false), 610 being_calculated_(false) {} 611 void Accept(NodeVisitor* visitor) override; AddAlternative(GuardedAlternative node)612 void AddAlternative(GuardedAlternative node) { 613 alternatives()->Add(node, zone()); 614 } alternatives()615 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } 616 void Emit(RegExpCompiler* compiler, Trace* trace) override; 617 void GetQuickCheckDetails(QuickCheckDetails* details, 618 RegExpCompiler* compiler, int characters_filled_in, 619 bool not_at_start) override; 620 void FillInBMInfo(Isolate* isolate, int offset, int budget, 621 BoyerMooreLookahead* bm, bool not_at_start) override; 622 being_calculated()623 bool being_calculated() { return being_calculated_; } not_at_start()624 bool not_at_start() { return not_at_start_; } set_not_at_start()625 void set_not_at_start() { not_at_start_ = true; } set_being_calculated(bool b)626 void set_being_calculated(bool b) { being_calculated_ = b; } try_to_emit_quick_check_for_alternative(bool is_first)627 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { 628 return true; 629 } 630 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; read_backward()631 virtual bool read_backward() { return false; } 632 633 protected: 634 int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); 635 ZoneList<GuardedAlternative>* alternatives_; 636 637 private: 638 template <typename...> 639 friend class Analysis; 640 641 void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, 642 Trace* trace); 643 int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); 644 void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, 645 GuardedAlternative alternative, 646 AlternativeGeneration* alt_gen, 647 int preload_characters, 648 bool next_expects_preload); 649 void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace, 650 PreloadState* preloads); 651 void AssertGuardsMentionRegisters(Trace* trace); 652 int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace); 653 Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace, 654 AlternativeGenerationList* alt_gens, 655 PreloadState* preloads, 656 GreedyLoopState* greedy_loop_state, int text_length); 657 void EmitChoices(RegExpCompiler* compiler, 658 AlternativeGenerationList* alt_gens, int first_choice, 659 Trace* trace, PreloadState* preloads); 660 661 // If true, this node is never checked at the start of the input. 662 // Allows a new trace to start with at_start() set to false. 663 bool not_at_start_; 664 bool being_calculated_; 665 }; 666 667 class NegativeLookaroundChoiceNode : public ChoiceNode { 668 public: NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,GuardedAlternative then_do_this,Zone * zone)669 explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail, 670 GuardedAlternative then_do_this, 671 Zone* zone) 672 : ChoiceNode(2, zone) { 673 AddAlternative(this_must_fail); 674 AddAlternative(then_do_this); 675 } 676 void GetQuickCheckDetails(QuickCheckDetails* details, 677 RegExpCompiler* compiler, int characters_filled_in, 678 bool not_at_start) override; FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)679 void FillInBMInfo(Isolate* isolate, int offset, int budget, 680 BoyerMooreLookahead* bm, bool not_at_start) override { 681 continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm, 682 not_at_start); 683 if (offset == 0) set_bm_info(not_at_start, bm); 684 } 685 static constexpr int kLookaroundIndex = 0; 686 static constexpr int kContinueIndex = 1; lookaround_node()687 RegExpNode* lookaround_node() { 688 return alternatives()->at(kLookaroundIndex).node(); 689 } continue_node()690 RegExpNode* continue_node() { 691 return alternatives()->at(kContinueIndex).node(); 692 } 693 // For a negative lookahead we don't emit the quick check for the 694 // alternative that is expected to fail. This is because quick check code 695 // starts by loading enough characters for the alternative that takes fewest 696 // characters, but on a negative lookahead the negative branch did not take 697 // part in that calculation (EatsAtLeast) so the assumptions don't hold. try_to_emit_quick_check_for_alternative(bool is_first)698 bool try_to_emit_quick_check_for_alternative(bool is_first) override { 699 return !is_first; 700 } 701 void Accept(NodeVisitor* visitor) override; 702 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; 703 }; 704 705 class LoopChoiceNode : public ChoiceNode { 706 public: LoopChoiceNode(bool body_can_be_zero_length,bool read_backward,int min_loop_iterations,Zone * zone)707 LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, 708 int min_loop_iterations, Zone* zone) 709 : ChoiceNode(2, zone), 710 loop_node_(nullptr), 711 continue_node_(nullptr), 712 body_can_be_zero_length_(body_can_be_zero_length), 713 read_backward_(read_backward), 714 traversed_loop_initialization_node_(false), 715 min_loop_iterations_(min_loop_iterations) {} 716 void AddLoopAlternative(GuardedAlternative alt); 717 void AddContinueAlternative(GuardedAlternative alt); 718 void Emit(RegExpCompiler* compiler, Trace* trace) override; 719 void GetQuickCheckDetails(QuickCheckDetails* details, 720 RegExpCompiler* compiler, int characters_filled_in, 721 bool not_at_start) override; 722 void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details, 723 RegExpCompiler* compiler, 724 int characters_filled_in, 725 bool not_at_start) override; 726 void FillInBMInfo(Isolate* isolate, int offset, int budget, 727 BoyerMooreLookahead* bm, bool not_at_start) override; 728 EatsAtLeastInfo EatsAtLeastFromLoopEntry() override; loop_node()729 RegExpNode* loop_node() { return loop_node_; } continue_node()730 RegExpNode* continue_node() { return continue_node_; } body_can_be_zero_length()731 bool body_can_be_zero_length() { return body_can_be_zero_length_; } min_loop_iterations()732 int min_loop_iterations() const { return min_loop_iterations_; } read_backward()733 bool read_backward() override { return read_backward_; } 734 void Accept(NodeVisitor* visitor) override; 735 RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; 736 737 private: 738 // AddAlternative is made private for loop nodes because alternatives 739 // should not be added freely, we need to keep track of which node 740 // goes back to the node itself. AddAlternative(GuardedAlternative node)741 void AddAlternative(GuardedAlternative node) { 742 ChoiceNode::AddAlternative(node); 743 } 744 745 RegExpNode* loop_node_; 746 RegExpNode* continue_node_; 747 bool body_can_be_zero_length_; 748 bool read_backward_; 749 750 // Temporary marker set only while generating quick check details. Represents 751 // whether GetQuickCheckDetails traversed the initialization node for this 752 // loop's counter. If so, we may be able to generate stricter quick checks 753 // because we know the loop node must match at least min_loop_iterations_ 754 // times before the continuation node can match. 755 bool traversed_loop_initialization_node_; 756 757 // The minimum number of times the loop_node_ must match before the 758 // continue_node_ might be considered. This value can be temporarily decreased 759 // while generating quick check details, to represent the remaining iterations 760 // after the completed portion of the quick check details. 761 int min_loop_iterations_; 762 763 friend class IterationDecrementer; 764 friend class LoopInitializationMarker; 765 }; 766 767 class NodeVisitor { 768 public: 769 virtual ~NodeVisitor() = default; 770 #define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0; 771 FOR_EACH_NODE_TYPE(DECLARE_VISIT) 772 #undef DECLARE_VISIT 773 }; 774 775 } // namespace internal 776 } // namespace v8 777 778 #endif // V8_REGEXP_REGEXP_NODES_H_ 779