• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-compiler.h"
6 
7 #include "src/base/safe_conversions.h"
8 #include "src/execution/isolate.h"
9 #include "src/objects/fixed-array-inl.h"
10 #include "src/regexp/regexp-macro-assembler-arch.h"
11 #include "src/strings/unicode-inl.h"
12 #include "src/zone/zone-list-inl.h"
13 
14 #ifdef V8_INTL_SUPPORT
15 #include "src/regexp/special-case.h"
16 #include "unicode/locid.h"
17 #include "unicode/uniset.h"
18 #include "unicode/utypes.h"
19 #endif  // V8_INTL_SUPPORT
20 
21 namespace v8 {
22 namespace internal {
23 
24 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
25 
26 // -------------------------------------------------------------------
27 // Implementation of the Irregexp regular expression engine.
28 //
29 // The Irregexp regular expression engine is intended to be a complete
30 // implementation of ECMAScript regular expressions.  It generates either
31 // bytecodes or native code.
32 
33 //   The Irregexp regexp engine is structured in three steps.
34 //   1) The parser generates an abstract syntax tree.  See ast.cc.
35 //   2) From the AST a node network is created.  The nodes are all
36 //      subclasses of RegExpNode.  The nodes represent states when
37 //      executing a regular expression.  Several optimizations are
38 //      performed on the node network.
39 //   3) From the nodes we generate either byte codes or native code
40 //      that can actually execute the regular expression (perform
41 //      the search).  The code generation step is described in more
42 //      detail below.
43 
44 // Code generation.
45 //
46 //   The nodes are divided into four main categories.
47 //   * Choice nodes
48 //        These represent places where the regular expression can
49 //        match in more than one way.  For example on entry to an
50 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
51 //   * Action nodes
52 //        These represent places where some action should be
53 //        performed.  Examples include recording the current position
54 //        in the input string to a register (in order to implement
55 //        captures) or other actions on register for example in order
56 //        to implement the counters needed for {} repetitions.
57 //   * Matching nodes
58 //        These attempt to match some element part of the input string.
59 //        Examples of elements include character classes, plain strings
60 //        or back references.
61 //   * End nodes
62 //        These are used to implement the actions required on finding
63 //        a successful match or failing to find a match.
64 //
65 //   The code generated (whether as byte codes or native code) maintains
66 //   some state as it runs.  This consists of the following elements:
67 //
68 //   * The capture registers.  Used for string captures.
69 //   * Other registers.  Used for counters etc.
70 //   * The current position.
71 //   * The stack of backtracking information.  Used when a matching node
72 //     fails to find a match and needs to try an alternative.
73 //
74 // Conceptual regular expression execution model:
75 //
76 //   There is a simple conceptual model of regular expression execution
77 //   which will be presented first.  The actual code generated is a more
78 //   efficient simulation of the simple conceptual model:
79 //
80 //   * Choice nodes are implemented as follows:
81 //     For each choice except the last {
82 //       push current position
83 //       push backtrack code location
84 //       <generate code to test for choice>
85 //       backtrack code location:
86 //       pop current position
87 //     }
88 //     <generate code to test for last choice>
89 //
90 //   * Actions nodes are generated as follows
91 //     <push affected registers on backtrack stack>
92 //     <generate code to perform action>
93 //     push backtrack code location
94 //     <generate code to test for following nodes>
95 //     backtrack code location:
96 //     <pop affected registers to restore their state>
97 //     <pop backtrack location from stack and go to it>
98 //
99 //   * Matching nodes are generated as follows:
100 //     if input string matches at current position
101 //       update current position
102 //       <generate code to test for following nodes>
103 //     else
104 //       <pop backtrack location from stack and go to it>
105 //
106 //   Thus it can be seen that the current position is saved and restored
107 //   by the choice nodes, whereas the registers are saved and restored by
108 //   by the action nodes that manipulate them.
109 //
110 //   The other interesting aspect of this model is that nodes are generated
111 //   at the point where they are needed by a recursive call to Emit().  If
112 //   the node has already been code generated then the Emit() call will
113 //   generate a jump to the previously generated code instead.  In order to
114 //   limit recursion it is possible for the Emit() function to put the node
115 //   on a work list for later generation and instead generate a jump.  The
116 //   destination of the jump is resolved later when the code is generated.
117 //
118 // Actual regular expression code generation.
119 //
120 //   Code generation is actually more complicated than the above.  In order
121 //   to improve the efficiency of the generated code some optimizations are
122 //   performed
123 //
124 //   * Choice nodes have 1-character lookahead.
125 //     A choice node looks at the following character and eliminates some of
126 //     the choices immediately based on that character.  This is not yet
127 //     implemented.
128 //   * Simple greedy loops store reduced backtracking information.
129 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
130 //     then need to backtrack to a point where it can match "foo".  The naive
131 //     implementation of this would push each character position onto the
132 //     backtracking stack, then pop them off one by one.  This would use space
133 //     proportional to the length of the input string.  However since the "."
134 //     can only match in one way and always has a constant length (in this case
135 //     of 1) it suffices to store the current position on the top of the stack
136 //     once.  Matching now becomes merely incrementing the current position and
137 //     backtracking becomes decrementing the current position and checking the
138 //     result against the stored current position.  This is faster and saves
139 //     space.
140 //   * The current state is virtualized.
141 //     This is used to defer expensive operations until it is clear that they
142 //     are needed and to generate code for a node more than once, allowing
143 //     specialized an efficient versions of the code to be created. This is
144 //     explained in the section below.
145 //
146 // Execution state virtualization.
147 //
148 //   Instead of emitting code, nodes that manipulate the state can record their
149 //   manipulation in an object called the Trace.  The Trace object can record a
150 //   current position offset, an optional backtrack code location on the top of
151 //   the virtualized backtrack stack and some register changes.  When a node is
152 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
153 //   will emit code to bring the actual state into line with the virtual state.
154 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
155 //   registers).  Postponing work can save time when executing the regular
156 //   expression since it may be found that the work never has to be done as a
157 //   failure to match can occur.  In addition it is much faster to jump to a
158 //   known backtrack code location than it is to pop an unknown backtrack
159 //   location from the stack and jump there.
160 //
161 //   The virtual state found in the Trace affects code generation.  For example
162 //   the virtual state contains the difference between the actual current
163 //   position and the virtual current position, and matching code needs to use
164 //   this offset to attempt a match in the correct location of the input
165 //   string.  Therefore code generated for a non-trivial trace is specialized
166 //   to that trace.  The code generator therefore has the ability to generate
167 //   code for each node several times.  In order to limit the size of the
168 //   generated code there is an arbitrary limit on how many specialized sets of
169 //   code may be generated for a given node.  If the limit is reached, the
170 //   trace is flushed and a generic version of the code for a node is emitted.
171 //   This is subsequently used for that node.  The code emitted for non-generic
172 //   trace is not recorded in the node and so it cannot currently be reused in
173 //   the event that code generation is requested for an identical trace.
174 
175 namespace {
176 
MaxCodeUnit(const bool one_byte)177 constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
178   STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
179                 std::numeric_limits<uint16_t>::max());
180   STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
181                 std::numeric_limits<uint16_t>::max());
182   return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
183 }
184 
CharMask(const bool one_byte)185 constexpr uint32_t CharMask(const bool one_byte) {
186   STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
187   STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
188   return MaxCodeUnit(one_byte);
189 }
190 
191 }  // namespace
192 
AppendToText(RegExpText * text,Zone * zone)193 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
194 
AppendToText(RegExpText * text,Zone * zone)195 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
196   text->AddElement(TextElement::Atom(this), zone);
197 }
198 
AppendToText(RegExpText * text,Zone * zone)199 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
200   text->AddElement(TextElement::CharClass(this), zone);
201 }
202 
AppendToText(RegExpText * text,Zone * zone)203 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
204   for (int i = 0; i < elements()->length(); i++)
205     text->AddElement(elements()->at(i), zone);
206 }
207 
Atom(RegExpAtom * atom)208 TextElement TextElement::Atom(RegExpAtom* atom) {
209   return TextElement(ATOM, atom);
210 }
211 
CharClass(RegExpCharacterClass * char_class)212 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
213   return TextElement(CHAR_CLASS, char_class);
214 }
215 
length() const216 int TextElement::length() const {
217   switch (text_type()) {
218     case ATOM:
219       return atom()->length();
220 
221     case CHAR_CLASS:
222       return 1;
223   }
224   UNREACHABLE();
225 }
226 
227 class RecursionCheck {
228  public:
RecursionCheck(RegExpCompiler * compiler)229   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
230     compiler->IncrementRecursionDepth();
231   }
~RecursionCheck()232   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
233 
234  private:
235   RegExpCompiler* compiler_;
236 };
237 
238 // Attempts to compile the regexp using an Irregexp code generator.  Returns
239 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,RegExpFlags flags,bool one_byte)240 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
241                                RegExpFlags flags, bool one_byte)
242     : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
243       unicode_lookaround_stack_register_(kNoRegister),
244       unicode_lookaround_position_register_(kNoRegister),
245       work_list_(nullptr),
246       recursion_depth_(0),
247       flags_(flags),
248       one_byte_(one_byte),
249       reg_exp_too_big_(false),
250       limiting_recursion_(false),
251       optimize_(FLAG_regexp_optimization),
252       read_backward_(false),
253       current_expansion_factor_(1),
254       frequency_collator_(),
255       isolate_(isolate),
256       zone_(zone) {
257   accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
258   DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
259 }
260 
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)261 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
262     Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
263     int capture_count, Handle<String> pattern) {
264   macro_assembler_ = macro_assembler;
265 
266   ZoneVector<RegExpNode*> work_list(zone());
267   work_list_ = &work_list;
268   Label fail;
269   macro_assembler_->PushBacktrack(&fail);
270   Trace new_trace;
271   start->Emit(this, &new_trace);
272   macro_assembler_->BindJumpTarget(&fail);
273   macro_assembler_->Fail();
274   while (!work_list.empty()) {
275     RegExpNode* node = work_list.back();
276     work_list.pop_back();
277     node->set_on_work_list(false);
278     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
279   }
280   if (reg_exp_too_big_) {
281     if (FLAG_correctness_fuzzer_suppressions) {
282       FATAL("Aborting on excess zone allocation");
283     }
284     macro_assembler_->AbortedCodeGeneration();
285     return CompilationResult::RegExpTooBig();
286   }
287 
288   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
289   isolate->IncreaseTotalRegexpCodeGenerated(code);
290   work_list_ = nullptr;
291 
292   return {code, next_register_};
293 }
294 
Mentions(int that)295 bool Trace::DeferredAction::Mentions(int that) {
296   if (action_type() == ActionNode::CLEAR_CAPTURES) {
297     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
298     return range.Contains(that);
299   } else {
300     return reg() == that;
301   }
302 }
303 
mentions_reg(int reg)304 bool Trace::mentions_reg(int reg) {
305   for (DeferredAction* action = actions_; action != nullptr;
306        action = action->next()) {
307     if (action->Mentions(reg)) return true;
308   }
309   return false;
310 }
311 
GetStoredPosition(int reg,int * cp_offset)312 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
313   DCHECK_EQ(0, *cp_offset);
314   for (DeferredAction* action = actions_; action != nullptr;
315        action = action->next()) {
316     if (action->Mentions(reg)) {
317       if (action->action_type() == ActionNode::STORE_POSITION) {
318         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
319         return true;
320       } else {
321         return false;
322       }
323     }
324   }
325   return false;
326 }
327 
328 // A (dynamically-sized) set of unsigned integers that behaves especially well
329 // on small integers (< kFirstLimit). May do zone-allocation.
330 class DynamicBitSet : public ZoneObject {
331  public:
Get(unsigned value) const332   V8_EXPORT_PRIVATE bool Get(unsigned value) const {
333     if (value < kFirstLimit) {
334       return (first_ & (1 << value)) != 0;
335     } else if (remaining_ == nullptr) {
336       return false;
337     } else {
338       return remaining_->Contains(value);
339     }
340   }
341 
342   // Destructively set a value in this set.
Set(unsigned value,Zone * zone)343   void Set(unsigned value, Zone* zone) {
344     if (value < kFirstLimit) {
345       first_ |= (1 << value);
346     } else {
347       if (remaining_ == nullptr)
348         remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
349       if (remaining_->is_empty() || !remaining_->Contains(value))
350         remaining_->Add(value, zone);
351     }
352   }
353 
354  private:
355   static constexpr unsigned kFirstLimit = 32;
356 
357   uint32_t first_ = 0;
358   ZoneList<unsigned>* remaining_ = nullptr;
359 };
360 
FindAffectedRegisters(DynamicBitSet * affected_registers,Zone * zone)361 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
362                                  Zone* zone) {
363   int max_register = RegExpCompiler::kNoRegister;
364   for (DeferredAction* action = actions_; action != nullptr;
365        action = action->next()) {
366     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
367       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
368       for (int i = range.from(); i <= range.to(); i++)
369         affected_registers->Set(i, zone);
370       if (range.to() > max_register) max_register = range.to();
371     } else {
372       affected_registers->Set(action->reg(), zone);
373       if (action->reg() > max_register) max_register = action->reg();
374     }
375   }
376   return max_register;
377 }
378 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & registers_to_pop,const DynamicBitSet & registers_to_clear)379 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
380                                      int max_register,
381                                      const DynamicBitSet& registers_to_pop,
382                                      const DynamicBitSet& registers_to_clear) {
383   for (int reg = max_register; reg >= 0; reg--) {
384     if (registers_to_pop.Get(reg)) {
385       assembler->PopRegister(reg);
386     } else if (registers_to_clear.Get(reg)) {
387       int clear_to = reg;
388       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
389         reg--;
390       }
391       assembler->ClearRegisters(reg, clear_to);
392     }
393   }
394 }
395 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & affected_registers,DynamicBitSet * registers_to_pop,DynamicBitSet * registers_to_clear,Zone * zone)396 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
397                                    int max_register,
398                                    const DynamicBitSet& affected_registers,
399                                    DynamicBitSet* registers_to_pop,
400                                    DynamicBitSet* registers_to_clear,
401                                    Zone* zone) {
402   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
403   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
404 
405   // Count pushes performed to force a stack limit check occasionally.
406   int pushes = 0;
407 
408   for (int reg = 0; reg <= max_register; reg++) {
409     if (!affected_registers.Get(reg)) continue;
410 
411     // The chronologically first deferred action in the trace
412     // is used to infer the action needed to restore a register
413     // to its previous state (or not, if it's safe to ignore it).
414     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
415     DeferredActionUndoType undo_action = IGNORE;
416 
417     int value = 0;
418     bool absolute = false;
419     bool clear = false;
420     static const int kNoStore = kMinInt;
421     int store_position = kNoStore;
422     // This is a little tricky because we are scanning the actions in reverse
423     // historical order (newest first).
424     for (DeferredAction* action = actions_; action != nullptr;
425          action = action->next()) {
426       if (action->Mentions(reg)) {
427         switch (action->action_type()) {
428           case ActionNode::SET_REGISTER_FOR_LOOP: {
429             Trace::DeferredSetRegisterForLoop* psr =
430                 static_cast<Trace::DeferredSetRegisterForLoop*>(action);
431             if (!absolute) {
432               value += psr->value();
433               absolute = true;
434             }
435             // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
436             // counters. They can have a significant previous value if they
437             // occur in a loop. TODO(lrn): Propagate this information, so
438             // we can set undo_action to IGNORE if we know there is no value to
439             // restore.
440             undo_action = RESTORE;
441             DCHECK_EQ(store_position, kNoStore);
442             DCHECK(!clear);
443             break;
444           }
445           case ActionNode::INCREMENT_REGISTER:
446             if (!absolute) {
447               value++;
448             }
449             DCHECK_EQ(store_position, kNoStore);
450             DCHECK(!clear);
451             undo_action = RESTORE;
452             break;
453           case ActionNode::STORE_POSITION: {
454             Trace::DeferredCapture* pc =
455                 static_cast<Trace::DeferredCapture*>(action);
456             if (!clear && store_position == kNoStore) {
457               store_position = pc->cp_offset();
458             }
459 
460             // For captures we know that stores and clears alternate.
461             // Other register, are never cleared, and if the occur
462             // inside a loop, they might be assigned more than once.
463             if (reg <= 1) {
464               // Registers zero and one, aka "capture zero", is
465               // always set correctly if we succeed. There is no
466               // need to undo a setting on backtrack, because we
467               // will set it again or fail.
468               undo_action = IGNORE;
469             } else {
470               undo_action = pc->is_capture() ? CLEAR : RESTORE;
471             }
472             DCHECK(!absolute);
473             DCHECK_EQ(value, 0);
474             break;
475           }
476           case ActionNode::CLEAR_CAPTURES: {
477             // Since we're scanning in reverse order, if we've already
478             // set the position we have to ignore historically earlier
479             // clearing operations.
480             if (store_position == kNoStore) {
481               clear = true;
482             }
483             undo_action = RESTORE;
484             DCHECK(!absolute);
485             DCHECK_EQ(value, 0);
486             break;
487           }
488           default:
489             UNREACHABLE();
490         }
491       }
492     }
493     // Prepare for the undo-action (e.g., push if it's going to be popped).
494     if (undo_action == RESTORE) {
495       pushes++;
496       RegExpMacroAssembler::StackCheckFlag stack_check =
497           RegExpMacroAssembler::kNoStackLimitCheck;
498       if (pushes == push_limit) {
499         stack_check = RegExpMacroAssembler::kCheckStackLimit;
500         pushes = 0;
501       }
502 
503       assembler->PushRegister(reg, stack_check);
504       registers_to_pop->Set(reg, zone);
505     } else if (undo_action == CLEAR) {
506       registers_to_clear->Set(reg, zone);
507     }
508     // Perform the chronologically last action (or accumulated increment)
509     // for the register.
510     if (store_position != kNoStore) {
511       assembler->WriteCurrentPositionToRegister(reg, store_position);
512     } else if (clear) {
513       assembler->ClearRegisters(reg, reg);
514     } else if (absolute) {
515       assembler->SetRegister(reg, value);
516     } else if (value != 0) {
517       assembler->AdvanceRegister(reg, value);
518     }
519   }
520 }
521 
522 // This is called as we come into a loop choice node and some other tricky
523 // nodes.  It normalizes the state of the code generator to ensure we can
524 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)525 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
526   RegExpMacroAssembler* assembler = compiler->macro_assembler();
527 
528   DCHECK(!is_trivial());
529 
530   if (actions_ == nullptr && backtrack() == nullptr) {
531     // Here we just have some deferred cp advances to fix and we are back to
532     // a normal situation.  We may also have to forget some information gained
533     // through a quick check that was already performed.
534     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
535     // Create a new trivial state and generate the node with that.
536     Trace new_state;
537     successor->Emit(compiler, &new_state);
538     return;
539   }
540 
541   // Generate deferred actions here along with code to undo them again.
542   DynamicBitSet affected_registers;
543 
544   if (backtrack() != nullptr) {
545     // Here we have a concrete backtrack location.  These are set up by choice
546     // nodes and so they indicate that we have a deferred save of the current
547     // position which we may need to emit here.
548     assembler->PushCurrentPosition();
549   }
550 
551   int max_register =
552       FindAffectedRegisters(&affected_registers, compiler->zone());
553   DynamicBitSet registers_to_pop;
554   DynamicBitSet registers_to_clear;
555   PerformDeferredActions(assembler, max_register, affected_registers,
556                          &registers_to_pop, &registers_to_clear,
557                          compiler->zone());
558   if (cp_offset_ != 0) {
559     assembler->AdvanceCurrentPosition(cp_offset_);
560   }
561 
562   // Create a new trivial state and generate the node with that.
563   Label undo;
564   assembler->PushBacktrack(&undo);
565   if (successor->KeepRecursing(compiler)) {
566     Trace new_state;
567     successor->Emit(compiler, &new_state);
568   } else {
569     compiler->AddWork(successor);
570     assembler->GoTo(successor->label());
571   }
572 
573   // On backtrack we need to restore state.
574   assembler->BindJumpTarget(&undo);
575   RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
576                            registers_to_clear);
577   if (backtrack() == nullptr) {
578     assembler->Backtrack();
579   } else {
580     assembler->PopCurrentPosition();
581     assembler->GoTo(backtrack());
582   }
583 }
584 
Emit(RegExpCompiler * compiler,Trace * trace)585 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
586   RegExpMacroAssembler* assembler = compiler->macro_assembler();
587 
588   // Omit flushing the trace. We discard the entire stack frame anyway.
589 
590   if (!label()->is_bound()) {
591     // We are completely independent of the trace, since we ignore it,
592     // so this code can be used as the generic version.
593     assembler->Bind(label());
594   }
595 
596   // Throw away everything on the backtrack stack since the start
597   // of the negative submatch and restore the character position.
598   assembler->ReadCurrentPositionFromRegister(current_position_register_);
599   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
600   if (clear_capture_count_ > 0) {
601     // Clear any captures that might have been performed during the success
602     // of the body of the negative look-ahead.
603     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
604     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
605   }
606   // Now that we have unwound the stack we find at the top of the stack the
607   // backtrack that the BeginNegativeSubmatch node got.
608   assembler->Backtrack();
609 }
610 
Emit(RegExpCompiler * compiler,Trace * trace)611 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
612   if (!trace->is_trivial()) {
613     trace->Flush(compiler, this);
614     return;
615   }
616   RegExpMacroAssembler* assembler = compiler->macro_assembler();
617   if (!label()->is_bound()) {
618     assembler->Bind(label());
619   }
620   switch (action_) {
621     case ACCEPT:
622       assembler->Succeed();
623       return;
624     case BACKTRACK:
625       assembler->GoTo(trace->backtrack());
626       return;
627     case NEGATIVE_SUBMATCH_SUCCESS:
628       // This case is handled in a different virtual method.
629       UNREACHABLE();
630   }
631   UNIMPLEMENTED();
632 }
633 
AddGuard(Guard * guard,Zone * zone)634 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
635   if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
636   guards_->Add(guard, zone);
637 }
638 
SetRegisterForLoop(int reg,int val,RegExpNode * on_success)639 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
640                                            RegExpNode* on_success) {
641   ActionNode* result =
642       on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
643   result->data_.u_store_register.reg = reg;
644   result->data_.u_store_register.value = val;
645   return result;
646 }
647 
IncrementRegister(int reg,RegExpNode * on_success)648 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
649   ActionNode* result =
650       on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success);
651   result->data_.u_increment_register.reg = reg;
652   return result;
653 }
654 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)655 ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
656                                       RegExpNode* on_success) {
657   ActionNode* result =
658       on_success->zone()->New<ActionNode>(STORE_POSITION, on_success);
659   result->data_.u_position_register.reg = reg;
660   result->data_.u_position_register.is_capture = is_capture;
661   return result;
662 }
663 
ClearCaptures(Interval range,RegExpNode * on_success)664 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
665   ActionNode* result =
666       on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success);
667   result->data_.u_clear_captures.range_from = range.from();
668   result->data_.u_clear_captures.range_to = range.to();
669   return result;
670 }
671 
BeginPositiveSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)672 ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
673                                               RegExpNode* on_success) {
674   ActionNode* result =
675       on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success);
676   result->data_.u_submatch.stack_pointer_register = stack_reg;
677   result->data_.u_submatch.current_position_register = position_reg;
678   return result;
679 }
680 
BeginNegativeSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)681 ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
682                                               RegExpNode* on_success) {
683   ActionNode* result =
684       on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
685   result->data_.u_submatch.stack_pointer_register = stack_reg;
686   result->data_.u_submatch.current_position_register = position_reg;
687   return result;
688 }
689 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)690 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
691                                                 int clear_register_count,
692                                                 int clear_register_from,
693                                                 RegExpNode* on_success) {
694   ActionNode* result = on_success->zone()->New<ActionNode>(
695       POSITIVE_SUBMATCH_SUCCESS, on_success);
696   result->data_.u_submatch.stack_pointer_register = stack_reg;
697   result->data_.u_submatch.current_position_register = position_reg;
698   result->data_.u_submatch.clear_register_count = clear_register_count;
699   result->data_.u_submatch.clear_register_from = clear_register_from;
700   return result;
701 }
702 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)703 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
704                                         int repetition_register,
705                                         int repetition_limit,
706                                         RegExpNode* on_success) {
707   ActionNode* result =
708       on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
709   result->data_.u_empty_match_check.start_register = start_register;
710   result->data_.u_empty_match_check.repetition_register = repetition_register;
711   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
712   return result;
713 }
714 
715 #define DEFINE_ACCEPT(Type) \
716   void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)717 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
718 #undef DEFINE_ACCEPT
719 
720 // -------------------------------------------------------------------
721 // Emit code.
722 
723 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
724                                Guard* guard, Trace* trace) {
725   switch (guard->op()) {
726     case Guard::LT:
727       DCHECK(!trace->mentions_reg(guard->reg()));
728       macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
729                                     trace->backtrack());
730       break;
731     case Guard::GEQ:
732       DCHECK(!trace->mentions_reg(guard->reg()));
733       macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
734                                     trace->backtrack());
735       break;
736   }
737 }
738 
739 namespace {
740 
741 #ifdef DEBUG
ContainsOnlyUtf16CodeUnits(unibrow::uchar * chars,int length)742 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
743   STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
744   for (int i = 0; i < length; i++) {
745     if (chars[i] > String::kMaxUtf16CodeUnit) return false;
746   }
747   return true;
748 }
749 #endif  // DEBUG
750 
751 // Returns the number of characters in the equivalence class, omitting those
752 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,base::uc16 character,bool one_byte_subject,unibrow::uchar * letters,int letter_length)753 int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
754                               bool one_byte_subject, unibrow::uchar* letters,
755                               int letter_length) {
756 #ifdef V8_INTL_SUPPORT
757   if (RegExpCaseFolding::IgnoreSet().contains(character)) {
758     letters[0] = character;
759     DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
760     return 1;
761   }
762   bool in_special_add_set =
763       RegExpCaseFolding::SpecialAddSet().contains(character);
764 
765   icu::UnicodeSet set;
766   set.add(character);
767   set = set.closeOver(USET_CASE_INSENSITIVE);
768 
769   UChar32 canon = 0;
770   if (in_special_add_set) {
771     canon = RegExpCaseFolding::Canonicalize(character);
772   }
773 
774   int32_t range_count = set.getRangeCount();
775   int items = 0;
776   for (int32_t i = 0; i < range_count; i++) {
777     UChar32 start = set.getRangeStart(i);
778     UChar32 end = set.getRangeEnd(i);
779     CHECK(end - start + items <= letter_length);
780     for (UChar32 cu = start; cu <= end; cu++) {
781       if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
782       if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
783         continue;
784       }
785       letters[items++] = static_cast<unibrow::uchar>(cu);
786     }
787   }
788   DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
789   return items;
790 #else
791   int length =
792       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
793   // Unibrow returns 0 or 1 for characters where case independence is
794   // trivial.
795   if (length == 0) {
796     letters[0] = character;
797     length = 1;
798   }
799 
800   if (one_byte_subject) {
801     int new_length = 0;
802     for (int i = 0; i < length; i++) {
803       if (letters[i] <= String::kMaxOneByteCharCode) {
804         letters[new_length++] = letters[i];
805       }
806     }
807     length = new_length;
808   }
809 
810   DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
811   return length;
812 #endif  // V8_INTL_SUPPORT
813 }
814 
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,base::uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)815 inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
816                                 base::uc16 c, Label* on_failure, int cp_offset,
817                                 bool check, bool preloaded) {
818   RegExpMacroAssembler* assembler = compiler->macro_assembler();
819   bool bound_checked = false;
820   if (!preloaded) {
821     assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
822     bound_checked = true;
823   }
824   assembler->CheckNotCharacter(c, on_failure);
825   return bound_checked;
826 }
827 
828 // Only emits non-letters (things that don't have case).  Only used for case
829 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,base::uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)830 inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
831                               base::uc16 c, Label* on_failure, int cp_offset,
832                               bool check, bool preloaded) {
833   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
834   bool one_byte = compiler->one_byte();
835   unibrow::uchar chars[4];
836   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
837   if (length < 1) {
838     // This can't match.  Must be an one-byte subject and a non-one-byte
839     // character.  We do not need to do anything since the one-byte pass
840     // already handled this.
841     return false;  // Bounds not checked.
842   }
843   bool checked = false;
844   // We handle the length > 1 case in a later pass.
845   if (length == 1) {
846     if (one_byte && c > String::kMaxOneByteCharCodeU) {
847       // Can't match - see above.
848       return false;  // Bounds not checked.
849     }
850     if (!preloaded) {
851       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
852       checked = check;
853     }
854     macro_assembler->CheckNotCharacter(c, on_failure);
855   }
856   return checked;
857 }
858 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,base::uc16 c1,base::uc16 c2,Label * on_failure)859 bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
860                                bool one_byte, base::uc16 c1, base::uc16 c2,
861                                Label* on_failure) {
862   const uint32_t char_mask = CharMask(one_byte);
863   base::uc16 exor = c1 ^ c2;
864   // Check whether exor has only one bit set.
865   if (((exor - 1) & exor) == 0) {
866     // If c1 and c2 differ only by one bit.
867     // Ecma262UnCanonicalize always gives the highest number last.
868     DCHECK(c2 > c1);
869     base::uc16 mask = char_mask ^ exor;
870     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
871     return true;
872   }
873   DCHECK(c2 > c1);
874   base::uc16 diff = c2 - c1;
875   if (((diff - 1) & diff) == 0 && c1 >= diff) {
876     // If the characters differ by 2^n but don't differ by one bit then
877     // subtract the difference from the found character, then do the or
878     // trick.  We avoid the theoretical case where negative numbers are
879     // involved in order to simplify code generation.
880     base::uc16 mask = char_mask ^ diff;
881     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
882                                                     on_failure);
883     return true;
884   }
885   return false;
886 }
887 
888 // Only emits letters (things that have case).  Only used for case independent
889 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,base::uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)890 inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
891                            base::uc16 c, Label* on_failure, int cp_offset,
892                            bool check, bool preloaded) {
893   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
894   bool one_byte = compiler->one_byte();
895   unibrow::uchar chars[4];
896   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
897   if (length <= 1) return false;
898   // We may not need to check against the end of the input string
899   // if this character lies before a character that matched.
900   if (!preloaded) {
901     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
902   }
903   Label ok;
904   switch (length) {
905     case 2: {
906       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
907                                     chars[1], on_failure)) {
908       } else {
909         macro_assembler->CheckCharacter(chars[0], &ok);
910         macro_assembler->CheckNotCharacter(chars[1], on_failure);
911         macro_assembler->Bind(&ok);
912       }
913       break;
914     }
915     case 4:
916       macro_assembler->CheckCharacter(chars[3], &ok);
917       V8_FALLTHROUGH;
918     case 3:
919       macro_assembler->CheckCharacter(chars[0], &ok);
920       macro_assembler->CheckCharacter(chars[1], &ok);
921       macro_assembler->CheckNotCharacter(chars[2], on_failure);
922       macro_assembler->Bind(&ok);
923       break;
924     default:
925       UNREACHABLE();
926   }
927   return true;
928 }
929 
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)930 void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
931                       Label* fall_through, Label* above_or_equal,
932                       Label* below) {
933   if (below != fall_through) {
934     masm->CheckCharacterLT(border, below);
935     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
936   } else {
937     masm->CheckCharacterGT(border - 1, above_or_equal);
938   }
939 }
940 
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)941 void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
942                             Label* fall_through, Label* in_range,
943                             Label* out_of_range) {
944   if (in_range == fall_through) {
945     if (first == last) {
946       masm->CheckNotCharacter(first, out_of_range);
947     } else {
948       masm->CheckCharacterNotInRange(first, last, out_of_range);
949     }
950   } else {
951     if (first == last) {
952       masm->CheckCharacter(first, in_range);
953     } else {
954       masm->CheckCharacterInRange(first, last, in_range);
955     }
956     if (out_of_range != fall_through) masm->GoTo(out_of_range);
957   }
958 }
959 
960 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
961 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,base::uc32 min_char,Label * fall_through,Label * even_label,Label * odd_label)962 void EmitUseLookupTable(RegExpMacroAssembler* masm,
963                         ZoneList<base::uc32>* ranges, uint32_t start_index,
964                         uint32_t end_index, base::uc32 min_char,
965                         Label* fall_through, Label* even_label,
966                         Label* odd_label) {
967   static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
968   static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
969 
970   base::uc32 base = (min_char & ~kMask);
971   USE(base);
972 
973   // Assert that everything is on one kTableSize page.
974   for (uint32_t i = start_index; i <= end_index; i++) {
975     DCHECK_EQ(ranges->at(i) & ~kMask, base);
976   }
977   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
978 
979   char templ[kSize];
980   Label* on_bit_set;
981   Label* on_bit_clear;
982   int bit;
983   if (even_label == fall_through) {
984     on_bit_set = odd_label;
985     on_bit_clear = even_label;
986     bit = 1;
987   } else {
988     on_bit_set = even_label;
989     on_bit_clear = odd_label;
990     bit = 0;
991   }
992   for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
993        i++) {
994     templ[i] = bit;
995   }
996   uint32_t j = 0;
997   bit ^= 1;
998   for (uint32_t i = start_index; i < end_index; i++) {
999     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1000       templ[j] = bit;
1001     }
1002     bit ^= 1;
1003   }
1004   for (uint32_t i = j; i < kSize; i++) {
1005     templ[i] = bit;
1006   }
1007   Factory* factory = masm->isolate()->factory();
1008   // TODO(erikcorry): Cache these.
1009   Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1010   for (uint32_t i = 0; i < kSize; i++) {
1011     ba->set(i, templ[i]);
1012   }
1013   masm->CheckBitInTable(ba, on_bit_set);
1014   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1015 }
1016 
CutOutRange(RegExpMacroAssembler * masm,ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t cut_index,Label * even_label,Label * odd_label)1017 void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
1018                  uint32_t start_index, uint32_t end_index, uint32_t cut_index,
1019                  Label* even_label, Label* odd_label) {
1020   bool odd = (((cut_index - start_index) & 1) == 1);
1021   Label* in_range_label = odd ? odd_label : even_label;
1022   Label dummy;
1023   EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
1024                          ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
1025                          &dummy);
1026   DCHECK(!dummy.is_linked());
1027   // Cut out the single range by rewriting the array.  This creates a new
1028   // range that is a merger of the two ranges on either side of the one we
1029   // are cutting out.  The oddity of the labels is preserved.
1030   for (uint32_t j = cut_index; j > start_index; j--) {
1031     ranges->at(j) = ranges->at(j - 1);
1032   }
1033   for (uint32_t j = cut_index + 1; j < end_index; j++) {
1034     ranges->at(j) = ranges->at(j + 1);
1035   }
1036 }
1037 
1038 // Unicode case.  Split the search space into kSize spaces that are handled
1039 // with recursion.
SplitSearchSpace(ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t * new_start_index,uint32_t * new_end_index,base::uc32 * border)1040 void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
1041                       uint32_t end_index, uint32_t* new_start_index,
1042                       uint32_t* new_end_index, base::uc32* border) {
1043   static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
1044   static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
1045 
1046   base::uc32 first = ranges->at(start_index);
1047   base::uc32 last = ranges->at(end_index) - 1;
1048 
1049   *new_start_index = start_index;
1050   *border = (ranges->at(start_index) & ~kMask) + kSize;
1051   while (*new_start_index < end_index) {
1052     if (ranges->at(*new_start_index) > *border) break;
1053     (*new_start_index)++;
1054   }
1055   // new_start_index is the index of the first edge that is beyond the
1056   // current kSize space.
1057 
1058   // For very large search spaces we do a binary chop search of the non-Latin1
1059   // space instead of just going to the end of the current kSize space.  The
1060   // heuristics are complicated a little by the fact that any 128-character
1061   // encoding space can be quickly tested with a table lookup, so we don't
1062   // wish to do binary chop search at a smaller granularity than that.  A
1063   // 128-character space can take up a lot of space in the ranges array if,
1064   // for example, we only want to match every second character (eg. the lower
1065   // case characters on some Unicode pages).
1066   uint32_t binary_chop_index = (end_index + start_index) / 2;
1067   // The first test ensures that we get to the code that handles the Latin1
1068   // range with a single not-taken branch, speeding up this important
1069   // character range (even non-Latin1 charset-based text has spaces and
1070   // punctuation).
1071   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1072       end_index - start_index > (*new_start_index - start_index) * 2 &&
1073       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1074       ranges->at(binary_chop_index) >= first + 2 * kSize) {
1075     uint32_t scan_forward_for_section_border = binary_chop_index;
1076     uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1077 
1078     while (scan_forward_for_section_border < end_index) {
1079       if (ranges->at(scan_forward_for_section_border) > new_border) {
1080         *new_start_index = scan_forward_for_section_border;
1081         *border = new_border;
1082         break;
1083       }
1084       scan_forward_for_section_border++;
1085     }
1086   }
1087 
1088   DCHECK(*new_start_index > start_index);
1089   *new_end_index = *new_start_index - 1;
1090   if (ranges->at(*new_end_index) == *border) {
1091     (*new_end_index)--;
1092   }
1093   if (*border >= ranges->at(end_index)) {
1094     *border = ranges->at(end_index);
1095     *new_start_index = end_index;  // Won't be used.
1096     *new_end_index = end_index - 1;
1097   }
1098 }
1099 
1100 // Gets a series of segment boundaries representing a character class.  If the
1101 // character is in the range between an even and an odd boundary (counting from
1102 // start_index) then go to even_label, otherwise go to odd_label.  We already
1103 // know that the character is in the range of min_char to max_char inclusive.
1104 // Either label can be nullptr indicating backtracking.  Either label can also
1105 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,base::uc32 min_char,base::uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1106 void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
1107                       uint32_t start_index, uint32_t end_index,
1108                       base::uc32 min_char, base::uc32 max_char,
1109                       Label* fall_through, Label* even_label,
1110                       Label* odd_label) {
1111   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1112   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1113 
1114   base::uc32 first = ranges->at(start_index);
1115   base::uc32 last = ranges->at(end_index) - 1;
1116 
1117   DCHECK_LT(min_char, first);
1118 
1119   // Just need to test if the character is before or on-or-after
1120   // a particular character.
1121   if (start_index == end_index) {
1122     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1123     return;
1124   }
1125 
1126   // Another almost trivial case:  There is one interval in the middle that is
1127   // different from the end intervals.
1128   if (start_index + 1 == end_index) {
1129     EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1130                            odd_label);
1131     return;
1132   }
1133 
1134   // It's not worth using table lookup if there are very few intervals in the
1135   // character class.
1136   if (end_index - start_index <= 6) {
1137     // It is faster to test for individual characters, so we look for those
1138     // first, then try arbitrary ranges in the second round.
1139     static uint32_t kNoCutIndex = -1;
1140     uint32_t cut = kNoCutIndex;
1141     for (uint32_t i = start_index; i < end_index; i++) {
1142       if (ranges->at(i) == ranges->at(i + 1) - 1) {
1143         cut = i;
1144         break;
1145       }
1146     }
1147     if (cut == kNoCutIndex) cut = start_index;
1148     CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1149                 odd_label);
1150     DCHECK_GE(end_index - start_index, 2);
1151     GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1152                      max_char, fall_through, even_label, odd_label);
1153     return;
1154   }
1155 
1156   // If there are a lot of intervals in the regexp, then we will use tables to
1157   // determine whether the character is inside or outside the character class.
1158   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1159 
1160   if ((max_char >> kBits) == (min_char >> kBits)) {
1161     EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1162                        fall_through, even_label, odd_label);
1163     return;
1164   }
1165 
1166   if ((min_char >> kBits) != first >> kBits) {
1167     masm->CheckCharacterLT(first, odd_label);
1168     GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1169                      fall_through, odd_label, even_label);
1170     return;
1171   }
1172 
1173   uint32_t new_start_index = 0;
1174   uint32_t new_end_index = 0;
1175   base::uc32 border = 0;
1176 
1177   SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1178                    &new_end_index, &border);
1179 
1180   Label handle_rest;
1181   Label* above = &handle_rest;
1182   if (border == last + 1) {
1183     // We didn't find any section that started after the limit, so everything
1184     // above the border is one of the terminal labels.
1185     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1186     DCHECK(new_end_index == end_index - 1);
1187   }
1188 
1189   DCHECK_LE(start_index, new_end_index);
1190   DCHECK_LE(new_start_index, end_index);
1191   DCHECK_LT(start_index, new_start_index);
1192   DCHECK_LT(new_end_index, end_index);
1193   DCHECK(new_end_index + 1 == new_start_index ||
1194          (new_end_index + 2 == new_start_index &&
1195           border == ranges->at(new_end_index + 1)));
1196   DCHECK_LT(min_char, border - 1);
1197   DCHECK_LT(border, max_char);
1198   DCHECK_LT(ranges->at(new_end_index), border);
1199   DCHECK(border < ranges->at(new_start_index) ||
1200          (border == ranges->at(new_start_index) &&
1201           new_start_index == end_index && new_end_index == end_index - 1 &&
1202           border == last + 1));
1203   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1204 
1205   masm->CheckCharacterGT(border - 1, above);
1206   Label dummy;
1207   GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1208                    border - 1, &dummy, even_label, odd_label);
1209   if (handle_rest.is_linked()) {
1210     masm->Bind(&handle_rest);
1211     bool flip = (new_start_index & 1) != (start_index & 1);
1212     GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1213                      &dummy, flip ? odd_label : even_label,
1214                      flip ? even_label : odd_label);
1215   }
1216 }
1217 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)1218 void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1219                    RegExpCharacterClass* cc, bool one_byte, Label* on_failure,
1220                    int cp_offset, bool check_offset, bool preloaded,
1221                    Zone* zone) {
1222   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
1223   CharacterRange::Canonicalize(ranges);
1224 
1225   // Now that all processing (like case-insensitivity) is done, clamp the
1226   // ranges to the set of ranges that may actually occur in the subject string.
1227   if (one_byte) CharacterRange::ClampToOneByte(ranges);
1228 
1229   const int ranges_length = ranges->length();
1230   if (ranges_length == 0) {
1231     if (!cc->is_negated()) {
1232       macro_assembler->GoTo(on_failure);
1233     }
1234     if (check_offset) {
1235       macro_assembler->CheckPosition(cp_offset, on_failure);
1236     }
1237     return;
1238   }
1239 
1240   const base::uc32 max_char = MaxCodeUnit(one_byte);
1241   if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
1242     if (cc->is_negated()) {
1243       macro_assembler->GoTo(on_failure);
1244     } else {
1245       // This is a common case hit by non-anchored expressions.
1246       if (check_offset) {
1247         macro_assembler->CheckPosition(cp_offset, on_failure);
1248       }
1249     }
1250     return;
1251   }
1252 
1253   if (!preloaded) {
1254     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1255   }
1256 
1257   if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass(
1258                                    cc->standard_type(), on_failure)) {
1259     return;
1260   }
1261 
1262   static constexpr int kMaxRangesForInlineBranchGeneration = 16;
1263   if (ranges_length > kMaxRangesForInlineBranchGeneration) {
1264     // For large range sets, emit a more compact instruction sequence to avoid
1265     // a potentially problematic increase in code size.
1266     // Note the flipped logic below (we check InRange if negated, NotInRange if
1267     // not negated); this is necessary since the method falls through on
1268     // failure whereas we want to fall through on success.
1269     if (cc->is_negated()) {
1270       if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
1271         return;
1272       }
1273     } else {
1274       if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
1275         return;
1276       }
1277     }
1278   }
1279 
1280   // Generate a flat list of range boundaries for consumption by
1281   // GenerateBranches. See the comment on that function for how the list should
1282   // be structured
1283   ZoneList<base::uc32>* range_boundaries =
1284       zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
1285 
1286   bool zeroth_entry_is_failure = !cc->is_negated();
1287 
1288   for (int i = 0; i < ranges_length; i++) {
1289     CharacterRange& range = ranges->at(i);
1290     if (range.from() == 0) {
1291       DCHECK_EQ(i, 0);
1292       zeroth_entry_is_failure = !zeroth_entry_is_failure;
1293     } else {
1294       range_boundaries->Add(range.from(), zone);
1295     }
1296     // `+ 1` to convert from inclusive to exclusive `to`.
1297     // [from, to] == [from, to+1[.
1298     range_boundaries->Add(range.to() + 1, zone);
1299   }
1300   int end_index = range_boundaries->length() - 1;
1301   if (range_boundaries->at(end_index) > max_char) {
1302     end_index--;
1303   }
1304 
1305   Label fall_through;
1306   GenerateBranches(macro_assembler, range_boundaries,
1307                    0,  // start_index.
1308                    end_index,
1309                    0,  // min_char.
1310                    max_char, &fall_through,
1311                    zeroth_entry_is_failure ? &fall_through : on_failure,
1312                    zeroth_entry_is_failure ? on_failure : &fall_through);
1313   macro_assembler->Bind(&fall_through);
1314 }
1315 
1316 }  // namespace
1317 
1318 RegExpNode::~RegExpNode() = default;
1319 
LimitVersions(RegExpCompiler * compiler,Trace * trace)1320 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1321                                                   Trace* trace) {
1322   // If we are generating a greedy loop then don't stop and don't reuse code.
1323   if (trace->stop_node() != nullptr) {
1324     return CONTINUE;
1325   }
1326 
1327   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1328   if (trace->is_trivial()) {
1329     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
1330       // If a generic version is already scheduled to be generated or we have
1331       // recursed too deeply then just generate a jump to that code.
1332       macro_assembler->GoTo(&label_);
1333       // This will queue it up for generation of a generic version if it hasn't
1334       // already been queued.
1335       compiler->AddWork(this);
1336       return DONE;
1337     }
1338     // Generate generic version of the node and bind the label for later use.
1339     macro_assembler->Bind(&label_);
1340     return CONTINUE;
1341   }
1342 
1343   // We are being asked to make a non-generic version.  Keep track of how many
1344   // non-generic versions we generate so as not to overdo it.
1345   trace_count_++;
1346   if (KeepRecursing(compiler) && compiler->optimize() &&
1347       trace_count_ < kMaxCopiesCodeGenerated) {
1348     return CONTINUE;
1349   }
1350 
1351   // If we get here code has been generated for this node too many times or
1352   // recursion is too deep.  Time to switch to a generic version.  The code for
1353   // generic versions above can handle deep recursion properly.
1354   bool was_limiting = compiler->limiting_recursion();
1355   compiler->set_limiting_recursion(true);
1356   trace->Flush(compiler, this);
1357   compiler->set_limiting_recursion(was_limiting);
1358   return DONE;
1359 }
1360 
KeepRecursing(RegExpCompiler * compiler)1361 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
1362   return !compiler->limiting_recursion() &&
1363          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
1364 }
1365 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1366 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1367                               BoyerMooreLookahead* bm, bool not_at_start) {
1368   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) {
1369     // Anything may follow a positive submatch success, thus we need to accept
1370     // all characters from this position onwards.
1371     bm->SetRest(offset);
1372   } else {
1373     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1374   }
1375   SaveBMInfo(bm, not_at_start, offset);
1376 }
1377 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1378 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1379                                       RegExpCompiler* compiler, int filled_in,
1380                                       bool not_at_start) {
1381   if (action_type_ == SET_REGISTER_FOR_LOOP) {
1382     on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
1383                                                     filled_in, not_at_start);
1384   } else {
1385     on_success()->GetQuickCheckDetails(details, compiler, filled_in,
1386                                        not_at_start);
1387   }
1388 }
1389 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1390 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1391                                  BoyerMooreLookahead* bm, bool not_at_start) {
1392   // Match the behaviour of EatsAtLeast on this node.
1393   if (assertion_type() == AT_START && not_at_start) return;
1394   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1395   SaveBMInfo(bm, not_at_start, offset);
1396 }
1397 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1398 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1399     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
1400     bool not_at_start) {
1401   RegExpNode* node = continue_node();
1402   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1403 }
1404 
1405 namespace {
1406 
1407 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1408 inline uint32_t SmearBitsRight(uint32_t v) {
1409   v |= v >> 1;
1410   v |= v >> 2;
1411   v |= v >> 4;
1412   v |= v >> 8;
1413   v |= v >> 16;
1414   return v;
1415 }
1416 
1417 }  // namespace
1418 
Rationalize(bool asc)1419 bool QuickCheckDetails::Rationalize(bool asc) {
1420   bool found_useful_op = false;
1421   const uint32_t char_mask = CharMask(asc);
1422   mask_ = 0;
1423   value_ = 0;
1424   int char_shift = 0;
1425   for (int i = 0; i < characters_; i++) {
1426     Position* pos = &positions_[i];
1427     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
1428       found_useful_op = true;
1429     }
1430     mask_ |= (pos->mask & char_mask) << char_shift;
1431     value_ |= (pos->value & char_mask) << char_shift;
1432     char_shift += asc ? 8 : 16;
1433   }
1434   return found_useful_op;
1435 }
1436 
EatsAtLeast(bool not_at_start)1437 int RegExpNode::EatsAtLeast(bool not_at_start) {
1438   return not_at_start ? eats_at_least_.eats_at_least_from_not_start
1439                       : eats_at_least_.eats_at_least_from_possibly_start;
1440 }
1441 
EatsAtLeastFromLoopEntry()1442 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
1443   // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
1444   // implies that the following node must be a LoopChoiceNode. If we need to
1445   // set registers to constant values for other reasons, we could introduce a
1446   // new action type SET_REGISTER that doesn't imply anything about its
1447   // successor.
1448   UNREACHABLE();
1449 }
1450 
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1451 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
1452                                                    RegExpCompiler* compiler,
1453                                                    int characters_filled_in,
1454                                                    bool not_at_start) {
1455   // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
1456   UNREACHABLE();
1457 }
1458 
EatsAtLeastFromLoopEntry()1459 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
1460   DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
1461 
1462   if (read_backward()) {
1463     // The eats_at_least value is not used if reading backward. The
1464     // EatsAtLeastPropagator should've zeroed it as well.
1465     DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
1466     DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
1467     return {};
1468   }
1469 
1470   // Figure out how much the loop body itself eats, not including anything in
1471   // the continuation case. In general, the nodes in the loop body should report
1472   // that they eat at least the number eaten by the continuation node, since any
1473   // successful match in the loop body must also include the continuation node.
1474   // However, in some cases involving positive lookaround, the loop body under-
1475   // reports its appetite, so use saturated math here to avoid negative numbers.
1476   uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
1477       loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
1478   uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
1479       loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
1480 
1481   // Limit the number of loop iterations to avoid overflow in subsequent steps.
1482   int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
1483 
1484   EatsAtLeastInfo result;
1485   result.eats_at_least_from_not_start =
1486       base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
1487                                     continue_node_->EatsAtLeast(true));
1488   if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
1489     // First loop iteration eats at least one, so all subsequent iterations
1490     // and the after-loop chunk are guaranteed to not be at the start.
1491     result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
1492         loop_body_from_possibly_start +
1493         (loop_iterations - 1) * loop_body_from_not_start +
1494         continue_node_->EatsAtLeast(true));
1495   } else {
1496     // Loop body might eat nothing, so only continue node contributes.
1497     result.eats_at_least_from_possibly_start =
1498         continue_node_->EatsAtLeast(false);
1499   }
1500   return result;
1501 }
1502 
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure,ChoiceNode * predecessor)1503 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1504                                 Trace* bounds_check_trace, Trace* trace,
1505                                 bool preload_has_checked_bounds,
1506                                 Label* on_possible_success,
1507                                 QuickCheckDetails* details,
1508                                 bool fall_through_on_failure,
1509                                 ChoiceNode* predecessor) {
1510   DCHECK_NOT_NULL(predecessor);
1511   if (details->characters() == 0) return false;
1512   GetQuickCheckDetails(details, compiler, 0,
1513                        trace->at_start() == Trace::FALSE_VALUE);
1514   if (details->cannot_match()) return false;
1515   if (!details->Rationalize(compiler->one_byte())) return false;
1516   DCHECK(details->characters() == 1 ||
1517          compiler->macro_assembler()->CanReadUnaligned());
1518   uint32_t mask = details->mask();
1519   uint32_t value = details->value();
1520 
1521   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1522 
1523   if (trace->characters_preloaded() != details->characters()) {
1524     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
1525     // The bounds check is performed using the minimum number of characters
1526     // any choice would eat, so if the bounds check fails, then none of the
1527     // choices can succeed, so we can just immediately backtrack, rather
1528     // than go to the next choice. The number of characters preloaded may be
1529     // less than the number used for the bounds check.
1530     int eats_at_least = predecessor->EatsAtLeast(
1531         bounds_check_trace->at_start() == Trace::FALSE_VALUE);
1532     DCHECK_GE(eats_at_least, details->characters());
1533     assembler->LoadCurrentCharacter(
1534         trace->cp_offset(), bounds_check_trace->backtrack(),
1535         !preload_has_checked_bounds, details->characters(), eats_at_least);
1536   }
1537 
1538   bool need_mask = true;
1539 
1540   if (details->characters() == 1) {
1541     // If number of characters preloaded is 1 then we used a byte or 16 bit
1542     // load so the value is already masked down.
1543     const uint32_t char_mask = CharMask(compiler->one_byte());
1544     if ((mask & char_mask) == char_mask) need_mask = false;
1545     mask &= char_mask;
1546   } else {
1547     // For 2-character preloads in one-byte mode or 1-character preloads in
1548     // two-byte mode we also use a 16 bit load with zero extend.
1549     static const uint32_t kTwoByteMask = 0xFFFF;
1550     static const uint32_t kFourByteMask = 0xFFFFFFFF;
1551     if (details->characters() == 2 && compiler->one_byte()) {
1552       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1553     } else if (details->characters() == 1 && !compiler->one_byte()) {
1554       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1555     } else {
1556       if (mask == kFourByteMask) need_mask = false;
1557     }
1558   }
1559 
1560   if (fall_through_on_failure) {
1561     if (need_mask) {
1562       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1563     } else {
1564       assembler->CheckCharacter(value, on_possible_success);
1565     }
1566   } else {
1567     if (need_mask) {
1568       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1569     } else {
1570       assembler->CheckNotCharacter(value, trace->backtrack());
1571     }
1572   }
1573   return true;
1574 }
1575 
1576 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1577 // super-class in the .h file).
1578 //
1579 // We iterate along the text object, building up for each character a
1580 // mask and value that can be used to test for a quick failure to match.
1581 // The masks and values for the positions will be combined into a single
1582 // machine word for the current character width in order to be used in
1583 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1584 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1585                                     RegExpCompiler* compiler,
1586                                     int characters_filled_in,
1587                                     bool not_at_start) {
1588   // Do not collect any quick check details if the text node reads backward,
1589   // since it reads in the opposite direction than we use for quick checks.
1590   if (read_backward()) return;
1591   Isolate* isolate = compiler->macro_assembler()->isolate();
1592   DCHECK(characters_filled_in < details->characters());
1593   int characters = details->characters();
1594   const uint32_t char_mask = CharMask(compiler->one_byte());
1595   for (int k = 0; k < elements()->length(); k++) {
1596     TextElement elm = elements()->at(k);
1597     if (elm.text_type() == TextElement::ATOM) {
1598       base::Vector<const base::uc16> quarks = elm.atom()->data();
1599       for (int i = 0; i < characters && i < quarks.length(); i++) {
1600         QuickCheckDetails::Position* pos =
1601             details->positions(characters_filled_in);
1602         base::uc16 c = quarks[i];
1603         if (IsIgnoreCase(compiler->flags())) {
1604           unibrow::uchar chars[4];
1605           int length = GetCaseIndependentLetters(
1606               isolate, c, compiler->one_byte(), chars, 4);
1607           if (length == 0) {
1608             // This can happen because all case variants are non-Latin1, but we
1609             // know the input is Latin1.
1610             details->set_cannot_match();
1611             pos->determines_perfectly = false;
1612             return;
1613           }
1614           if (length == 1) {
1615             // This letter has no case equivalents, so it's nice and simple
1616             // and the mask-compare will determine definitely whether we have
1617             // a match at this character position.
1618             pos->mask = char_mask;
1619             pos->value = chars[0];
1620             pos->determines_perfectly = true;
1621           } else {
1622             uint32_t common_bits = char_mask;
1623             uint32_t bits = chars[0];
1624             for (int j = 1; j < length; j++) {
1625               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1626               common_bits ^= differing_bits;
1627               bits &= common_bits;
1628             }
1629             // If length is 2 and common bits has only one zero in it then
1630             // our mask and compare instruction will determine definitely
1631             // whether we have a match at this character position.  Otherwise
1632             // it can only be an approximate check.
1633             uint32_t one_zero = (common_bits | ~char_mask);
1634             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1635               pos->determines_perfectly = true;
1636             }
1637             pos->mask = common_bits;
1638             pos->value = bits;
1639           }
1640         } else {
1641           // Don't ignore case.  Nice simple case where the mask-compare will
1642           // determine definitely whether we have a match at this character
1643           // position.
1644           if (c > char_mask) {
1645             details->set_cannot_match();
1646             pos->determines_perfectly = false;
1647             return;
1648           }
1649           pos->mask = char_mask;
1650           pos->value = c;
1651           pos->determines_perfectly = true;
1652         }
1653         characters_filled_in++;
1654         DCHECK(characters_filled_in <= details->characters());
1655         if (characters_filled_in == details->characters()) {
1656           return;
1657         }
1658       }
1659     } else {
1660       QuickCheckDetails::Position* pos =
1661           details->positions(characters_filled_in);
1662       RegExpCharacterClass* tree = elm.char_class();
1663       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
1664       if (tree->is_negated() || ranges->is_empty()) {
1665         // A quick check uses multi-character mask and compare.  There is no
1666         // useful way to incorporate a negative char class into this scheme
1667         // so we just conservatively create a mask and value that will always
1668         // succeed.
1669         // Likewise for empty ranges (empty ranges can occur e.g. when
1670         // compiling for one-byte subjects and impossible (non-one-byte) ranges
1671         // have been removed).
1672         pos->mask = 0;
1673         pos->value = 0;
1674       } else {
1675         int first_range = 0;
1676         while (ranges->at(first_range).from() > char_mask) {
1677           first_range++;
1678           if (first_range == ranges->length()) {
1679             details->set_cannot_match();
1680             pos->determines_perfectly = false;
1681             return;
1682           }
1683         }
1684         CharacterRange range = ranges->at(first_range);
1685         const base::uc32 first_from = range.from();
1686         const base::uc32 first_to =
1687             (range.to() > char_mask) ? char_mask : range.to();
1688         const uint32_t differing_bits = (first_from ^ first_to);
1689         // A mask and compare is only perfect if the differing bits form a
1690         // number like 00011111 with one single block of trailing 1s.
1691         if ((differing_bits & (differing_bits + 1)) == 0 &&
1692             first_from + differing_bits == first_to) {
1693           pos->determines_perfectly = true;
1694         }
1695         uint32_t common_bits = ~SmearBitsRight(differing_bits);
1696         uint32_t bits = (first_from & common_bits);
1697         for (int i = first_range + 1; i < ranges->length(); i++) {
1698           range = ranges->at(i);
1699           const base::uc32 from = range.from();
1700           if (from > char_mask) continue;
1701           const base::uc32 to =
1702               (range.to() > char_mask) ? char_mask : range.to();
1703           // Here we are combining more ranges into the mask and compare
1704           // value.  With each new range the mask becomes more sparse and
1705           // so the chances of a false positive rise.  A character class
1706           // with multiple ranges is assumed never to be equivalent to a
1707           // mask and compare operation.
1708           pos->determines_perfectly = false;
1709           uint32_t new_common_bits = (from ^ to);
1710           new_common_bits = ~SmearBitsRight(new_common_bits);
1711           common_bits &= new_common_bits;
1712           bits &= new_common_bits;
1713           uint32_t new_differing_bits = (from & common_bits) ^ bits;
1714           common_bits ^= new_differing_bits;
1715           bits &= common_bits;
1716         }
1717         pos->mask = common_bits;
1718         pos->value = bits;
1719       }
1720       characters_filled_in++;
1721       DCHECK(characters_filled_in <= details->characters());
1722       if (characters_filled_in == details->characters()) return;
1723     }
1724   }
1725   DCHECK(characters_filled_in != details->characters());
1726   if (!details->cannot_match()) {
1727     on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1728                                        true);
1729   }
1730 }
1731 
Clear()1732 void QuickCheckDetails::Clear() {
1733   for (int i = 0; i < characters_; i++) {
1734     positions_[i].mask = 0;
1735     positions_[i].value = 0;
1736     positions_[i].determines_perfectly = false;
1737   }
1738   characters_ = 0;
1739 }
1740 
Advance(int by,bool one_byte)1741 void QuickCheckDetails::Advance(int by, bool one_byte) {
1742   if (by >= characters_ || by < 0) {
1743     DCHECK_IMPLIES(by < 0, characters_ == 0);
1744     Clear();
1745     return;
1746   }
1747   DCHECK_LE(characters_ - by, 4);
1748   DCHECK_LE(characters_, 4);
1749   for (int i = 0; i < characters_ - by; i++) {
1750     positions_[i] = positions_[by + i];
1751   }
1752   for (int i = characters_ - by; i < characters_; i++) {
1753     positions_[i].mask = 0;
1754     positions_[i].value = 0;
1755     positions_[i].determines_perfectly = false;
1756   }
1757   characters_ -= by;
1758   // We could change mask_ and value_ here but we would never advance unless
1759   // they had already been used in a check and they won't be used again because
1760   // it would gain us nothing.  So there's no point.
1761 }
1762 
Merge(QuickCheckDetails * other,int from_index)1763 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1764   DCHECK(characters_ == other->characters_);
1765   if (other->cannot_match_) {
1766     return;
1767   }
1768   if (cannot_match_) {
1769     *this = *other;
1770     return;
1771   }
1772   for (int i = from_index; i < characters_; i++) {
1773     QuickCheckDetails::Position* pos = positions(i);
1774     QuickCheckDetails::Position* other_pos = other->positions(i);
1775     if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1776         !other_pos->determines_perfectly) {
1777       // Our mask-compare operation will be approximate unless we have the
1778       // exact same operation on both sides of the alternation.
1779       pos->determines_perfectly = false;
1780     }
1781     pos->mask &= other_pos->mask;
1782     pos->value &= pos->mask;
1783     other_pos->value &= pos->mask;
1784     uint32_t differing_bits = (pos->value ^ other_pos->value);
1785     pos->mask &= ~differing_bits;
1786     pos->value &= pos->mask;
1787   }
1788 }
1789 
1790 class VisitMarker {
1791  public:
VisitMarker(NodeInfo * info)1792   explicit VisitMarker(NodeInfo* info) : info_(info) {
1793     DCHECK(!info->visited);
1794     info->visited = true;
1795   }
~VisitMarker()1796   ~VisitMarker() { info_->visited = false; }
1797 
1798  private:
1799   NodeInfo* info_;
1800 };
1801 
1802 // Temporarily sets traversed_loop_initialization_node_.
1803 class LoopInitializationMarker {
1804  public:
LoopInitializationMarker(LoopChoiceNode * node)1805   explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
1806     DCHECK(!node_->traversed_loop_initialization_node_);
1807     node_->traversed_loop_initialization_node_ = true;
1808   }
~LoopInitializationMarker()1809   ~LoopInitializationMarker() {
1810     DCHECK(node_->traversed_loop_initialization_node_);
1811     node_->traversed_loop_initialization_node_ = false;
1812   }
1813   LoopInitializationMarker(const LoopInitializationMarker&) = delete;
1814   LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
1815 
1816  private:
1817   LoopChoiceNode* node_;
1818 };
1819 
1820 // Temporarily decrements min_loop_iterations_.
1821 class IterationDecrementer {
1822  public:
IterationDecrementer(LoopChoiceNode * node)1823   explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
1824     DCHECK_GT(node_->min_loop_iterations_, 0);
1825     --node_->min_loop_iterations_;
1826   }
~IterationDecrementer()1827   ~IterationDecrementer() { ++node_->min_loop_iterations_; }
1828   IterationDecrementer(const IterationDecrementer&) = delete;
1829   IterationDecrementer& operator=(const IterationDecrementer&) = delete;
1830 
1831  private:
1832   LoopChoiceNode* node_;
1833 };
1834 
FilterOneByte(int depth,RegExpFlags flags)1835 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
1836   if (info()->replacement_calculated) return replacement();
1837   if (depth < 0) return this;
1838   DCHECK(!info()->visited);
1839   VisitMarker marker(info());
1840   return FilterSuccessor(depth - 1, flags);
1841 }
1842 
FilterSuccessor(int depth,RegExpFlags flags)1843 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
1844   RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags);
1845   if (next == nullptr) return set_replacement(nullptr);
1846   on_success_ = next;
1847   return set_replacement(this);
1848 }
1849 
1850 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)1851 bool RangeContainsLatin1Equivalents(CharacterRange range) {
1852   // TODO(dcarney): this could be a lot more efficient.
1853   return range.Contains(0x039C) || range.Contains(0x03BC) ||
1854          range.Contains(0x0178);
1855 }
1856 
1857 namespace {
1858 
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)1859 bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
1860   for (int i = 0; i < ranges->length(); i++) {
1861     // TODO(dcarney): this could be a lot more efficient.
1862     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
1863   }
1864   return false;
1865 }
1866 
1867 }  // namespace
1868 
FilterOneByte(int depth,RegExpFlags flags)1869 RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
1870   if (info()->replacement_calculated) return replacement();
1871   if (depth < 0) return this;
1872   DCHECK(!info()->visited);
1873   VisitMarker marker(info());
1874   int element_count = elements()->length();
1875   for (int i = 0; i < element_count; i++) {
1876     TextElement elm = elements()->at(i);
1877     if (elm.text_type() == TextElement::ATOM) {
1878       base::Vector<const base::uc16> quarks = elm.atom()->data();
1879       for (int j = 0; j < quarks.length(); j++) {
1880         base::uc16 c = quarks[j];
1881         if (IsIgnoreCase(flags)) {
1882           c = unibrow::Latin1::TryConvertToLatin1(c);
1883         }
1884         if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
1885         // Replace quark in case we converted to Latin-1.
1886         base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
1887         writable_quarks[j] = c;
1888       }
1889     } else {
1890       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
1891       RegExpCharacterClass* cc = elm.char_class();
1892       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1893       CharacterRange::Canonicalize(ranges);
1894       // Now they are in order so we only need to look at the first.
1895       int range_count = ranges->length();
1896       if (cc->is_negated()) {
1897         if (range_count != 0 && ranges->at(0).from() == 0 &&
1898             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
1899           // This will be handled in a later filter.
1900           if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
1901             continue;
1902           }
1903           return set_replacement(nullptr);
1904         }
1905       } else {
1906         if (range_count == 0 ||
1907             ranges->at(0).from() > String::kMaxOneByteCharCode) {
1908           // This will be handled in a later filter.
1909           if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
1910             continue;
1911           }
1912           return set_replacement(nullptr);
1913         }
1914       }
1915     }
1916   }
1917   return FilterSuccessor(depth - 1, flags);
1918 }
1919 
FilterOneByte(int depth,RegExpFlags flags)1920 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
1921   if (info()->replacement_calculated) return replacement();
1922   if (depth < 0) return this;
1923   if (info()->visited) return this;
1924   {
1925     VisitMarker marker(info());
1926 
1927     RegExpNode* continue_replacement =
1928         continue_node_->FilterOneByte(depth - 1, flags);
1929     // If we can't continue after the loop then there is no sense in doing the
1930     // loop.
1931     if (continue_replacement == nullptr) return set_replacement(nullptr);
1932   }
1933 
1934   return ChoiceNode::FilterOneByte(depth - 1, flags);
1935 }
1936 
FilterOneByte(int depth,RegExpFlags flags)1937 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
1938   if (info()->replacement_calculated) return replacement();
1939   if (depth < 0) return this;
1940   if (info()->visited) return this;
1941   VisitMarker marker(info());
1942   int choice_count = alternatives_->length();
1943 
1944   for (int i = 0; i < choice_count; i++) {
1945     GuardedAlternative alternative = alternatives_->at(i);
1946     if (alternative.guards() != nullptr &&
1947         alternative.guards()->length() != 0) {
1948       set_replacement(this);
1949       return this;
1950     }
1951   }
1952 
1953   int surviving = 0;
1954   RegExpNode* survivor = nullptr;
1955   for (int i = 0; i < choice_count; i++) {
1956     GuardedAlternative alternative = alternatives_->at(i);
1957     RegExpNode* replacement =
1958         alternative.node()->FilterOneByte(depth - 1, flags);
1959     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
1960     if (replacement != nullptr) {
1961       alternatives_->at(i).set_node(replacement);
1962       surviving++;
1963       survivor = replacement;
1964     }
1965   }
1966   if (surviving < 2) return set_replacement(survivor);
1967 
1968   set_replacement(this);
1969   if (surviving == choice_count) {
1970     return this;
1971   }
1972   // Only some of the nodes survived the filtering.  We need to rebuild the
1973   // alternatives list.
1974   ZoneList<GuardedAlternative>* new_alternatives =
1975       zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
1976   for (int i = 0; i < choice_count; i++) {
1977     RegExpNode* replacement =
1978         alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
1979     if (replacement != nullptr) {
1980       alternatives_->at(i).set_node(replacement);
1981       new_alternatives->Add(alternatives_->at(i), zone());
1982     }
1983   }
1984   alternatives_ = new_alternatives;
1985   return this;
1986 }
1987 
FilterOneByte(int depth,RegExpFlags flags)1988 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
1989                                                         RegExpFlags flags) {
1990   if (info()->replacement_calculated) return replacement();
1991   if (depth < 0) return this;
1992   if (info()->visited) return this;
1993   VisitMarker marker(info());
1994   // Alternative 0 is the negative lookahead, alternative 1 is what comes
1995   // afterwards.
1996   RegExpNode* node = continue_node();
1997   RegExpNode* replacement = node->FilterOneByte(depth - 1, flags);
1998   if (replacement == nullptr) return set_replacement(nullptr);
1999   alternatives_->at(kContinueIndex).set_node(replacement);
2000 
2001   RegExpNode* neg_node = lookaround_node();
2002   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags);
2003   // If the negative lookahead is always going to fail then
2004   // we don't need to check it.
2005   if (neg_replacement == nullptr) return set_replacement(replacement);
2006   alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
2007   return set_replacement(this);
2008 }
2009 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2010 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2011                                           RegExpCompiler* compiler,
2012                                           int characters_filled_in,
2013                                           bool not_at_start) {
2014   if (body_can_be_zero_length_ || info()->visited) return;
2015   not_at_start = not_at_start || this->not_at_start();
2016   DCHECK_EQ(alternatives_->length(), 2);  // There's just loop and continue.
2017   if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
2018       loop_node_->EatsAtLeast(not_at_start) >
2019           continue_node_->EatsAtLeast(true)) {
2020     // Loop body is guaranteed to execute at least once, and consume characters
2021     // when it does, meaning the only possible quick checks from this point
2022     // begin with the loop body. We may recursively visit this LoopChoiceNode,
2023     // but we temporarily decrease its minimum iteration counter so we know when
2024     // to check the continue case.
2025     IterationDecrementer next_iteration(this);
2026     loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
2027                                      not_at_start);
2028   } else {
2029     // Might not consume anything in the loop body, so treat it like a normal
2030     // ChoiceNode (and don't recursively visit this node again).
2031     VisitMarker marker(info());
2032     ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
2033                                      not_at_start);
2034   }
2035 }
2036 
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2037 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
2038     QuickCheckDetails* details, RegExpCompiler* compiler,
2039     int characters_filled_in, bool not_at_start) {
2040   if (traversed_loop_initialization_node_) {
2041     // We already entered this loop once, exited via its continuation node, and
2042     // followed an outer loop's back-edge to before the loop entry point. We
2043     // could try to reset the minimum iteration count to its starting value at
2044     // this point, but that seems like more trouble than it's worth. It's safe
2045     // to keep going with the current (possibly reduced) minimum iteration
2046     // count.
2047     GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2048   } else {
2049     // We are entering a loop via its counter initialization action, meaning we
2050     // are guaranteed to run the loop body at least some minimum number of times
2051     // before running the continuation node. Set a flag so that this node knows
2052     // (now and any times we visit it again recursively) that it was entered
2053     // from the top.
2054     LoopInitializationMarker marker(this);
2055     GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2056   }
2057 }
2058 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2059 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2060                                   BoyerMooreLookahead* bm, bool not_at_start) {
2061   if (body_can_be_zero_length_ || budget <= 0) {
2062     bm->SetRest(offset);
2063     SaveBMInfo(bm, not_at_start, offset);
2064     return;
2065   }
2066   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2067   SaveBMInfo(bm, not_at_start, offset);
2068 }
2069 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2070 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2071                                       RegExpCompiler* compiler,
2072                                       int characters_filled_in,
2073                                       bool not_at_start) {
2074   not_at_start = (not_at_start || not_at_start_);
2075   int choice_count = alternatives_->length();
2076   DCHECK_LT(0, choice_count);
2077   alternatives_->at(0).node()->GetQuickCheckDetails(
2078       details, compiler, characters_filled_in, not_at_start);
2079   for (int i = 1; i < choice_count; i++) {
2080     QuickCheckDetails new_details(details->characters());
2081     RegExpNode* node = alternatives_->at(i).node();
2082     node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2083                                not_at_start);
2084     // Here we merge the quick match details of the two branches.
2085     details->Merge(&new_details, characters_filled_in);
2086   }
2087 }
2088 
2089 namespace {
2090 
2091 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2092 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
2093                    Label* non_word, bool fall_through_on_word) {
2094   if (assembler->CheckSpecialCharacterClass(
2095           fall_through_on_word ? StandardCharacterSet::kWord
2096                                : StandardCharacterSet::kNotWord,
2097           fall_through_on_word ? non_word : word)) {
2098     // Optimized implementation available.
2099     return;
2100   }
2101   assembler->CheckCharacterGT('z', non_word);
2102   assembler->CheckCharacterLT('0', non_word);
2103   assembler->CheckCharacterGT('a' - 1, word);
2104   assembler->CheckCharacterLT('9' + 1, word);
2105   assembler->CheckCharacterLT('A', non_word);
2106   assembler->CheckCharacterLT('Z' + 1, word);
2107   if (fall_through_on_word) {
2108     assembler->CheckNotCharacter('_', non_word);
2109   } else {
2110     assembler->CheckCharacter('_', word);
2111   }
2112 }
2113 
2114 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2115 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2116 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
2117   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2118 
2119   // We will load the previous character into the current character register.
2120   Trace new_trace(*trace);
2121   new_trace.InvalidateCurrentCharacter();
2122 
2123   // A positive (> 0) cp_offset means we've already successfully matched a
2124   // non-empty-width part of the pattern, and thus cannot be at or before the
2125   // start of the subject string. We can thus skip both at-start and
2126   // bounds-checks when loading the one-character lookbehind.
2127   const bool may_be_at_or_before_subject_string_start =
2128       new_trace.cp_offset() <= 0;
2129 
2130   Label ok;
2131   if (may_be_at_or_before_subject_string_start) {
2132     // The start of input counts as a newline in this context, so skip to ok if
2133     // we are at the start.
2134     assembler->CheckAtStart(new_trace.cp_offset(), &ok);
2135   }
2136 
2137   // If we've already checked that we are not at the start of input, it's okay
2138   // to load the previous character without bounds checks.
2139   const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2140   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2141                                   new_trace.backtrack(), can_skip_bounds_check);
2142   if (!assembler->CheckSpecialCharacterClass(
2143           StandardCharacterSet::kLineTerminator, new_trace.backtrack())) {
2144     // Newline means \n, \r, 0x2028 or 0x2029.
2145     if (!compiler->one_byte()) {
2146       assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2147     }
2148     assembler->CheckCharacter('\n', &ok);
2149     assembler->CheckNotCharacter('\r', new_trace.backtrack());
2150   }
2151   assembler->Bind(&ok);
2152   on_success->Emit(compiler, &new_trace);
2153 }
2154 
2155 }  // namespace
2156 
2157 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2158 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2159   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2160   Isolate* isolate = assembler->isolate();
2161   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2162   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2163   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2164   if (lookahead == nullptr) {
2165     int eats_at_least =
2166         std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
2167     if (eats_at_least >= 1) {
2168       BoyerMooreLookahead* bm =
2169           zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
2170       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2171       if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2172       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2173     }
2174   } else {
2175     if (lookahead->at(0)->is_non_word())
2176       next_is_word_character = Trace::FALSE_VALUE;
2177     if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2178   }
2179   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2180   if (next_is_word_character == Trace::UNKNOWN) {
2181     Label before_non_word;
2182     Label before_word;
2183     if (trace->characters_preloaded() != 1) {
2184       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2185     }
2186     // Fall through on non-word.
2187     EmitWordCheck(assembler, &before_word, &before_non_word, false);
2188     // Next character is not a word character.
2189     assembler->Bind(&before_non_word);
2190     Label ok;
2191     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2192     assembler->GoTo(&ok);
2193 
2194     assembler->Bind(&before_word);
2195     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2196     assembler->Bind(&ok);
2197   } else if (next_is_word_character == Trace::TRUE_VALUE) {
2198     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2199   } else {
2200     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
2201     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2202   }
2203 }
2204 
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)2205 void AssertionNode::BacktrackIfPrevious(
2206     RegExpCompiler* compiler, Trace* trace,
2207     AssertionNode::IfPrevious backtrack_if_previous) {
2208   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2209   Trace new_trace(*trace);
2210   new_trace.InvalidateCurrentCharacter();
2211 
2212   Label fall_through;
2213   Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
2214                                                         : &fall_through;
2215   Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
2216                                                     : new_trace.backtrack();
2217 
2218   // A positive (> 0) cp_offset means we've already successfully matched a
2219   // non-empty-width part of the pattern, and thus cannot be at or before the
2220   // start of the subject string. We can thus skip both at-start and
2221   // bounds-checks when loading the one-character lookbehind.
2222   const bool may_be_at_or_before_subject_string_start =
2223       new_trace.cp_offset() <= 0;
2224 
2225   if (may_be_at_or_before_subject_string_start) {
2226     // The start of input counts as a non-word character, so the question is
2227     // decided if we are at the start.
2228     assembler->CheckAtStart(new_trace.cp_offset(), non_word);
2229   }
2230 
2231   // If we've already checked that we are not at the start of input, it's okay
2232   // to load the previous character without bounds checks.
2233   const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2234   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
2235                                   can_skip_bounds_check);
2236   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2237 
2238   assembler->Bind(&fall_through);
2239   on_success()->Emit(compiler, &new_trace);
2240 }
2241 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2242 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2243                                          RegExpCompiler* compiler,
2244                                          int filled_in, bool not_at_start) {
2245   if (assertion_type_ == AT_START && not_at_start) {
2246     details->set_cannot_match();
2247     return;
2248   }
2249   return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2250                                             not_at_start);
2251 }
2252 
Emit(RegExpCompiler * compiler,Trace * trace)2253 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2254   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2255   switch (assertion_type_) {
2256     case AT_END: {
2257       Label ok;
2258       assembler->CheckPosition(trace->cp_offset(), &ok);
2259       assembler->GoTo(trace->backtrack());
2260       assembler->Bind(&ok);
2261       break;
2262     }
2263     case AT_START: {
2264       if (trace->at_start() == Trace::FALSE_VALUE) {
2265         assembler->GoTo(trace->backtrack());
2266         return;
2267       }
2268       if (trace->at_start() == Trace::UNKNOWN) {
2269         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2270         Trace at_start_trace = *trace;
2271         at_start_trace.set_at_start(Trace::TRUE_VALUE);
2272         on_success()->Emit(compiler, &at_start_trace);
2273         return;
2274       }
2275     } break;
2276     case AFTER_NEWLINE:
2277       EmitHat(compiler, on_success(), trace);
2278       return;
2279     case AT_BOUNDARY:
2280     case AT_NON_BOUNDARY: {
2281       EmitBoundaryCheck(compiler, trace);
2282       return;
2283     }
2284   }
2285   on_success()->Emit(compiler, trace);
2286 }
2287 
2288 namespace {
2289 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2290 bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2291   if (quick_check == nullptr) return false;
2292   if (offset >= quick_check->characters()) return false;
2293   return quick_check->positions(offset)->determines_perfectly;
2294 }
2295 
UpdateBoundsCheck(int index,int * checked_up_to)2296 void UpdateBoundsCheck(int index, int* checked_up_to) {
2297   if (index > *checked_up_to) {
2298     *checked_up_to = index;
2299   }
2300 }
2301 
2302 }  // namespace
2303 
2304 // We call this repeatedly to generate code for each pass over the text node.
2305 // The passes are in increasing order of difficulty because we hope one
2306 // of the first passes will fail in which case we are saved the work of the
2307 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
2308 // we will check the '%' in the first pass, the case independent 'a' in the
2309 // second pass and the character class in the last pass.
2310 //
2311 // The passes are done from right to left, so for example to test for /bar/
2312 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2313 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
2314 // bounds check most of the time.  In the example we only need to check for
2315 // end-of-input when loading the putative 'r'.
2316 //
2317 // A slight complication involves the fact that the first character may already
2318 // be fetched into a register by the previous node.  In this case we want to
2319 // do the test for that character first.  We do this in separate passes.  The
2320 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
2321 // pass has been performed then subsequent passes will have true in
2322 // first_element_checked to indicate that that character does not need to be
2323 // checked again.
2324 //
2325 // In addition to all this we are passed a Trace, which can
2326 // contain an AlternativeGeneration object.  In this AlternativeGeneration
2327 // object we can see details of any quick check that was already passed in
2328 // order to get to the code we are now generating.  The quick check can involve
2329 // loading characters, which means we do not need to recheck the bounds
2330 // up to the limit the quick check already checked.  In addition the quick
2331 // check can have involved a mask and compare operation which may simplify
2332 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2333 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
2334                             bool preloaded, Trace* trace,
2335                             bool first_element_checked, int* checked_up_to) {
2336   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2337   Isolate* isolate = assembler->isolate();
2338   bool one_byte = compiler->one_byte();
2339   Label* backtrack = trace->backtrack();
2340   QuickCheckDetails* quick_check = trace->quick_check_performed();
2341   int element_count = elements()->length();
2342   int backward_offset = read_backward() ? -Length() : 0;
2343   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2344     TextElement elm = elements()->at(i);
2345     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2346     if (elm.text_type() == TextElement::ATOM) {
2347       if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
2348       base::Vector<const base::uc16> quarks = elm.atom()->data();
2349       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2350         if (first_element_checked && i == 0 && j == 0) continue;
2351         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2352         base::uc16 quark = quarks[j];
2353         if (IsIgnoreCase(compiler->flags())) {
2354           // Everywhere else we assume that a non-Latin-1 character cannot match
2355           // a Latin-1 character. Avoid the cases where this is assumption is
2356           // invalid by using the Latin1 equivalent instead.
2357           quark = unibrow::Latin1::TryConvertToLatin1(quark);
2358         }
2359         bool needs_bounds_check =
2360             *checked_up_to < cp_offset + j || read_backward();
2361         bool bounds_checked = false;
2362         switch (pass) {
2363           case NON_LATIN1_MATCH:
2364             DCHECK(one_byte);
2365             if (quark > String::kMaxOneByteCharCode) {
2366               assembler->GoTo(backtrack);
2367               return;
2368             }
2369             break;
2370           case NON_LETTER_CHARACTER_MATCH:
2371             bounds_checked =
2372                 EmitAtomNonLetter(isolate, compiler, quark, backtrack,
2373                                   cp_offset + j, needs_bounds_check, preloaded);
2374             break;
2375           case SIMPLE_CHARACTER_MATCH:
2376             bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
2377                                                  backtrack, cp_offset + j,
2378                                                  needs_bounds_check, preloaded);
2379             break;
2380           case CASE_CHARACTER_MATCH:
2381             bounds_checked =
2382                 EmitAtomLetter(isolate, compiler, quark, backtrack,
2383                                cp_offset + j, needs_bounds_check, preloaded);
2384             break;
2385           default:
2386             break;
2387         }
2388         if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2389       }
2390     } else {
2391       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
2392       if (pass == CHARACTER_CLASS_MATCH) {
2393         if (first_element_checked && i == 0) continue;
2394         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2395         RegExpCharacterClass* cc = elm.char_class();
2396         bool bounds_check = *checked_up_to < cp_offset || read_backward();
2397         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2398                       bounds_check, preloaded, zone());
2399         UpdateBoundsCheck(cp_offset, checked_up_to);
2400       }
2401     }
2402   }
2403 }
2404 
Length()2405 int TextNode::Length() {
2406   TextElement elm = elements()->last();
2407   DCHECK_LE(0, elm.cp_offset());
2408   return elm.cp_offset() + elm.length();
2409 }
2410 
SkipPass(TextEmitPassType pass,bool ignore_case)2411 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
2412   if (ignore_case) {
2413     return pass == SIMPLE_CHARACTER_MATCH;
2414   } else {
2415     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2416   }
2417 }
2418 
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success)2419 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
2420                                              ZoneList<CharacterRange>* ranges,
2421                                              bool read_backward,
2422                                              RegExpNode* on_success) {
2423   DCHECK_NOT_NULL(ranges);
2424   // TODO(jgruber): There's no fundamental need to create this
2425   // RegExpCharacterClass; we could refactor to avoid the allocation.
2426   return zone->New<TextNode>(zone->New<RegExpCharacterClass>(zone, ranges),
2427                              read_backward, on_success);
2428 }
2429 
CreateForSurrogatePair(Zone * zone,CharacterRange lead,ZoneList<CharacterRange> * trail_ranges,bool read_backward,RegExpNode * on_success)2430 TextNode* TextNode::CreateForSurrogatePair(
2431     Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
2432     bool read_backward, RegExpNode* on_success) {
2433   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
2434   ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2435   elms->Add(TextElement::CharClass(
2436                 zone->New<RegExpCharacterClass>(zone, lead_ranges)),
2437             zone);
2438   elms->Add(TextElement::CharClass(
2439                 zone->New<RegExpCharacterClass>(zone, trail_ranges)),
2440             zone);
2441   return zone->New<TextNode>(elms, read_backward, on_success);
2442 }
2443 
CreateForSurrogatePair(Zone * zone,ZoneList<CharacterRange> * lead_ranges,CharacterRange trail,bool read_backward,RegExpNode * on_success)2444 TextNode* TextNode::CreateForSurrogatePair(
2445     Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
2446     bool read_backward, RegExpNode* on_success) {
2447   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
2448   ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2449   elms->Add(TextElement::CharClass(
2450                 zone->New<RegExpCharacterClass>(zone, lead_ranges)),
2451             zone);
2452   elms->Add(TextElement::CharClass(
2453                 zone->New<RegExpCharacterClass>(zone, trail_ranges)),
2454             zone);
2455   return zone->New<TextNode>(elms, read_backward, on_success);
2456 }
2457 
2458 // This generates the code to match a text node.  A text node can contain
2459 // straight character sequences (possibly to be matched in a case-independent
2460 // way) and character classes.  For efficiency we do not do this in a single
2461 // pass from left to right.  Instead we pass over the text node several times,
2462 // emitting code for some character positions every time.  See the comment on
2463 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2464 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2465   LimitResult limit_result = LimitVersions(compiler, trace);
2466   if (limit_result == DONE) return;
2467   DCHECK(limit_result == CONTINUE);
2468 
2469   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2470     compiler->SetRegExpTooBig();
2471     return;
2472   }
2473 
2474   if (compiler->one_byte()) {
2475     int dummy = 0;
2476     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2477   }
2478 
2479   bool first_elt_done = false;
2480   int bound_checked_to = trace->cp_offset() - 1;
2481   bound_checked_to += trace->bound_checked_up_to();
2482 
2483   // If a character is preloaded into the current character register then
2484   // check that now.
2485   if (trace->characters_preloaded() == 1) {
2486     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2487       TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2488                    false, &bound_checked_to);
2489     }
2490     first_elt_done = true;
2491   }
2492 
2493   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2494     TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2495                  first_elt_done, &bound_checked_to);
2496   }
2497 
2498   Trace successor_trace(*trace);
2499   // If we advance backward, we may end up at the start.
2500   successor_trace.AdvanceCurrentPositionInTrace(
2501       read_backward() ? -Length() : Length(), compiler);
2502   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2503                                                : Trace::FALSE_VALUE);
2504   RecursionCheck rc(compiler);
2505   on_success()->Emit(compiler, &successor_trace);
2506 }
2507 
InvalidateCurrentCharacter()2508 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
2509 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2510 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2511   // We don't have an instruction for shifting the current character register
2512   // down or for using a shifted value for anything so lets just forget that
2513   // we preloaded any characters into it.
2514   characters_preloaded_ = 0;
2515   // Adjust the offsets of the quick check performed information.  This
2516   // information is used to find out what we already determined about the
2517   // characters by means of mask and compare.
2518   quick_check_performed_.Advance(by, compiler->one_byte());
2519   cp_offset_ += by;
2520   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2521     compiler->SetRegExpTooBig();
2522     cp_offset_ = 0;
2523   }
2524   bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
2525 }
2526 
MakeCaseIndependent(Isolate * isolate,bool is_one_byte,RegExpFlags flags)2527 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
2528                                    RegExpFlags flags) {
2529   if (!IsIgnoreCase(flags)) return;
2530 #ifdef V8_INTL_SUPPORT
2531   if (NeedsUnicodeCaseEquivalents(flags)) return;
2532 #endif
2533 
2534   int element_count = elements()->length();
2535   for (int i = 0; i < element_count; i++) {
2536     TextElement elm = elements()->at(i);
2537     if (elm.text_type() == TextElement::CHAR_CLASS) {
2538       RegExpCharacterClass* cc = elm.char_class();
2539       // None of the standard character classes is different in the case
2540       // independent case and it slows us down if we don't know that.
2541       if (cc->is_standard(zone())) continue;
2542       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2543       CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
2544     }
2545   }
2546 }
2547 
GreedyLoopTextLength()2548 int TextNode::GreedyLoopTextLength() { return Length(); }
2549 
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)2550 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2551     RegExpCompiler* compiler) {
2552   if (read_backward()) return nullptr;
2553   if (elements()->length() != 1) return nullptr;
2554   TextElement elm = elements()->at(0);
2555   if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2556   RegExpCharacterClass* node = elm.char_class();
2557   ZoneList<CharacterRange>* ranges = node->ranges(zone());
2558   CharacterRange::Canonicalize(ranges);
2559   if (node->is_negated()) {
2560     return ranges->length() == 0 ? on_success() : nullptr;
2561   }
2562   if (ranges->length() != 1) return nullptr;
2563   const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
2564   return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
2565 }
2566 
2567 // Finds the fixed match length of a sequence of nodes that goes from
2568 // this alternative and back to this choice node.  If there are variable
2569 // length nodes or other complications in the way then return a sentinel
2570 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)2571 int ChoiceNode::GreedyLoopTextLengthForAlternative(
2572     GuardedAlternative* alternative) {
2573   int length = 0;
2574   RegExpNode* node = alternative->node();
2575   // Later we will generate code for all these text nodes using recursion
2576   // so we have to limit the max number.
2577   int recursion_depth = 0;
2578   while (node != this) {
2579     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2580       return kNodeIsTooComplexForGreedyLoops;
2581     }
2582     int node_length = node->GreedyLoopTextLength();
2583     if (node_length == kNodeIsTooComplexForGreedyLoops) {
2584       return kNodeIsTooComplexForGreedyLoops;
2585     }
2586     length += node_length;
2587     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2588     node = seq_node->on_success();
2589   }
2590   if (read_backward()) {
2591     length = -length;
2592   }
2593   // Check that we can jump by the whole text length. If not, return sentinel
2594   // to indicate the we can't construct a greedy loop.
2595   if (length < RegExpMacroAssembler::kMinCPOffset ||
2596       length > RegExpMacroAssembler::kMaxCPOffset) {
2597     return kNodeIsTooComplexForGreedyLoops;
2598   }
2599   return length;
2600 }
2601 
AddLoopAlternative(GuardedAlternative alt)2602 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2603   DCHECK_NULL(loop_node_);
2604   AddAlternative(alt);
2605   loop_node_ = alt.node();
2606 }
2607 
AddContinueAlternative(GuardedAlternative alt)2608 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2609   DCHECK_NULL(continue_node_);
2610   AddAlternative(alt);
2611   continue_node_ = alt.node();
2612 }
2613 
Emit(RegExpCompiler * compiler,Trace * trace)2614 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2615   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2616   if (trace->stop_node() == this) {
2617     // Back edge of greedy optimized loop node graph.
2618     int text_length =
2619         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2620     DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
2621     // Update the counter-based backtracking info on the stack.  This is an
2622     // optimization for greedy loops (see below).
2623     DCHECK(trace->cp_offset() == text_length);
2624     macro_assembler->AdvanceCurrentPosition(text_length);
2625     macro_assembler->GoTo(trace->loop_label());
2626     return;
2627   }
2628   DCHECK_NULL(trace->stop_node());
2629   if (!trace->is_trivial()) {
2630     trace->Flush(compiler, this);
2631     return;
2632   }
2633   ChoiceNode::Emit(compiler, trace);
2634 }
2635 
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)2636 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2637                                            int eats_at_least) {
2638   int preload_characters = std::min(4, eats_at_least);
2639   DCHECK_LE(preload_characters, 4);
2640   if (compiler->macro_assembler()->CanReadUnaligned()) {
2641     bool one_byte = compiler->one_byte();
2642     if (one_byte) {
2643       // We can't preload 3 characters because there is no machine instruction
2644       // to do that.  We can't just load 4 because we could be reading
2645       // beyond the end of the string, which could cause a memory fault.
2646       if (preload_characters == 3) preload_characters = 2;
2647     } else {
2648       if (preload_characters > 2) preload_characters = 2;
2649     }
2650   } else {
2651     if (preload_characters > 1) preload_characters = 1;
2652   }
2653   return preload_characters;
2654 }
2655 
2656 // This class is used when generating the alternatives in a choice node.  It
2657 // records the way the alternative is being code generated.
2658 class AlternativeGeneration : public Malloced {
2659  public:
AlternativeGeneration()2660   AlternativeGeneration()
2661       : possible_success(),
2662         expects_preload(false),
2663         after(),
2664         quick_check_details() {}
2665   Label possible_success;
2666   bool expects_preload;
2667   Label after;
2668   QuickCheckDetails quick_check_details;
2669 };
2670 
2671 // Creates a list of AlternativeGenerations.  If the list has a reasonable
2672 // size then it is on the stack, otherwise the excess is on the heap.
2673 class AlternativeGenerationList {
2674  public:
AlternativeGenerationList(int count,Zone * zone)2675   AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
2676     for (int i = 0; i < count && i < kAFew; i++) {
2677       alt_gens_.Add(a_few_alt_gens_ + i, zone);
2678     }
2679     for (int i = kAFew; i < count; i++) {
2680       alt_gens_.Add(new AlternativeGeneration(), zone);
2681     }
2682   }
~AlternativeGenerationList()2683   ~AlternativeGenerationList() {
2684     for (int i = kAFew; i < alt_gens_.length(); i++) {
2685       delete alt_gens_[i];
2686       alt_gens_[i] = nullptr;
2687     }
2688   }
2689 
at(int i)2690   AlternativeGeneration* at(int i) { return alt_gens_[i]; }
2691 
2692  private:
2693   static const int kAFew = 10;
2694   ZoneList<AlternativeGeneration*> alt_gens_;
2695   AlternativeGeneration a_few_alt_gens_[kAFew];
2696 };
2697 
Set(int character)2698 void BoyerMoorePositionInfo::Set(int character) {
2699   SetInterval(Interval(character, character));
2700 }
2701 
2702 namespace {
2703 
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)2704 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
2705                             int ranges_length, Interval new_range) {
2706   DCHECK_EQ(1, ranges_length & 1);
2707   DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
2708   if (containment == kLatticeUnknown) return containment;
2709   bool inside = false;
2710   int last = 0;
2711   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2712     // Consider the range from last to ranges[i].
2713     // We haven't got to the new range yet.
2714     if (ranges[i] <= new_range.from()) continue;
2715     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
2716     // inclusive, but the values in ranges are not.
2717     if (last <= new_range.from() && new_range.to() < ranges[i]) {
2718       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2719     }
2720     return kLatticeUnknown;
2721   }
2722   return containment;
2723 }
2724 
BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset)2725 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
2726   STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
2727                 2 * kInt64Size * kBitsPerByte);
2728 
2729   // Slight fiddling is needed here, since the bitset is of length 128 while
2730   // CountTrailingZeros requires an integral type and std::bitset can only
2731   // convert to unsigned long long. So we handle the most- and least-significant
2732   // bits separately.
2733 
2734   {
2735     static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
2736     BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
2737     STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
2738     uint64_t lsb = masked_bitset.to_ullong();
2739     if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
2740   }
2741 
2742   {
2743     BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
2744     uint64_t msb = masked_bitset.to_ullong();
2745     if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
2746   }
2747 
2748   return -1;
2749 }
2750 
2751 }  // namespace
2752 
SetInterval(const Interval & interval)2753 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2754   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2755 
2756   if (interval.size() >= kMapSize) {
2757     map_count_ = kMapSize;
2758     map_.set();
2759     return;
2760   }
2761 
2762   for (int i = interval.from(); i <= interval.to(); i++) {
2763     int mod_character = (i & kMask);
2764     if (!map_[mod_character]) {
2765       map_count_++;
2766       map_.set(mod_character);
2767     }
2768     if (map_count_ == kMapSize) return;
2769   }
2770 }
2771 
SetAll()2772 void BoyerMoorePositionInfo::SetAll() {
2773   w_ = kLatticeUnknown;
2774   if (map_count_ != kMapSize) {
2775     map_count_ = kMapSize;
2776     map_.set();
2777   }
2778 }
2779 
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)2780 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
2781                                          Zone* zone)
2782     : length_(length),
2783       compiler_(compiler),
2784       max_char_(MaxCodeUnit(compiler->one_byte())) {
2785   bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
2786   for (int i = 0; i < length; i++) {
2787     bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
2788   }
2789 }
2790 
2791 // Find the longest range of lookahead that has the fewest number of different
2792 // characters that can occur at a given position.  Since we are optimizing two
2793 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)2794 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2795   int biggest_points = 0;
2796   // If more than 32 characters out of 128 can occur it is unlikely that we can
2797   // be lucky enough to step forwards much of the time.
2798   const int kMaxMax = 32;
2799   for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2800        max_number_of_chars *= 2) {
2801     biggest_points =
2802         FindBestInterval(max_number_of_chars, biggest_points, from, to);
2803   }
2804   if (biggest_points == 0) return false;
2805   return true;
2806 }
2807 
2808 // Find the highest-points range between 0 and length_ where the character
2809 // information is not too vague.  'Too vague' means that there are more than
2810 // max_number_of_chars that can occur at this position.  Calculates the number
2811 // of points as the product of width-of-the-range and
2812 // probability-of-finding-one-of-the-characters, where the probability is
2813 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)2814 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
2815                                           int old_biggest_points, int* from,
2816                                           int* to) {
2817   int biggest_points = old_biggest_points;
2818   static const int kSize = RegExpMacroAssembler::kTableSize;
2819   for (int i = 0; i < length_;) {
2820     while (i < length_ && Count(i) > max_number_of_chars) i++;
2821     if (i == length_) break;
2822     int remembered_from = i;
2823 
2824     BoyerMoorePositionInfo::Bitset union_bitset;
2825     for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
2826       union_bitset |= bitmaps_->at(i)->raw_bitset();
2827     }
2828 
2829     int frequency = 0;
2830 
2831     // Iterate only over set bits.
2832     int j;
2833     while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
2834       DCHECK(union_bitset[j]);  // Sanity check.
2835       // Add 1 to the frequency to give a small per-character boost for
2836       // the cases where our sampling is not good enough and many
2837       // characters have a frequency of zero.  This means the frequency
2838       // can theoretically be up to 2*kSize though we treat it mostly as
2839       // a fraction of kSize.
2840       frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2841       union_bitset.reset(j);
2842     }
2843 
2844     // We use the probability of skipping times the distance we are skipping to
2845     // judge the effectiveness of this.  Actually we have a cut-off:  By
2846     // dividing by 2 we switch off the skipping if the probability of skipping
2847     // is less than 50%.  This is because the multibyte mask-and-compare
2848     // skipping in quickcheck is more likely to do well on this case.
2849     bool in_quickcheck_range =
2850         ((i - remembered_from < 4) ||
2851          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2852     // Called 'probability' but it is only a rough estimate and can actually
2853     // be outside the 0-kSize range.
2854     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2855     int points = (i - remembered_from) * probability;
2856     if (points > biggest_points) {
2857       *from = remembered_from;
2858       *to = i - 1;
2859       biggest_points = points;
2860     }
2861   }
2862   return biggest_points;
2863 }
2864 
2865 // Take all the characters that will not prevent a successful match if they
2866 // occur in the subject string in the range between min_lookahead and
2867 // max_lookahead (inclusive) measured from the current position.  If the
2868 // character at max_lookahead offset is not one of these characters, then we
2869 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)2870 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
2871                                       Handle<ByteArray> boolean_skip_table) {
2872   const int kSkipArrayEntry = 0;
2873   const int kDontSkipArrayEntry = 1;
2874 
2875   std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
2876               boolean_skip_table->length());
2877 
2878   for (int i = max_lookahead; i >= min_lookahead; i--) {
2879     BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
2880 
2881     // Iterate only over set bits.
2882     int j;
2883     while ((j = BitsetFirstSetBit(bitset)) != -1) {
2884       DCHECK(bitset[j]);  // Sanity check.
2885       boolean_skip_table->set(j, kDontSkipArrayEntry);
2886       bitset.reset(j);
2887     }
2888   }
2889 
2890   const int skip = max_lookahead + 1 - min_lookahead;
2891   return skip;
2892 }
2893 
2894 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)2895 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2896   const int kSize = RegExpMacroAssembler::kTableSize;
2897 
2898   int min_lookahead = 0;
2899   int max_lookahead = 0;
2900 
2901   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2902 
2903   // Check if we only have a single non-empty position info, and that info
2904   // contains precisely one character.
2905   bool found_single_character = false;
2906   int single_character = 0;
2907   for (int i = max_lookahead; i >= min_lookahead; i--) {
2908     BoyerMoorePositionInfo* map = bitmaps_->at(i);
2909     if (map->map_count() == 0) continue;
2910 
2911     if (found_single_character || map->map_count() > 1) {
2912       found_single_character = false;
2913       break;
2914     }
2915 
2916     DCHECK(!found_single_character);
2917     DCHECK_EQ(map->map_count(), 1);
2918 
2919     found_single_character = true;
2920     single_character = BitsetFirstSetBit(map->raw_bitset());
2921 
2922     DCHECK_NE(single_character, -1);
2923   }
2924 
2925   int lookahead_width = max_lookahead + 1 - min_lookahead;
2926 
2927   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2928     // The mask-compare can probably handle this better.
2929     return;
2930   }
2931 
2932   if (found_single_character) {
2933     Label cont, again;
2934     masm->Bind(&again);
2935     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2936     if (max_char_ > kSize) {
2937       masm->CheckCharacterAfterAnd(single_character,
2938                                    RegExpMacroAssembler::kTableMask, &cont);
2939     } else {
2940       masm->CheckCharacter(single_character, &cont);
2941     }
2942     masm->AdvanceCurrentPosition(lookahead_width);
2943     masm->GoTo(&again);
2944     masm->Bind(&cont);
2945     return;
2946   }
2947 
2948   Factory* factory = masm->isolate()->factory();
2949   Handle<ByteArray> boolean_skip_table =
2950       factory->NewByteArray(kSize, AllocationType::kOld);
2951   int skip_distance =
2952       GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2953   DCHECK_NE(0, skip_distance);
2954 
2955   Label cont, again;
2956   masm->Bind(&again);
2957   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2958   masm->CheckBitInTable(boolean_skip_table, &cont);
2959   masm->AdvanceCurrentPosition(skip_distance);
2960   masm->GoTo(&again);
2961   masm->Bind(&cont);
2962 }
2963 
2964 /* Code generation for choice nodes.
2965  *
2966  * We generate quick checks that do a mask and compare to eliminate a
2967  * choice.  If the quick check succeeds then it jumps to the continuation to
2968  * do slow checks and check subsequent nodes.  If it fails (the common case)
2969  * it falls through to the next choice.
2970  *
2971  * Here is the desired flow graph.  Nodes directly below each other imply
2972  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
2973  * 3 doesn't have a quick check so we have to call the slow check.
2974  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
2975  * regexp continuation is generated directly after the Sn node, up to the
2976  * next GoTo if we decide to reuse some already generated code.  Some
2977  * nodes expect preload_characters to be preloaded into the current
2978  * character register.  R nodes do this preloading.  Vertices are marked
2979  * F for failures and S for success (possible success in the case of quick
2980  * nodes).  L, V, < and > are used as arrow heads.
2981  *
2982  * ----------> R
2983  *             |
2984  *             V
2985  *            Q1 -----> S1
2986  *             |   S   /
2987  *            F|      /
2988  *             |    F/
2989  *             |    /
2990  *             |   R
2991  *             |  /
2992  *             V L
2993  *            Q2 -----> S2
2994  *             |   S   /
2995  *            F|      /
2996  *             |    F/
2997  *             |    /
2998  *             |   R
2999  *             |  /
3000  *             V L
3001  *            S3
3002  *             |
3003  *            F|
3004  *             |
3005  *             R
3006  *             |
3007  * backtrack   V
3008  * <----------Q4
3009  *   \    F    |
3010  *    \        |S
3011  *     \   F   V
3012  *      \-----S4
3013  *
3014  * For greedy loops we push the current position, then generate the code that
3015  * eats the input specially in EmitGreedyLoop.  The other choice (the
3016  * continuation) is generated by the normal code in EmitChoices, and steps back
3017  * in the input to the starting position when it fails to match.  The loop code
3018  * looks like this (U is the unwind code that steps back in the greedy loop).
3019  *
3020  *              _____
3021  *             /     \
3022  *             V     |
3023  * ----------> S1    |
3024  *            /|     |
3025  *           / |S    |
3026  *         F/  \_____/
3027  *         /
3028  *        |<-----
3029  *        |      \
3030  *        V       |S
3031  *        Q2 ---> U----->backtrack
3032  *        |  F   /
3033  *       S|     /
3034  *        V  F /
3035  *        S2--/
3036  */
3037 
GreedyLoopState(bool not_at_start)3038 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3039   counter_backtrack_trace_.set_backtrack(&label_);
3040   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3041 }
3042 
AssertGuardsMentionRegisters(Trace * trace)3043 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3044 #ifdef DEBUG
3045   int choice_count = alternatives_->length();
3046   for (int i = 0; i < choice_count - 1; i++) {
3047     GuardedAlternative alternative = alternatives_->at(i);
3048     ZoneList<Guard*>* guards = alternative.guards();
3049     int guard_count = (guards == nullptr) ? 0 : guards->length();
3050     for (int j = 0; j < guard_count; j++) {
3051       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3052     }
3053   }
3054 #endif
3055 }
3056 
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3057 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
3058                               PreloadState* state) {
3059   if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3060     // Save some time by looking at most one machine word ahead.
3061     state->eats_at_least_ =
3062         EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
3063   }
3064   state->preload_characters_ =
3065       CalculatePreloadCharacters(compiler, state->eats_at_least_);
3066 
3067   state->preload_is_current_ =
3068       (current_trace->characters_preloaded() == state->preload_characters_);
3069   state->preload_has_checked_bounds_ = state->preload_is_current_;
3070 }
3071 
Emit(RegExpCompiler * compiler,Trace * trace)3072 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3073   int choice_count = alternatives_->length();
3074 
3075   if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3076     alternatives_->at(0).node()->Emit(compiler, trace);
3077     return;
3078   }
3079 
3080   AssertGuardsMentionRegisters(trace);
3081 
3082   LimitResult limit_result = LimitVersions(compiler, trace);
3083   if (limit_result == DONE) return;
3084   DCHECK(limit_result == CONTINUE);
3085 
3086   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3087   // other choice nodes we only flush if we are out of code size budget.
3088   if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3089     trace->Flush(compiler, this);
3090     return;
3091   }
3092 
3093   RecursionCheck rc(compiler);
3094 
3095   PreloadState preload;
3096   preload.init();
3097   GreedyLoopState greedy_loop_state(not_at_start());
3098 
3099   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3100   AlternativeGenerationList alt_gens(choice_count, zone());
3101 
3102   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3103     trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3104                            &greedy_loop_state, text_length);
3105   } else {
3106     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
3107     // match the traces produced pre-cleanup.
3108     Label second_choice;
3109     compiler->macro_assembler()->Bind(&second_choice);
3110 
3111     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3112 
3113     EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3114   }
3115 
3116   // At this point we need to generate slow checks for the alternatives where
3117   // the quick check was inlined.  We can recognize these because the associated
3118   // label was bound.
3119   int new_flush_budget = trace->flush_budget() / choice_count;
3120   for (int i = 0; i < choice_count; i++) {
3121     AlternativeGeneration* alt_gen = alt_gens.at(i);
3122     Trace new_trace(*trace);
3123     // If there are actions to be flushed we have to limit how many times
3124     // they are flushed.  Take the budget of the parent trace and distribute
3125     // it fairly amongst the children.
3126     if (new_trace.actions() != nullptr) {
3127       new_trace.set_flush_budget(new_flush_budget);
3128     }
3129     bool next_expects_preload =
3130         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3131     EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
3132                               alt_gen, preload.preload_characters_,
3133                               next_expects_preload);
3134   }
3135 }
3136 
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3137 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
3138                                   AlternativeGenerationList* alt_gens,
3139                                   PreloadState* preload,
3140                                   GreedyLoopState* greedy_loop_state,
3141                                   int text_length) {
3142   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3143   // Here we have special handling for greedy loops containing only text nodes
3144   // and other simple nodes.  These are handled by pushing the current
3145   // position on the stack and then incrementing the current position each
3146   // time around the switch.  On backtrack we decrement the current position
3147   // and check it against the pushed value.  This avoids pushing backtrack
3148   // information for each iteration of the loop, which could take up a lot of
3149   // space.
3150   DCHECK(trace->stop_node() == nullptr);
3151   macro_assembler->PushCurrentPosition();
3152   Label greedy_match_failed;
3153   Trace greedy_match_trace;
3154   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3155   greedy_match_trace.set_backtrack(&greedy_match_failed);
3156   Label loop_label;
3157   macro_assembler->Bind(&loop_label);
3158   greedy_match_trace.set_stop_node(this);
3159   greedy_match_trace.set_loop_label(&loop_label);
3160   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3161   macro_assembler->Bind(&greedy_match_failed);
3162 
3163   Label second_choice;  // For use in greedy matches.
3164   macro_assembler->Bind(&second_choice);
3165 
3166   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3167 
3168   EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3169 
3170   macro_assembler->Bind(greedy_loop_state->label());
3171   // If we have unwound to the bottom then backtrack.
3172   macro_assembler->CheckGreedyLoop(trace->backtrack());
3173   // Otherwise try the second priority at an earlier position.
3174   macro_assembler->AdvanceCurrentPosition(-text_length);
3175   macro_assembler->GoTo(&second_choice);
3176   return new_trace;
3177 }
3178 
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)3179 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3180                                               Trace* trace) {
3181   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3182   if (alternatives_->length() != 2) return eats_at_least;
3183 
3184   GuardedAlternative alt1 = alternatives_->at(1);
3185   if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3186     return eats_at_least;
3187   }
3188   RegExpNode* eats_anything_node = alt1.node();
3189   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3190     return eats_at_least;
3191   }
3192 
3193   // Really we should be creating a new trace when we execute this function,
3194   // but there is no need, because the code it generates cannot backtrack, and
3195   // we always arrive here with a trivial trace (since it's the entry to a
3196   // loop.  That also implies that there are no preloaded characters, which is
3197   // good, because it means we won't be violating any assumptions by
3198   // overwriting those characters with new load instructions.
3199   DCHECK(trace->is_trivial());
3200 
3201   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3202   Isolate* isolate = macro_assembler->isolate();
3203   // At this point we know that we are at a non-greedy loop that will eat
3204   // any character one at a time.  Any non-anchored regexp has such a
3205   // loop prepended to it in order to find where it starts.  We look for
3206   // a pattern of the form ...abc... where we can look 6 characters ahead
3207   // and step forwards 3 if the character is not one of abc.  Abc need
3208   // not be atoms, they can be any reasonably limited character class or
3209   // small alternation.
3210   BoyerMooreLookahead* bm = bm_info(false);
3211   if (bm == nullptr) {
3212     eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
3213     if (eats_at_least >= 1) {
3214       bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
3215       GuardedAlternative alt0 = alternatives_->at(0);
3216       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
3217     }
3218   }
3219   if (bm != nullptr) {
3220     bm->EmitSkipInstructions(macro_assembler);
3221   }
3222   return eats_at_least;
3223 }
3224 
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)3225 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3226                              AlternativeGenerationList* alt_gens,
3227                              int first_choice, Trace* trace,
3228                              PreloadState* preload) {
3229   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3230   SetUpPreLoad(compiler, trace, preload);
3231 
3232   // For now we just call all choices one after the other.  The idea ultimately
3233   // is to use the Dispatch table to try only the relevant ones.
3234   int choice_count = alternatives_->length();
3235 
3236   int new_flush_budget = trace->flush_budget() / choice_count;
3237 
3238   for (int i = first_choice; i < choice_count; i++) {
3239     bool is_last = i == choice_count - 1;
3240     bool fall_through_on_failure = !is_last;
3241     GuardedAlternative alternative = alternatives_->at(i);
3242     AlternativeGeneration* alt_gen = alt_gens->at(i);
3243     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3244     ZoneList<Guard*>* guards = alternative.guards();
3245     int guard_count = (guards == nullptr) ? 0 : guards->length();
3246     Trace new_trace(*trace);
3247     new_trace.set_characters_preloaded(
3248         preload->preload_is_current_ ? preload->preload_characters_ : 0);
3249     if (preload->preload_has_checked_bounds_) {
3250       new_trace.set_bound_checked_up_to(preload->preload_characters_);
3251     }
3252     new_trace.quick_check_performed()->Clear();
3253     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3254     if (!is_last) {
3255       new_trace.set_backtrack(&alt_gen->after);
3256     }
3257     alt_gen->expects_preload = preload->preload_is_current_;
3258     bool generate_full_check_inline = false;
3259     if (compiler->optimize() &&
3260         try_to_emit_quick_check_for_alternative(i == 0) &&
3261         alternative.node()->EmitQuickCheck(
3262             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3263             &alt_gen->possible_success, &alt_gen->quick_check_details,
3264             fall_through_on_failure, this)) {
3265       // Quick check was generated for this choice.
3266       preload->preload_is_current_ = true;
3267       preload->preload_has_checked_bounds_ = true;
3268       // If we generated the quick check to fall through on possible success,
3269       // we now need to generate the full check inline.
3270       if (!fall_through_on_failure) {
3271         macro_assembler->Bind(&alt_gen->possible_success);
3272         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3273         new_trace.set_characters_preloaded(preload->preload_characters_);
3274         new_trace.set_bound_checked_up_to(preload->preload_characters_);
3275         generate_full_check_inline = true;
3276       }
3277     } else if (alt_gen->quick_check_details.cannot_match()) {
3278       if (!fall_through_on_failure) {
3279         macro_assembler->GoTo(trace->backtrack());
3280       }
3281       continue;
3282     } else {
3283       // No quick check was generated.  Put the full code here.
3284       // If this is not the first choice then there could be slow checks from
3285       // previous cases that go here when they fail.  There's no reason to
3286       // insist that they preload characters since the slow check we are about
3287       // to generate probably can't use it.
3288       if (i != first_choice) {
3289         alt_gen->expects_preload = false;
3290         new_trace.InvalidateCurrentCharacter();
3291       }
3292       generate_full_check_inline = true;
3293     }
3294     if (generate_full_check_inline) {
3295       if (new_trace.actions() != nullptr) {
3296         new_trace.set_flush_budget(new_flush_budget);
3297       }
3298       for (int j = 0; j < guard_count; j++) {
3299         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3300       }
3301       alternative.node()->Emit(compiler, &new_trace);
3302       preload->preload_is_current_ = false;
3303     }
3304     macro_assembler->Bind(&alt_gen->after);
3305   }
3306 }
3307 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)3308 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3309                                            Trace* trace,
3310                                            GuardedAlternative alternative,
3311                                            AlternativeGeneration* alt_gen,
3312                                            int preload_characters,
3313                                            bool next_expects_preload) {
3314   if (!alt_gen->possible_success.is_linked()) return;
3315 
3316   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3317   macro_assembler->Bind(&alt_gen->possible_success);
3318   Trace out_of_line_trace(*trace);
3319   out_of_line_trace.set_characters_preloaded(preload_characters);
3320   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3321   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3322   ZoneList<Guard*>* guards = alternative.guards();
3323   int guard_count = (guards == nullptr) ? 0 : guards->length();
3324   if (next_expects_preload) {
3325     Label reload_current_char;
3326     out_of_line_trace.set_backtrack(&reload_current_char);
3327     for (int j = 0; j < guard_count; j++) {
3328       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3329     }
3330     alternative.node()->Emit(compiler, &out_of_line_trace);
3331     macro_assembler->Bind(&reload_current_char);
3332     // Reload the current character, since the next quick check expects that.
3333     // We don't need to check bounds here because we only get into this
3334     // code through a quick check which already did the checked load.
3335     macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3336                                           preload_characters);
3337     macro_assembler->GoTo(&(alt_gen->after));
3338   } else {
3339     out_of_line_trace.set_backtrack(&(alt_gen->after));
3340     for (int j = 0; j < guard_count; j++) {
3341       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3342     }
3343     alternative.node()->Emit(compiler, &out_of_line_trace);
3344   }
3345 }
3346 
Emit(RegExpCompiler * compiler,Trace * trace)3347 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3348   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3349   LimitResult limit_result = LimitVersions(compiler, trace);
3350   if (limit_result == DONE) return;
3351   DCHECK(limit_result == CONTINUE);
3352 
3353   RecursionCheck rc(compiler);
3354 
3355   switch (action_type_) {
3356     case STORE_POSITION: {
3357       Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3358                                          data_.u_position_register.is_capture,
3359                                          trace);
3360       Trace new_trace = *trace;
3361       new_trace.add_action(&new_capture);
3362       on_success()->Emit(compiler, &new_trace);
3363       break;
3364     }
3365     case INCREMENT_REGISTER: {
3366       Trace::DeferredIncrementRegister new_increment(
3367           data_.u_increment_register.reg);
3368       Trace new_trace = *trace;
3369       new_trace.add_action(&new_increment);
3370       on_success()->Emit(compiler, &new_trace);
3371       break;
3372     }
3373     case SET_REGISTER_FOR_LOOP: {
3374       Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
3375                                                 data_.u_store_register.value);
3376       Trace new_trace = *trace;
3377       new_trace.add_action(&new_set);
3378       on_success()->Emit(compiler, &new_trace);
3379       break;
3380     }
3381     case CLEAR_CAPTURES: {
3382       Trace::DeferredClearCaptures new_capture(Interval(
3383           data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3384       Trace new_trace = *trace;
3385       new_trace.add_action(&new_capture);
3386       on_success()->Emit(compiler, &new_trace);
3387       break;
3388     }
3389     case BEGIN_POSITIVE_SUBMATCH:
3390     case BEGIN_NEGATIVE_SUBMATCH:
3391       if (!trace->is_trivial()) {
3392         trace->Flush(compiler, this);
3393       } else {
3394         assembler->WriteCurrentPositionToRegister(
3395             data_.u_submatch.current_position_register, 0);
3396         assembler->WriteStackPointerToRegister(
3397             data_.u_submatch.stack_pointer_register);
3398         on_success()->Emit(compiler, trace);
3399       }
3400       break;
3401     case EMPTY_MATCH_CHECK: {
3402       int start_pos_reg = data_.u_empty_match_check.start_register;
3403       int stored_pos = 0;
3404       int rep_reg = data_.u_empty_match_check.repetition_register;
3405       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3406       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3407       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3408         // If we know we haven't advanced and there is no minimum we
3409         // can just backtrack immediately.
3410         assembler->GoTo(trace->backtrack());
3411       } else if (know_dist && stored_pos < trace->cp_offset()) {
3412         // If we know we've advanced we can generate the continuation
3413         // immediately.
3414         on_success()->Emit(compiler, trace);
3415       } else if (!trace->is_trivial()) {
3416         trace->Flush(compiler, this);
3417       } else {
3418         Label skip_empty_check;
3419         // If we have a minimum number of repetitions we check the current
3420         // number first and skip the empty check if it's not enough.
3421         if (has_minimum) {
3422           int limit = data_.u_empty_match_check.repetition_limit;
3423           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3424         }
3425         // If the match is empty we bail out, otherwise we fall through
3426         // to the on-success continuation.
3427         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3428                                    trace->backtrack());
3429         assembler->Bind(&skip_empty_check);
3430         on_success()->Emit(compiler, trace);
3431       }
3432       break;
3433     }
3434     case POSITIVE_SUBMATCH_SUCCESS: {
3435       if (!trace->is_trivial()) {
3436         trace->Flush(compiler, this);
3437         return;
3438       }
3439       assembler->ReadCurrentPositionFromRegister(
3440           data_.u_submatch.current_position_register);
3441       assembler->ReadStackPointerFromRegister(
3442           data_.u_submatch.stack_pointer_register);
3443       int clear_register_count = data_.u_submatch.clear_register_count;
3444       if (clear_register_count == 0) {
3445         on_success()->Emit(compiler, trace);
3446         return;
3447       }
3448       int clear_registers_from = data_.u_submatch.clear_register_from;
3449       Label clear_registers_backtrack;
3450       Trace new_trace = *trace;
3451       new_trace.set_backtrack(&clear_registers_backtrack);
3452       on_success()->Emit(compiler, &new_trace);
3453 
3454       assembler->Bind(&clear_registers_backtrack);
3455       int clear_registers_to = clear_registers_from + clear_register_count - 1;
3456       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3457 
3458       DCHECK(trace->backtrack() == nullptr);
3459       assembler->Backtrack();
3460       return;
3461     }
3462     default:
3463       UNREACHABLE();
3464   }
3465 }
3466 
Emit(RegExpCompiler * compiler,Trace * trace)3467 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3468   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3469   if (!trace->is_trivial()) {
3470     trace->Flush(compiler, this);
3471     return;
3472   }
3473 
3474   LimitResult limit_result = LimitVersions(compiler, trace);
3475   if (limit_result == DONE) return;
3476   DCHECK(limit_result == CONTINUE);
3477 
3478   RecursionCheck rc(compiler);
3479 
3480   DCHECK_EQ(start_reg_ + 1, end_reg_);
3481   if (IsIgnoreCase(flags_)) {
3482     bool unicode = IsUnicode(flags_);
3483     assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
3484                                                unicode, trace->backtrack());
3485   } else {
3486     assembler->CheckNotBackReference(start_reg_, read_backward(),
3487                                      trace->backtrack());
3488   }
3489   // We are going to advance backward, so we may end up at the start.
3490   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3491 
3492   // Check that the back reference does not end inside a surrogate pair.
3493   if (IsUnicode(flags_) && !compiler->one_byte()) {
3494     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3495   }
3496   on_success()->Emit(compiler, trace);
3497 }
3498 
CalculateOffsets()3499 void TextNode::CalculateOffsets() {
3500   int element_count = elements()->length();
3501   // Set up the offsets of the elements relative to the start.  This is a fixed
3502   // quantity since a TextNode can only contain fixed-width things.
3503   int cp_offset = 0;
3504   for (int i = 0; i < element_count; i++) {
3505     TextElement& elm = elements()->at(i);
3506     elm.set_cp_offset(cp_offset);
3507     cp_offset += elm.length();
3508   }
3509 }
3510 
3511 namespace {
3512 
3513 // Assertion propagation moves information about assertions such as
3514 // \b to the affected nodes.  For instance, in /.\b./ information must
3515 // be propagated to the first '.' that whatever follows needs to know
3516 // if it matched a word or a non-word, and to the second '.' that it
3517 // has to check if it succeeds a word or non-word.  In this case the
3518 // result will be something like:
3519 //
3520 //   +-------+        +------------+
3521 //   |   .   |        |      .     |
3522 //   +-------+  --->  +------------+
3523 //   | word? |        | check word |
3524 //   +-------+        +------------+
3525 class AssertionPropagator : public AllStatic {
3526  public:
VisitText(TextNode * that)3527   static void VisitText(TextNode* that) {}
3528 
VisitAction(ActionNode * that)3529   static void VisitAction(ActionNode* that) {
3530     // If the next node is interested in what it follows then this node
3531     // has to be interested too so it can pass the information on.
3532     that->info()->AddFromFollowing(that->on_success()->info());
3533   }
3534 
VisitChoice(ChoiceNode * that,int i)3535   static void VisitChoice(ChoiceNode* that, int i) {
3536     // Anything the following nodes need to know has to be known by
3537     // this node also, so it can pass it on.
3538     that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
3539   }
3540 
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3541   static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3542     that->info()->AddFromFollowing(that->continue_node()->info());
3543   }
3544 
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3545   static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
3546     that->info()->AddFromFollowing(that->loop_node()->info());
3547   }
3548 
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3549   static void VisitNegativeLookaroundChoiceLookaroundNode(
3550       NegativeLookaroundChoiceNode* that) {
3551     VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
3552   }
3553 
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3554   static void VisitNegativeLookaroundChoiceContinueNode(
3555       NegativeLookaroundChoiceNode* that) {
3556     VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
3557   }
3558 
VisitBackReference(BackReferenceNode * that)3559   static void VisitBackReference(BackReferenceNode* that) {}
3560 
VisitAssertion(AssertionNode * that)3561   static void VisitAssertion(AssertionNode* that) {}
3562 };
3563 
3564 // Propagates information about the minimum size of successful matches from
3565 // successor nodes to their predecessors. Note that all eats_at_least values
3566 // are initialized to zero before analysis.
3567 class EatsAtLeastPropagator : public AllStatic {
3568  public:
VisitText(TextNode * that)3569   static void VisitText(TextNode* that) {
3570     // The eats_at_least value is not used if reading backward.
3571     if (!that->read_backward()) {
3572       // We are not at the start after this node, and thus we can use the
3573       // successor's eats_at_least_from_not_start value.
3574       uint8_t eats_at_least = base::saturated_cast<uint8_t>(
3575           that->Length() + that->on_success()
3576                                ->eats_at_least_info()
3577                                ->eats_at_least_from_not_start);
3578       that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
3579     }
3580   }
3581 
VisitAction(ActionNode * that)3582   static void VisitAction(ActionNode* that) {
3583     switch (that->action_type()) {
3584       case ActionNode::BEGIN_POSITIVE_SUBMATCH:
3585       case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3586         // We do not propagate eats_at_least data through positive lookarounds,
3587         // because they rewind input.
3588         // TODO(v8:11859) Potential approaches for fixing this include:
3589         // 1. Add a dedicated choice node for positive lookaround, similar to
3590         //    NegativeLookaroundChoiceNode.
3591         // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which
3592         //    is <= eats_at_least_from_possibly_start, and use that value in
3593         //    EatsAtLeastFromLoopEntry.
3594         DCHECK(that->eats_at_least_info()->IsZero());
3595         break;
3596       case ActionNode::SET_REGISTER_FOR_LOOP:
3597         // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the
3598         // loop body will run at least the minimum number of times before the
3599         // continuation case can run.
3600         that->set_eats_at_least_info(
3601             that->on_success()->EatsAtLeastFromLoopEntry());
3602         break;
3603       case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
3604       default:
3605         // Otherwise, the current node eats at least as much as its successor.
3606         // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH
3607         // because NegativeLookaroundChoiceNode ignores its lookaround successor
3608         // when computing eats-at-least and quick check information.
3609         that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3610         break;
3611     }
3612   }
3613 
VisitChoice(ChoiceNode * that,int i)3614   static void VisitChoice(ChoiceNode* that, int i) {
3615     // The minimum possible match from a choice node is the minimum of its
3616     // successors.
3617     EatsAtLeastInfo eats_at_least =
3618         i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
3619     eats_at_least.SetMin(
3620         *that->alternatives()->at(i).node()->eats_at_least_info());
3621     that->set_eats_at_least_info(eats_at_least);
3622   }
3623 
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3624   static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3625     if (!that->read_backward()) {
3626       that->set_eats_at_least_info(
3627           *that->continue_node()->eats_at_least_info());
3628     }
3629   }
3630 
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3631   static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
3632 
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3633   static void VisitNegativeLookaroundChoiceLookaroundNode(
3634       NegativeLookaroundChoiceNode* that) {}
3635 
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3636   static void VisitNegativeLookaroundChoiceContinueNode(
3637       NegativeLookaroundChoiceNode* that) {
3638     that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3639   }
3640 
VisitBackReference(BackReferenceNode * that)3641   static void VisitBackReference(BackReferenceNode* that) {
3642     if (!that->read_backward()) {
3643       that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3644     }
3645   }
3646 
VisitAssertion(AssertionNode * that)3647   static void VisitAssertion(AssertionNode* that) {
3648     EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
3649     if (that->assertion_type() == AssertionNode::AT_START) {
3650       // If we know we are not at the start and we are asked "how many
3651       // characters will you match if you succeed?" then we can answer anything
3652       // since false implies false.  So let's just set the max answer
3653       // (UINT8_MAX) since that won't prevent us from preloading a lot of
3654       // characters for the other branches in the node graph.
3655       eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
3656     }
3657     that->set_eats_at_least_info(eats_at_least);
3658   }
3659 };
3660 
3661 }  // namespace
3662 
3663 // -------------------------------------------------------------------
3664 // Analysis
3665 
3666 // Iterates the node graph and provides the opportunity for propagators to set
3667 // values that depend on successor nodes.
3668 template <typename... Propagators>
3669 class Analysis : public NodeVisitor {
3670  public:
Analysis(Isolate * isolate,bool is_one_byte,RegExpFlags flags)3671   Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
3672       : isolate_(isolate),
3673         is_one_byte_(is_one_byte),
3674         flags_(flags),
3675         error_(RegExpError::kNone) {}
3676 
EnsureAnalyzed(RegExpNode * that)3677   void EnsureAnalyzed(RegExpNode* that) {
3678     StackLimitCheck check(isolate());
3679     if (check.HasOverflowed()) {
3680       if (FLAG_correctness_fuzzer_suppressions) {
3681         FATAL("Analysis: Aborting on stack overflow");
3682       }
3683       fail(RegExpError::kAnalysisStackOverflow);
3684       return;
3685     }
3686     if (that->info()->been_analyzed || that->info()->being_analyzed) return;
3687     that->info()->being_analyzed = true;
3688     that->Accept(this);
3689     that->info()->being_analyzed = false;
3690     that->info()->been_analyzed = true;
3691   }
3692 
has_failed()3693   bool has_failed() { return error_ != RegExpError::kNone; }
error()3694   RegExpError error() {
3695     DCHECK(error_ != RegExpError::kNone);
3696     return error_;
3697   }
fail(RegExpError error)3698   void fail(RegExpError error) { error_ = error; }
3699 
isolate() const3700   Isolate* isolate() const { return isolate_; }
3701 
VisitEnd(EndNode * that)3702   void VisitEnd(EndNode* that) override {
3703     // nothing to do
3704   }
3705 
3706 // Used to call the given static function on each propagator / variadic template
3707 // argument.
3708 #define STATIC_FOR_EACH(expr)       \
3709   do {                              \
3710     int dummy[] = {((expr), 0)...}; \
3711     USE(dummy);                     \
3712   } while (false)
3713 
VisitText(TextNode * that)3714   void VisitText(TextNode* that) override {
3715     that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
3716     EnsureAnalyzed(that->on_success());
3717     if (has_failed()) return;
3718     that->CalculateOffsets();
3719     STATIC_FOR_EACH(Propagators::VisitText(that));
3720   }
3721 
VisitAction(ActionNode * that)3722   void VisitAction(ActionNode* that) override {
3723     EnsureAnalyzed(that->on_success());
3724     if (has_failed()) return;
3725     STATIC_FOR_EACH(Propagators::VisitAction(that));
3726   }
3727 
VisitChoice(ChoiceNode * that)3728   void VisitChoice(ChoiceNode* that) override {
3729     for (int i = 0; i < that->alternatives()->length(); i++) {
3730       EnsureAnalyzed(that->alternatives()->at(i).node());
3731       if (has_failed()) return;
3732       STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
3733     }
3734   }
3735 
VisitLoopChoice(LoopChoiceNode * that)3736   void VisitLoopChoice(LoopChoiceNode* that) override {
3737     DCHECK_EQ(that->alternatives()->length(), 2);  // Just loop and continue.
3738 
3739     // First propagate all information from the continuation node.
3740     EnsureAnalyzed(that->continue_node());
3741     if (has_failed()) return;
3742     STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
3743 
3744     // Check the loop last since it may need the value of this node
3745     // to get a correct result.
3746     EnsureAnalyzed(that->loop_node());
3747     if (has_failed()) return;
3748     STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
3749   }
3750 
VisitNegativeLookaroundChoice(NegativeLookaroundChoiceNode * that)3751   void VisitNegativeLookaroundChoice(
3752       NegativeLookaroundChoiceNode* that) override {
3753     DCHECK_EQ(that->alternatives()->length(), 2);  // Lookaround and continue.
3754 
3755     EnsureAnalyzed(that->lookaround_node());
3756     if (has_failed()) return;
3757     STATIC_FOR_EACH(
3758         Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
3759 
3760     EnsureAnalyzed(that->continue_node());
3761     if (has_failed()) return;
3762     STATIC_FOR_EACH(
3763         Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
3764   }
3765 
VisitBackReference(BackReferenceNode * that)3766   void VisitBackReference(BackReferenceNode* that) override {
3767     EnsureAnalyzed(that->on_success());
3768     if (has_failed()) return;
3769     STATIC_FOR_EACH(Propagators::VisitBackReference(that));
3770   }
3771 
VisitAssertion(AssertionNode * that)3772   void VisitAssertion(AssertionNode* that) override {
3773     EnsureAnalyzed(that->on_success());
3774     if (has_failed()) return;
3775     STATIC_FOR_EACH(Propagators::VisitAssertion(that));
3776   }
3777 
3778 #undef STATIC_FOR_EACH
3779 
3780  private:
3781   Isolate* isolate_;
3782   const bool is_one_byte_;
3783   const RegExpFlags flags_;
3784   RegExpError error_;
3785 
3786   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
3787 };
3788 
AnalyzeRegExp(Isolate * isolate,bool is_one_byte,RegExpFlags flags,RegExpNode * node)3789 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
3790                           RegExpNode* node) {
3791   Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(
3792       isolate, is_one_byte, flags);
3793   DCHECK_EQ(node->info()->been_analyzed, false);
3794   analysis.EnsureAnalyzed(node);
3795   DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
3796   return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
3797 }
3798 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3799 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3800                                      BoyerMooreLookahead* bm,
3801                                      bool not_at_start) {
3802   // Working out the set of characters that a backreference can match is too
3803   // hard, so we just say that any character can match.
3804   bm->SetRest(offset);
3805   SaveBMInfo(bm, not_at_start, offset);
3806 }
3807 
3808 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
3809               RegExpMacroAssembler::kTableSize);
3810 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3811 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3812                               BoyerMooreLookahead* bm, bool not_at_start) {
3813   ZoneList<GuardedAlternative>* alts = alternatives();
3814   budget = (budget - 1) / alts->length();
3815   for (int i = 0; i < alts->length(); i++) {
3816     GuardedAlternative& alt = alts->at(i);
3817     if (alt.guards() != nullptr && alt.guards()->length() != 0) {
3818       bm->SetRest(offset);  // Give up trying to fill in info.
3819       SaveBMInfo(bm, not_at_start, offset);
3820       return;
3821     }
3822     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
3823   }
3824   SaveBMInfo(bm, not_at_start, offset);
3825 }
3826 
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3827 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
3828                             BoyerMooreLookahead* bm, bool not_at_start) {
3829   if (initial_offset >= bm->length()) return;
3830   int offset = initial_offset;
3831   int max_char = bm->max_char();
3832   for (int i = 0; i < elements()->length(); i++) {
3833     if (offset >= bm->length()) {
3834       if (initial_offset == 0) set_bm_info(not_at_start, bm);
3835       return;
3836     }
3837     TextElement text = elements()->at(i);
3838     if (text.text_type() == TextElement::ATOM) {
3839       RegExpAtom* atom = text.atom();
3840       for (int j = 0; j < atom->length(); j++, offset++) {
3841         if (offset >= bm->length()) {
3842           if (initial_offset == 0) set_bm_info(not_at_start, bm);
3843           return;
3844         }
3845         base::uc16 character = atom->data()[j];
3846         if (IsIgnoreCase(bm->compiler()->flags())) {
3847           unibrow::uchar chars[4];
3848           int length = GetCaseIndependentLetters(
3849               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
3850               chars, 4);
3851           for (int k = 0; k < length; k++) {
3852             bm->Set(offset, chars[k]);
3853           }
3854         } else {
3855           if (character <= max_char) bm->Set(offset, character);
3856         }
3857       }
3858     } else {
3859       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
3860       RegExpCharacterClass* char_class = text.char_class();
3861       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
3862       if (char_class->is_negated()) {
3863         bm->SetAll(offset);
3864       } else {
3865         for (int k = 0; k < ranges->length(); k++) {
3866           CharacterRange& range = ranges->at(k);
3867           if (static_cast<int>(range.from()) > max_char) continue;
3868           int to = std::min(max_char, static_cast<int>(range.to()));
3869           bm->SetInterval(offset, Interval(range.from(), to));
3870         }
3871       }
3872       offset++;
3873     }
3874   }
3875   if (offset >= bm->length()) {
3876     if (initial_offset == 0) set_bm_info(not_at_start, bm);
3877     return;
3878   }
3879   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
3880                              true);  // Not at start after a text node.
3881   if (initial_offset == 0) set_bm_info(not_at_start, bm);
3882 }
3883 
OptionallyStepBackToLeadSurrogate(RegExpNode * on_success)3884 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
3885     RegExpNode* on_success) {
3886   DCHECK(!read_backward());
3887   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3888       zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3889   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3890       zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3891 
3892   ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
3893 
3894   int stack_register = UnicodeLookaroundStackRegister();
3895   int position_register = UnicodeLookaroundPositionRegister();
3896   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
3897       zone(), lead_surrogates, true, on_success);
3898   RegExpLookaround::Builder builder(true, step_back, stack_register,
3899                                     position_register);
3900   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
3901       zone(), trail_surrogates, false, builder.on_match_success());
3902 
3903   optional_step_back->AddAlternative(
3904       GuardedAlternative(builder.ForMatch(match_trail)));
3905   optional_step_back->AddAlternative(GuardedAlternative(on_success));
3906 
3907   return optional_step_back;
3908 }
3909 
PreprocessRegExp(RegExpCompileData * data,RegExpFlags flags,bool is_one_byte)3910 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
3911                                              RegExpFlags flags,
3912                                              bool is_one_byte) {
3913   // Wrap the body of the regexp in capture #0.
3914   RegExpNode* captured_body =
3915       RegExpCapture::ToNode(data->tree, 0, this, accept());
3916   RegExpNode* node = captured_body;
3917   if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
3918     // Add a .*? at the beginning, outside the body capture, unless
3919     // this expression is anchored at the beginning or sticky.
3920     RegExpNode* loop_node = RegExpQuantifier::ToNode(
3921         0, RegExpTree::kInfinity, false,
3922         zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
3923         this, captured_body, data->contains_anchor);
3924 
3925     if (data->contains_anchor) {
3926       // Unroll loop once, to take care of the case that might start
3927       // at the start of input.
3928       ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
3929       first_step_node->AddAlternative(GuardedAlternative(captured_body));
3930       first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
3931           zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
3932           false, loop_node)));
3933       node = first_step_node;
3934     } else {
3935       node = loop_node;
3936     }
3937   }
3938   if (is_one_byte) {
3939     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
3940     // Do it again to propagate the new nodes to places where they were not
3941     // put because they had not been calculated yet.
3942     if (node != nullptr) {
3943       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
3944     }
3945   } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
3946     node = OptionallyStepBackToLeadSurrogate(node);
3947   }
3948 
3949   if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
3950   return node;
3951 }
3952 
ToNodeCheckForStackOverflow()3953 void RegExpCompiler::ToNodeCheckForStackOverflow() {
3954   if (StackLimitCheck{isolate()}.HasOverflowed()) {
3955     FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
3956   }
3957 }
3958 
3959 }  // namespace internal
3960 }  // namespace v8
3961