1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-compiler.h"
6
7 #include "src/base/safe_conversions.h"
8 #include "src/execution/isolate.h"
9 #include "src/objects/objects-inl.h"
10 #include "src/regexp/regexp-macro-assembler-arch.h"
11 #ifdef V8_INTL_SUPPORT
12 #include "src/regexp/special-case.h"
13 #endif // V8_INTL_SUPPORT
14 #include "src/strings/unicode-inl.h"
15 #include "src/zone/zone-list-inl.h"
16
17 #ifdef V8_INTL_SUPPORT
18 #include "unicode/locid.h"
19 #include "unicode/uniset.h"
20 #include "unicode/utypes.h"
21 #endif // V8_INTL_SUPPORT
22
23 namespace v8 {
24 namespace internal {
25
26 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
27
28 // -------------------------------------------------------------------
29 // Implementation of the Irregexp regular expression engine.
30 //
31 // The Irregexp regular expression engine is intended to be a complete
32 // implementation of ECMAScript regular expressions. It generates either
33 // bytecodes or native code.
34
35 // The Irregexp regexp engine is structured in three steps.
36 // 1) The parser generates an abstract syntax tree. See ast.cc.
37 // 2) From the AST a node network is created. The nodes are all
38 // subclasses of RegExpNode. The nodes represent states when
39 // executing a regular expression. Several optimizations are
40 // performed on the node network.
41 // 3) From the nodes we generate either byte codes or native code
42 // that can actually execute the regular expression (perform
43 // the search). The code generation step is described in more
44 // detail below.
45
46 // Code generation.
47 //
48 // The nodes are divided into four main categories.
49 // * Choice nodes
50 // These represent places where the regular expression can
51 // match in more than one way. For example on entry to an
52 // alternation (foo|bar) or a repetition (*, +, ? or {}).
53 // * Action nodes
54 // These represent places where some action should be
55 // performed. Examples include recording the current position
56 // in the input string to a register (in order to implement
57 // captures) or other actions on register for example in order
58 // to implement the counters needed for {} repetitions.
59 // * Matching nodes
60 // These attempt to match some element part of the input string.
61 // Examples of elements include character classes, plain strings
62 // or back references.
63 // * End nodes
64 // These are used to implement the actions required on finding
65 // a successful match or failing to find a match.
66 //
67 // The code generated (whether as byte codes or native code) maintains
68 // some state as it runs. This consists of the following elements:
69 //
70 // * The capture registers. Used for string captures.
71 // * Other registers. Used for counters etc.
72 // * The current position.
73 // * The stack of backtracking information. Used when a matching node
74 // fails to find a match and needs to try an alternative.
75 //
76 // Conceptual regular expression execution model:
77 //
78 // There is a simple conceptual model of regular expression execution
79 // which will be presented first. The actual code generated is a more
80 // efficient simulation of the simple conceptual model:
81 //
82 // * Choice nodes are implemented as follows:
83 // For each choice except the last {
84 // push current position
85 // push backtrack code location
86 // <generate code to test for choice>
87 // backtrack code location:
88 // pop current position
89 // }
90 // <generate code to test for last choice>
91 //
92 // * Actions nodes are generated as follows
93 // <push affected registers on backtrack stack>
94 // <generate code to perform action>
95 // push backtrack code location
96 // <generate code to test for following nodes>
97 // backtrack code location:
98 // <pop affected registers to restore their state>
99 // <pop backtrack location from stack and go to it>
100 //
101 // * Matching nodes are generated as follows:
102 // if input string matches at current position
103 // update current position
104 // <generate code to test for following nodes>
105 // else
106 // <pop backtrack location from stack and go to it>
107 //
108 // Thus it can be seen that the current position is saved and restored
109 // by the choice nodes, whereas the registers are saved and restored by
110 // by the action nodes that manipulate them.
111 //
112 // The other interesting aspect of this model is that nodes are generated
113 // at the point where they are needed by a recursive call to Emit(). If
114 // the node has already been code generated then the Emit() call will
115 // generate a jump to the previously generated code instead. In order to
116 // limit recursion it is possible for the Emit() function to put the node
117 // on a work list for later generation and instead generate a jump. The
118 // destination of the jump is resolved later when the code is generated.
119 //
120 // Actual regular expression code generation.
121 //
122 // Code generation is actually more complicated than the above. In order
123 // to improve the efficiency of the generated code some optimizations are
124 // performed
125 //
126 // * Choice nodes have 1-character lookahead.
127 // A choice node looks at the following character and eliminates some of
128 // the choices immediately based on that character. This is not yet
129 // implemented.
130 // * Simple greedy loops store reduced backtracking information.
131 // A quantifier like /.*foo/m will greedily match the whole input. It will
132 // then need to backtrack to a point where it can match "foo". The naive
133 // implementation of this would push each character position onto the
134 // backtracking stack, then pop them off one by one. This would use space
135 // proportional to the length of the input string. However since the "."
136 // can only match in one way and always has a constant length (in this case
137 // of 1) it suffices to store the current position on the top of the stack
138 // once. Matching now becomes merely incrementing the current position and
139 // backtracking becomes decrementing the current position and checking the
140 // result against the stored current position. This is faster and saves
141 // space.
142 // * The current state is virtualized.
143 // This is used to defer expensive operations until it is clear that they
144 // are needed and to generate code for a node more than once, allowing
145 // specialized an efficient versions of the code to be created. This is
146 // explained in the section below.
147 //
148 // Execution state virtualization.
149 //
150 // Instead of emitting code, nodes that manipulate the state can record their
151 // manipulation in an object called the Trace. The Trace object can record a
152 // current position offset, an optional backtrack code location on the top of
153 // the virtualized backtrack stack and some register changes. When a node is
154 // to be emitted it can flush the Trace or update it. Flushing the Trace
155 // will emit code to bring the actual state into line with the virtual state.
156 // Avoiding flushing the state can postpone some work (e.g. updates of capture
157 // registers). Postponing work can save time when executing the regular
158 // expression since it may be found that the work never has to be done as a
159 // failure to match can occur. In addition it is much faster to jump to a
160 // known backtrack code location than it is to pop an unknown backtrack
161 // location from the stack and jump there.
162 //
163 // The virtual state found in the Trace affects code generation. For example
164 // the virtual state contains the difference between the actual current
165 // position and the virtual current position, and matching code needs to use
166 // this offset to attempt a match in the correct location of the input
167 // string. Therefore code generated for a non-trivial trace is specialized
168 // to that trace. The code generator therefore has the ability to generate
169 // code for each node several times. In order to limit the size of the
170 // generated code there is an arbitrary limit on how many specialized sets of
171 // code may be generated for a given node. If the limit is reached, the
172 // trace is flushed and a generic version of the code for a node is emitted.
173 // This is subsequently used for that node. The code emitted for non-generic
174 // trace is not recorded in the node and so it cannot currently be reused in
175 // the event that code generation is requested for an identical trace.
176
177 namespace {
178
MaxCodeUnit(const bool one_byte)179 constexpr uc32 MaxCodeUnit(const bool one_byte) {
180 STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
181 std::numeric_limits<uint16_t>::max());
182 STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
183 std::numeric_limits<uint16_t>::max());
184 return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
185 }
186
CharMask(const bool one_byte)187 constexpr uint32_t CharMask(const bool one_byte) {
188 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
189 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
190 return MaxCodeUnit(one_byte);
191 }
192
193 } // namespace
194
AppendToText(RegExpText * text,Zone * zone)195 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
196
AppendToText(RegExpText * text,Zone * zone)197 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
198 text->AddElement(TextElement::Atom(this), zone);
199 }
200
AppendToText(RegExpText * text,Zone * zone)201 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
202 text->AddElement(TextElement::CharClass(this), zone);
203 }
204
AppendToText(RegExpText * text,Zone * zone)205 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
206 for (int i = 0; i < elements()->length(); i++)
207 text->AddElement(elements()->at(i), zone);
208 }
209
Atom(RegExpAtom * atom)210 TextElement TextElement::Atom(RegExpAtom* atom) {
211 return TextElement(ATOM, atom);
212 }
213
CharClass(RegExpCharacterClass * char_class)214 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
215 return TextElement(CHAR_CLASS, char_class);
216 }
217
length() const218 int TextElement::length() const {
219 switch (text_type()) {
220 case ATOM:
221 return atom()->length();
222
223 case CHAR_CLASS:
224 return 1;
225 }
226 UNREACHABLE();
227 }
228
229 class RecursionCheck {
230 public:
RecursionCheck(RegExpCompiler * compiler)231 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
232 compiler->IncrementRecursionDepth();
233 }
~RecursionCheck()234 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
235
236 private:
237 RegExpCompiler* compiler_;
238 };
239
240 // Attempts to compile the regexp using an Irregexp code generator. Returns
241 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,bool one_byte)242 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
243 bool one_byte)
244 : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
245 unicode_lookaround_stack_register_(kNoRegister),
246 unicode_lookaround_position_register_(kNoRegister),
247 work_list_(nullptr),
248 recursion_depth_(0),
249 one_byte_(one_byte),
250 reg_exp_too_big_(false),
251 limiting_recursion_(false),
252 optimize_(FLAG_regexp_optimization),
253 read_backward_(false),
254 current_expansion_factor_(1),
255 frequency_collator_(),
256 isolate_(isolate),
257 zone_(zone) {
258 accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
259 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
260 }
261
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)262 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
263 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
264 int capture_count, Handle<String> pattern) {
265 macro_assembler_ = macro_assembler;
266
267 ZoneVector<RegExpNode*> work_list(zone());
268 work_list_ = &work_list;
269 Label fail;
270 macro_assembler_->PushBacktrack(&fail);
271 Trace new_trace;
272 start->Emit(this, &new_trace);
273 macro_assembler_->BindJumpTarget(&fail);
274 macro_assembler_->Fail();
275 while (!work_list.empty()) {
276 RegExpNode* node = work_list.back();
277 work_list.pop_back();
278 node->set_on_work_list(false);
279 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
280 }
281 if (reg_exp_too_big_) {
282 macro_assembler_->AbortedCodeGeneration();
283 return CompilationResult::RegExpTooBig();
284 }
285
286 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
287 isolate->IncreaseTotalRegexpCodeGenerated(code);
288 work_list_ = nullptr;
289
290 return {code, next_register_};
291 }
292
Mentions(int that)293 bool Trace::DeferredAction::Mentions(int that) {
294 if (action_type() == ActionNode::CLEAR_CAPTURES) {
295 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
296 return range.Contains(that);
297 } else {
298 return reg() == that;
299 }
300 }
301
mentions_reg(int reg)302 bool Trace::mentions_reg(int reg) {
303 for (DeferredAction* action = actions_; action != nullptr;
304 action = action->next()) {
305 if (action->Mentions(reg)) return true;
306 }
307 return false;
308 }
309
GetStoredPosition(int reg,int * cp_offset)310 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
311 DCHECK_EQ(0, *cp_offset);
312 for (DeferredAction* action = actions_; action != nullptr;
313 action = action->next()) {
314 if (action->Mentions(reg)) {
315 if (action->action_type() == ActionNode::STORE_POSITION) {
316 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
317 return true;
318 } else {
319 return false;
320 }
321 }
322 }
323 return false;
324 }
325
326 // A (dynamically-sized) set of unsigned integers that behaves especially well
327 // on small integers (< kFirstLimit). May do zone-allocation.
328 class DynamicBitSet : public ZoneObject {
329 public:
Get(unsigned value) const330 V8_EXPORT_PRIVATE bool Get(unsigned value) const {
331 if (value < kFirstLimit) {
332 return (first_ & (1 << value)) != 0;
333 } else if (remaining_ == nullptr) {
334 return false;
335 } else {
336 return remaining_->Contains(value);
337 }
338 }
339
340 // Destructively set a value in this set.
Set(unsigned value,Zone * zone)341 void Set(unsigned value, Zone* zone) {
342 if (value < kFirstLimit) {
343 first_ |= (1 << value);
344 } else {
345 if (remaining_ == nullptr)
346 remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
347 if (remaining_->is_empty() || !remaining_->Contains(value))
348 remaining_->Add(value, zone);
349 }
350 }
351
352 private:
353 static constexpr unsigned kFirstLimit = 32;
354
355 uint32_t first_ = 0;
356 ZoneList<unsigned>* remaining_ = nullptr;
357 };
358
FindAffectedRegisters(DynamicBitSet * affected_registers,Zone * zone)359 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
360 Zone* zone) {
361 int max_register = RegExpCompiler::kNoRegister;
362 for (DeferredAction* action = actions_; action != nullptr;
363 action = action->next()) {
364 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
365 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
366 for (int i = range.from(); i <= range.to(); i++)
367 affected_registers->Set(i, zone);
368 if (range.to() > max_register) max_register = range.to();
369 } else {
370 affected_registers->Set(action->reg(), zone);
371 if (action->reg() > max_register) max_register = action->reg();
372 }
373 }
374 return max_register;
375 }
376
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & registers_to_pop,const DynamicBitSet & registers_to_clear)377 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
378 int max_register,
379 const DynamicBitSet& registers_to_pop,
380 const DynamicBitSet& registers_to_clear) {
381 for (int reg = max_register; reg >= 0; reg--) {
382 if (registers_to_pop.Get(reg)) {
383 assembler->PopRegister(reg);
384 } else if (registers_to_clear.Get(reg)) {
385 int clear_to = reg;
386 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
387 reg--;
388 }
389 assembler->ClearRegisters(reg, clear_to);
390 }
391 }
392 }
393
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & affected_registers,DynamicBitSet * registers_to_pop,DynamicBitSet * registers_to_clear,Zone * zone)394 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
395 int max_register,
396 const DynamicBitSet& affected_registers,
397 DynamicBitSet* registers_to_pop,
398 DynamicBitSet* registers_to_clear,
399 Zone* zone) {
400 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
401 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
402
403 // Count pushes performed to force a stack limit check occasionally.
404 int pushes = 0;
405
406 for (int reg = 0; reg <= max_register; reg++) {
407 if (!affected_registers.Get(reg)) continue;
408
409 // The chronologically first deferred action in the trace
410 // is used to infer the action needed to restore a register
411 // to its previous state (or not, if it's safe to ignore it).
412 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
413 DeferredActionUndoType undo_action = IGNORE;
414
415 int value = 0;
416 bool absolute = false;
417 bool clear = false;
418 static const int kNoStore = kMinInt;
419 int store_position = kNoStore;
420 // This is a little tricky because we are scanning the actions in reverse
421 // historical order (newest first).
422 for (DeferredAction* action = actions_; action != nullptr;
423 action = action->next()) {
424 if (action->Mentions(reg)) {
425 switch (action->action_type()) {
426 case ActionNode::SET_REGISTER_FOR_LOOP: {
427 Trace::DeferredSetRegisterForLoop* psr =
428 static_cast<Trace::DeferredSetRegisterForLoop*>(action);
429 if (!absolute) {
430 value += psr->value();
431 absolute = true;
432 }
433 // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
434 // counters. They can have a significant previous value if they
435 // occur in a loop. TODO(lrn): Propagate this information, so
436 // we can set undo_action to IGNORE if we know there is no value to
437 // restore.
438 undo_action = RESTORE;
439 DCHECK_EQ(store_position, kNoStore);
440 DCHECK(!clear);
441 break;
442 }
443 case ActionNode::INCREMENT_REGISTER:
444 if (!absolute) {
445 value++;
446 }
447 DCHECK_EQ(store_position, kNoStore);
448 DCHECK(!clear);
449 undo_action = RESTORE;
450 break;
451 case ActionNode::STORE_POSITION: {
452 Trace::DeferredCapture* pc =
453 static_cast<Trace::DeferredCapture*>(action);
454 if (!clear && store_position == kNoStore) {
455 store_position = pc->cp_offset();
456 }
457
458 // For captures we know that stores and clears alternate.
459 // Other register, are never cleared, and if the occur
460 // inside a loop, they might be assigned more than once.
461 if (reg <= 1) {
462 // Registers zero and one, aka "capture zero", is
463 // always set correctly if we succeed. There is no
464 // need to undo a setting on backtrack, because we
465 // will set it again or fail.
466 undo_action = IGNORE;
467 } else {
468 undo_action = pc->is_capture() ? CLEAR : RESTORE;
469 }
470 DCHECK(!absolute);
471 DCHECK_EQ(value, 0);
472 break;
473 }
474 case ActionNode::CLEAR_CAPTURES: {
475 // Since we're scanning in reverse order, if we've already
476 // set the position we have to ignore historically earlier
477 // clearing operations.
478 if (store_position == kNoStore) {
479 clear = true;
480 }
481 undo_action = RESTORE;
482 DCHECK(!absolute);
483 DCHECK_EQ(value, 0);
484 break;
485 }
486 default:
487 UNREACHABLE();
488 break;
489 }
490 }
491 }
492 // Prepare for the undo-action (e.g., push if it's going to be popped).
493 if (undo_action == RESTORE) {
494 pushes++;
495 RegExpMacroAssembler::StackCheckFlag stack_check =
496 RegExpMacroAssembler::kNoStackLimitCheck;
497 if (pushes == push_limit) {
498 stack_check = RegExpMacroAssembler::kCheckStackLimit;
499 pushes = 0;
500 }
501
502 assembler->PushRegister(reg, stack_check);
503 registers_to_pop->Set(reg, zone);
504 } else if (undo_action == CLEAR) {
505 registers_to_clear->Set(reg, zone);
506 }
507 // Perform the chronologically last action (or accumulated increment)
508 // for the register.
509 if (store_position != kNoStore) {
510 assembler->WriteCurrentPositionToRegister(reg, store_position);
511 } else if (clear) {
512 assembler->ClearRegisters(reg, reg);
513 } else if (absolute) {
514 assembler->SetRegister(reg, value);
515 } else if (value != 0) {
516 assembler->AdvanceRegister(reg, value);
517 }
518 }
519 }
520
521 // This is called as we come into a loop choice node and some other tricky
522 // nodes. It normalizes the state of the code generator to ensure we can
523 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)524 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
525 RegExpMacroAssembler* assembler = compiler->macro_assembler();
526
527 DCHECK(!is_trivial());
528
529 if (actions_ == nullptr && backtrack() == nullptr) {
530 // Here we just have some deferred cp advances to fix and we are back to
531 // a normal situation. We may also have to forget some information gained
532 // through a quick check that was already performed.
533 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
534 // Create a new trivial state and generate the node with that.
535 Trace new_state;
536 successor->Emit(compiler, &new_state);
537 return;
538 }
539
540 // Generate deferred actions here along with code to undo them again.
541 DynamicBitSet affected_registers;
542
543 if (backtrack() != nullptr) {
544 // Here we have a concrete backtrack location. These are set up by choice
545 // nodes and so they indicate that we have a deferred save of the current
546 // position which we may need to emit here.
547 assembler->PushCurrentPosition();
548 }
549
550 int max_register =
551 FindAffectedRegisters(&affected_registers, compiler->zone());
552 DynamicBitSet registers_to_pop;
553 DynamicBitSet registers_to_clear;
554 PerformDeferredActions(assembler, max_register, affected_registers,
555 ®isters_to_pop, ®isters_to_clear,
556 compiler->zone());
557 if (cp_offset_ != 0) {
558 assembler->AdvanceCurrentPosition(cp_offset_);
559 }
560
561 // Create a new trivial state and generate the node with that.
562 Label undo;
563 assembler->PushBacktrack(&undo);
564 if (successor->KeepRecursing(compiler)) {
565 Trace new_state;
566 successor->Emit(compiler, &new_state);
567 } else {
568 compiler->AddWork(successor);
569 assembler->GoTo(successor->label());
570 }
571
572 // On backtrack we need to restore state.
573 assembler->BindJumpTarget(&undo);
574 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
575 registers_to_clear);
576 if (backtrack() == nullptr) {
577 assembler->Backtrack();
578 } else {
579 assembler->PopCurrentPosition();
580 assembler->GoTo(backtrack());
581 }
582 }
583
Emit(RegExpCompiler * compiler,Trace * trace)584 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
585 RegExpMacroAssembler* assembler = compiler->macro_assembler();
586
587 // Omit flushing the trace. We discard the entire stack frame anyway.
588
589 if (!label()->is_bound()) {
590 // We are completely independent of the trace, since we ignore it,
591 // so this code can be used as the generic version.
592 assembler->Bind(label());
593 }
594
595 // Throw away everything on the backtrack stack since the start
596 // of the negative submatch and restore the character position.
597 assembler->ReadCurrentPositionFromRegister(current_position_register_);
598 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
599 if (clear_capture_count_ > 0) {
600 // Clear any captures that might have been performed during the success
601 // of the body of the negative look-ahead.
602 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
603 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
604 }
605 // Now that we have unwound the stack we find at the top of the stack the
606 // backtrack that the BeginSubmatch node got.
607 assembler->Backtrack();
608 }
609
Emit(RegExpCompiler * compiler,Trace * trace)610 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
611 if (!trace->is_trivial()) {
612 trace->Flush(compiler, this);
613 return;
614 }
615 RegExpMacroAssembler* assembler = compiler->macro_assembler();
616 if (!label()->is_bound()) {
617 assembler->Bind(label());
618 }
619 switch (action_) {
620 case ACCEPT:
621 assembler->Succeed();
622 return;
623 case BACKTRACK:
624 assembler->GoTo(trace->backtrack());
625 return;
626 case NEGATIVE_SUBMATCH_SUCCESS:
627 // This case is handled in a different virtual method.
628 UNREACHABLE();
629 }
630 UNIMPLEMENTED();
631 }
632
AddGuard(Guard * guard,Zone * zone)633 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
634 if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
635 guards_->Add(guard, zone);
636 }
637
SetRegisterForLoop(int reg,int val,RegExpNode * on_success)638 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
639 RegExpNode* on_success) {
640 ActionNode* result =
641 on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
642 result->data_.u_store_register.reg = reg;
643 result->data_.u_store_register.value = val;
644 return result;
645 }
646
IncrementRegister(int reg,RegExpNode * on_success)647 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
648 ActionNode* result =
649 on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success);
650 result->data_.u_increment_register.reg = reg;
651 return result;
652 }
653
StorePosition(int reg,bool is_capture,RegExpNode * on_success)654 ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
655 RegExpNode* on_success) {
656 ActionNode* result =
657 on_success->zone()->New<ActionNode>(STORE_POSITION, on_success);
658 result->data_.u_position_register.reg = reg;
659 result->data_.u_position_register.is_capture = is_capture;
660 return result;
661 }
662
ClearCaptures(Interval range,RegExpNode * on_success)663 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
664 ActionNode* result =
665 on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success);
666 result->data_.u_clear_captures.range_from = range.from();
667 result->data_.u_clear_captures.range_to = range.to();
668 return result;
669 }
670
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)671 ActionNode* ActionNode::BeginSubmatch(int stack_reg, int position_reg,
672 RegExpNode* on_success) {
673 ActionNode* result =
674 on_success->zone()->New<ActionNode>(BEGIN_SUBMATCH, on_success);
675 result->data_.u_submatch.stack_pointer_register = stack_reg;
676 result->data_.u_submatch.current_position_register = position_reg;
677 return result;
678 }
679
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)680 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
681 int clear_register_count,
682 int clear_register_from,
683 RegExpNode* on_success) {
684 ActionNode* result = on_success->zone()->New<ActionNode>(
685 POSITIVE_SUBMATCH_SUCCESS, on_success);
686 result->data_.u_submatch.stack_pointer_register = stack_reg;
687 result->data_.u_submatch.current_position_register = position_reg;
688 result->data_.u_submatch.clear_register_count = clear_register_count;
689 result->data_.u_submatch.clear_register_from = clear_register_from;
690 return result;
691 }
692
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)693 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
694 int repetition_register,
695 int repetition_limit,
696 RegExpNode* on_success) {
697 ActionNode* result =
698 on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
699 result->data_.u_empty_match_check.start_register = start_register;
700 result->data_.u_empty_match_check.repetition_register = repetition_register;
701 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
702 return result;
703 }
704
705 #define DEFINE_ACCEPT(Type) \
706 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)707 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
708 #undef DEFINE_ACCEPT
709
710 // -------------------------------------------------------------------
711 // Emit code.
712
713 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
714 Guard* guard, Trace* trace) {
715 switch (guard->op()) {
716 case Guard::LT:
717 DCHECK(!trace->mentions_reg(guard->reg()));
718 macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
719 trace->backtrack());
720 break;
721 case Guard::GEQ:
722 DCHECK(!trace->mentions_reg(guard->reg()));
723 macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
724 trace->backtrack());
725 break;
726 }
727 }
728
729 namespace {
730
731 #ifdef DEBUG
ContainsOnlyUtf16CodeUnits(unibrow::uchar * chars,int length)732 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
733 STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
734 for (int i = 0; i < length; i++) {
735 if (chars[i] > String::kMaxUtf16CodeUnit) return false;
736 }
737 return true;
738 }
739 #endif // DEBUG
740
741 } // namespace
742
743 // Returns the number of characters in the equivalence class, omitting those
744 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters,int letter_length)745 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
746 bool one_byte_subject,
747 unibrow::uchar* letters,
748 int letter_length) {
749 #ifdef V8_INTL_SUPPORT
750 if (RegExpCaseFolding::IgnoreSet().contains(character)) {
751 letters[0] = character;
752 DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
753 return 1;
754 }
755 bool in_special_add_set =
756 RegExpCaseFolding::SpecialAddSet().contains(character);
757
758 icu::UnicodeSet set;
759 set.add(character);
760 set = set.closeOver(USET_CASE_INSENSITIVE);
761
762 UChar32 canon = 0;
763 if (in_special_add_set) {
764 canon = RegExpCaseFolding::Canonicalize(character);
765 }
766
767 int32_t range_count = set.getRangeCount();
768 int items = 0;
769 for (int32_t i = 0; i < range_count; i++) {
770 UChar32 start = set.getRangeStart(i);
771 UChar32 end = set.getRangeEnd(i);
772 CHECK(end - start + items <= letter_length);
773 for (UChar32 cu = start; cu <= end; cu++) {
774 if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
775 if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
776 continue;
777 }
778 letters[items++] = static_cast<unibrow::uchar>(cu);
779 }
780 }
781 DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
782 return items;
783 #else
784 int length =
785 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
786 // Unibrow returns 0 or 1 for characters where case independence is
787 // trivial.
788 if (length == 0) {
789 letters[0] = character;
790 length = 1;
791 }
792
793 if (one_byte_subject) {
794 int new_length = 0;
795 for (int i = 0; i < length; i++) {
796 if (letters[i] <= String::kMaxOneByteCharCode) {
797 letters[new_length++] = letters[i];
798 }
799 }
800 length = new_length;
801 }
802
803 DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
804 return length;
805 #endif // V8_INTL_SUPPORT
806 }
807
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)808 static inline bool EmitSimpleCharacter(Isolate* isolate,
809 RegExpCompiler* compiler, uc16 c,
810 Label* on_failure, int cp_offset,
811 bool check, bool preloaded) {
812 RegExpMacroAssembler* assembler = compiler->macro_assembler();
813 bool bound_checked = false;
814 if (!preloaded) {
815 assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
816 bound_checked = true;
817 }
818 assembler->CheckNotCharacter(c, on_failure);
819 return bound_checked;
820 }
821
822 // Only emits non-letters (things that don't have case). Only used for case
823 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)824 static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
825 uc16 c, Label* on_failure, int cp_offset,
826 bool check, bool preloaded) {
827 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
828 bool one_byte = compiler->one_byte();
829 unibrow::uchar chars[4];
830 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
831 if (length < 1) {
832 // This can't match. Must be an one-byte subject and a non-one-byte
833 // character. We do not need to do anything since the one-byte pass
834 // already handled this.
835 return false; // Bounds not checked.
836 }
837 bool checked = false;
838 // We handle the length > 1 case in a later pass.
839 if (length == 1) {
840 if (one_byte && c > String::kMaxOneByteCharCodeU) {
841 // Can't match - see above.
842 return false; // Bounds not checked.
843 }
844 if (!preloaded) {
845 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
846 checked = check;
847 }
848 macro_assembler->CheckNotCharacter(c, on_failure);
849 }
850 return checked;
851 }
852
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)853 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
854 bool one_byte, uc16 c1, uc16 c2,
855 Label* on_failure) {
856 const uint32_t char_mask = CharMask(one_byte);
857 uc16 exor = c1 ^ c2;
858 // Check whether exor has only one bit set.
859 if (((exor - 1) & exor) == 0) {
860 // If c1 and c2 differ only by one bit.
861 // Ecma262UnCanonicalize always gives the highest number last.
862 DCHECK(c2 > c1);
863 uc16 mask = char_mask ^ exor;
864 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
865 return true;
866 }
867 DCHECK(c2 > c1);
868 uc16 diff = c2 - c1;
869 if (((diff - 1) & diff) == 0 && c1 >= diff) {
870 // If the characters differ by 2^n but don't differ by one bit then
871 // subtract the difference from the found character, then do the or
872 // trick. We avoid the theoretical case where negative numbers are
873 // involved in order to simplify code generation.
874 uc16 mask = char_mask ^ diff;
875 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
876 on_failure);
877 return true;
878 }
879 return false;
880 }
881
882 // Only emits letters (things that have case). Only used for case independent
883 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)884 static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
885 uc16 c, Label* on_failure, int cp_offset,
886 bool check, bool preloaded) {
887 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
888 bool one_byte = compiler->one_byte();
889 unibrow::uchar chars[4];
890 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
891 if (length <= 1) return false;
892 // We may not need to check against the end of the input string
893 // if this character lies before a character that matched.
894 if (!preloaded) {
895 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
896 }
897 Label ok;
898 switch (length) {
899 case 2: {
900 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
901 chars[1], on_failure)) {
902 } else {
903 macro_assembler->CheckCharacter(chars[0], &ok);
904 macro_assembler->CheckNotCharacter(chars[1], on_failure);
905 macro_assembler->Bind(&ok);
906 }
907 break;
908 }
909 case 4:
910 macro_assembler->CheckCharacter(chars[3], &ok);
911 V8_FALLTHROUGH;
912 case 3:
913 macro_assembler->CheckCharacter(chars[0], &ok);
914 macro_assembler->CheckCharacter(chars[1], &ok);
915 macro_assembler->CheckNotCharacter(chars[2], on_failure);
916 macro_assembler->Bind(&ok);
917 break;
918 default:
919 UNREACHABLE();
920 }
921 return true;
922 }
923
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)924 static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
925 Label* fall_through, Label* above_or_equal,
926 Label* below) {
927 if (below != fall_through) {
928 masm->CheckCharacterLT(border, below);
929 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
930 } else {
931 masm->CheckCharacterGT(border - 1, above_or_equal);
932 }
933 }
934
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)935 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
936 int last, Label* fall_through,
937 Label* in_range, Label* out_of_range) {
938 if (in_range == fall_through) {
939 if (first == last) {
940 masm->CheckNotCharacter(first, out_of_range);
941 } else {
942 masm->CheckCharacterNotInRange(first, last, out_of_range);
943 }
944 } else {
945 if (first == last) {
946 masm->CheckCharacter(first, in_range);
947 } else {
948 masm->CheckCharacterInRange(first, last, in_range);
949 }
950 if (out_of_range != fall_through) masm->GoTo(out_of_range);
951 }
952 }
953
954 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
955 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)956 static void EmitUseLookupTable(RegExpMacroAssembler* masm,
957 ZoneList<int>* ranges, int start_index,
958 int end_index, int min_char, Label* fall_through,
959 Label* even_label, Label* odd_label) {
960 static const int kSize = RegExpMacroAssembler::kTableSize;
961 static const int kMask = RegExpMacroAssembler::kTableMask;
962
963 int base = (min_char & ~kMask);
964 USE(base);
965
966 // Assert that everything is on one kTableSize page.
967 for (int i = start_index; i <= end_index; i++) {
968 DCHECK_EQ(ranges->at(i) & ~kMask, base);
969 }
970 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
971
972 char templ[kSize];
973 Label* on_bit_set;
974 Label* on_bit_clear;
975 int bit;
976 if (even_label == fall_through) {
977 on_bit_set = odd_label;
978 on_bit_clear = even_label;
979 bit = 1;
980 } else {
981 on_bit_set = even_label;
982 on_bit_clear = odd_label;
983 bit = 0;
984 }
985 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
986 templ[i] = bit;
987 }
988 int j = 0;
989 bit ^= 1;
990 for (int i = start_index; i < end_index; i++) {
991 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
992 templ[j] = bit;
993 }
994 bit ^= 1;
995 }
996 for (int i = j; i < kSize; i++) {
997 templ[i] = bit;
998 }
999 Factory* factory = masm->isolate()->factory();
1000 // TODO(erikcorry): Cache these.
1001 Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1002 for (int i = 0; i < kSize; i++) {
1003 ba->set(i, templ[i]);
1004 }
1005 masm->CheckBitInTable(ba, on_bit_set);
1006 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1007 }
1008
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1009 static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1010 int start_index, int end_index, int cut_index,
1011 Label* even_label, Label* odd_label) {
1012 bool odd = (((cut_index - start_index) & 1) == 1);
1013 Label* in_range_label = odd ? odd_label : even_label;
1014 Label dummy;
1015 EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
1016 ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
1017 &dummy);
1018 DCHECK(!dummy.is_linked());
1019 // Cut out the single range by rewriting the array. This creates a new
1020 // range that is a merger of the two ranges on either side of the one we
1021 // are cutting out. The oddity of the labels is preserved.
1022 for (int j = cut_index; j > start_index; j--) {
1023 ranges->at(j) = ranges->at(j - 1);
1024 }
1025 for (int j = cut_index + 1; j < end_index; j++) {
1026 ranges->at(j) = ranges->at(j + 1);
1027 }
1028 }
1029
1030 // Unicode case. Split the search space into kSize spaces that are handled
1031 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1032 static void SplitSearchSpace(ZoneList<int>* ranges, int start_index,
1033 int end_index, int* new_start_index,
1034 int* new_end_index, int* border) {
1035 static const int kSize = RegExpMacroAssembler::kTableSize;
1036 static const int kMask = RegExpMacroAssembler::kTableMask;
1037
1038 int first = ranges->at(start_index);
1039 int last = ranges->at(end_index) - 1;
1040
1041 *new_start_index = start_index;
1042 *border = (ranges->at(start_index) & ~kMask) + kSize;
1043 while (*new_start_index < end_index) {
1044 if (ranges->at(*new_start_index) > *border) break;
1045 (*new_start_index)++;
1046 }
1047 // new_start_index is the index of the first edge that is beyond the
1048 // current kSize space.
1049
1050 // For very large search spaces we do a binary chop search of the non-Latin1
1051 // space instead of just going to the end of the current kSize space. The
1052 // heuristics are complicated a little by the fact that any 128-character
1053 // encoding space can be quickly tested with a table lookup, so we don't
1054 // wish to do binary chop search at a smaller granularity than that. A
1055 // 128-character space can take up a lot of space in the ranges array if,
1056 // for example, we only want to match every second character (eg. the lower
1057 // case characters on some Unicode pages).
1058 int binary_chop_index = (end_index + start_index) / 2;
1059 // The first test ensures that we get to the code that handles the Latin1
1060 // range with a single not-taken branch, speeding up this important
1061 // character range (even non-Latin1 charset-based text has spaces and
1062 // punctuation).
1063 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1064 end_index - start_index > (*new_start_index - start_index) * 2 &&
1065 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1066 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1067 int scan_forward_for_section_border = binary_chop_index;
1068 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1069
1070 while (scan_forward_for_section_border < end_index) {
1071 if (ranges->at(scan_forward_for_section_border) > new_border) {
1072 *new_start_index = scan_forward_for_section_border;
1073 *border = new_border;
1074 break;
1075 }
1076 scan_forward_for_section_border++;
1077 }
1078 }
1079
1080 DCHECK(*new_start_index > start_index);
1081 *new_end_index = *new_start_index - 1;
1082 if (ranges->at(*new_end_index) == *border) {
1083 (*new_end_index)--;
1084 }
1085 if (*border >= ranges->at(end_index)) {
1086 *border = ranges->at(end_index);
1087 *new_start_index = end_index; // Won't be used.
1088 *new_end_index = end_index - 1;
1089 }
1090 }
1091
1092 // Gets a series of segment boundaries representing a character class. If the
1093 // character is in the range between an even and an odd boundary (counting from
1094 // start_index) then go to even_label, otherwise go to odd_label. We already
1095 // know that the character is in the range of min_char to max_char inclusive.
1096 // Either label can be nullptr indicating backtracking. Either label can also
1097 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1098 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1099 int start_index, int end_index, uc32 min_char,
1100 uc32 max_char, Label* fall_through,
1101 Label* even_label, Label* odd_label) {
1102 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1103 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1104
1105 int first = ranges->at(start_index);
1106 int last = ranges->at(end_index) - 1;
1107
1108 DCHECK_LT(min_char, first);
1109
1110 // Just need to test if the character is before or on-or-after
1111 // a particular character.
1112 if (start_index == end_index) {
1113 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1114 return;
1115 }
1116
1117 // Another almost trivial case: There is one interval in the middle that is
1118 // different from the end intervals.
1119 if (start_index + 1 == end_index) {
1120 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1121 odd_label);
1122 return;
1123 }
1124
1125 // It's not worth using table lookup if there are very few intervals in the
1126 // character class.
1127 if (end_index - start_index <= 6) {
1128 // It is faster to test for individual characters, so we look for those
1129 // first, then try arbitrary ranges in the second round.
1130 static int kNoCutIndex = -1;
1131 int cut = kNoCutIndex;
1132 for (int i = start_index; i < end_index; i++) {
1133 if (ranges->at(i) == ranges->at(i + 1) - 1) {
1134 cut = i;
1135 break;
1136 }
1137 }
1138 if (cut == kNoCutIndex) cut = start_index;
1139 CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1140 odd_label);
1141 DCHECK_GE(end_index - start_index, 2);
1142 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1143 max_char, fall_through, even_label, odd_label);
1144 return;
1145 }
1146
1147 // If there are a lot of intervals in the regexp, then we will use tables to
1148 // determine whether the character is inside or outside the character class.
1149 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1150
1151 if ((max_char >> kBits) == (min_char >> kBits)) {
1152 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1153 fall_through, even_label, odd_label);
1154 return;
1155 }
1156
1157 if ((min_char >> kBits) != static_cast<uc32>(first >> kBits)) {
1158 masm->CheckCharacterLT(first, odd_label);
1159 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1160 fall_through, odd_label, even_label);
1161 return;
1162 }
1163
1164 int new_start_index = 0;
1165 int new_end_index = 0;
1166 int border = 0;
1167
1168 SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1169 &new_end_index, &border);
1170
1171 Label handle_rest;
1172 Label* above = &handle_rest;
1173 if (border == last + 1) {
1174 // We didn't find any section that started after the limit, so everything
1175 // above the border is one of the terminal labels.
1176 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1177 DCHECK(new_end_index == end_index - 1);
1178 }
1179
1180 DCHECK_LE(start_index, new_end_index);
1181 DCHECK_LE(new_start_index, end_index);
1182 DCHECK_LT(start_index, new_start_index);
1183 DCHECK_LT(new_end_index, end_index);
1184 DCHECK(new_end_index + 1 == new_start_index ||
1185 (new_end_index + 2 == new_start_index &&
1186 border == ranges->at(new_end_index + 1)));
1187 DCHECK_LT(min_char, border - 1);
1188 DCHECK_LT(border, max_char);
1189 DCHECK_LT(ranges->at(new_end_index), border);
1190 DCHECK(border < ranges->at(new_start_index) ||
1191 (border == ranges->at(new_start_index) &&
1192 new_start_index == end_index && new_end_index == end_index - 1 &&
1193 border == last + 1));
1194 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1195
1196 masm->CheckCharacterGT(border - 1, above);
1197 Label dummy;
1198 GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1199 border - 1, &dummy, even_label, odd_label);
1200 if (handle_rest.is_linked()) {
1201 masm->Bind(&handle_rest);
1202 bool flip = (new_start_index & 1) != (start_index & 1);
1203 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1204 &dummy, flip ? odd_label : even_label,
1205 flip ? even_label : odd_label);
1206 }
1207 }
1208
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)1209 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1210 RegExpCharacterClass* cc, bool one_byte,
1211 Label* on_failure, int cp_offset, bool check_offset,
1212 bool preloaded, Zone* zone) {
1213 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
1214 CharacterRange::Canonicalize(ranges);
1215
1216 const uc32 max_char = MaxCodeUnit(one_byte);
1217 int range_count = ranges->length();
1218
1219 int last_valid_range = range_count - 1;
1220 while (last_valid_range >= 0) {
1221 CharacterRange& range = ranges->at(last_valid_range);
1222 if (range.from() <= max_char) break;
1223 last_valid_range--;
1224 }
1225
1226 if (last_valid_range < 0) {
1227 if (!cc->is_negated()) {
1228 macro_assembler->GoTo(on_failure);
1229 }
1230 if (check_offset) {
1231 macro_assembler->CheckPosition(cp_offset, on_failure);
1232 }
1233 return;
1234 }
1235
1236 if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) {
1237 if (cc->is_negated()) {
1238 macro_assembler->GoTo(on_failure);
1239 } else {
1240 // This is a common case hit by non-anchored expressions.
1241 if (check_offset) {
1242 macro_assembler->CheckPosition(cp_offset, on_failure);
1243 }
1244 }
1245 return;
1246 }
1247
1248 if (!preloaded) {
1249 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1250 }
1251
1252 if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass(
1253 cc->standard_type(), on_failure)) {
1254 return;
1255 }
1256
1257 // A new list with ascending entries. Each entry is a code unit
1258 // where there is a boundary between code units that are part of
1259 // the class and code units that are not. Normally we insert an
1260 // entry at zero which goes to the failure label, but if there
1261 // was already one there we fall through for success on that entry.
1262 // Subsequent entries have alternating meaning (success/failure).
1263 // TODO(jgruber,v8:10568): Change `range_boundaries` to a ZoneList<uc32>.
1264 ZoneList<int>* range_boundaries =
1265 zone->New<ZoneList<int>>(last_valid_range, zone);
1266
1267 bool zeroth_entry_is_failure = !cc->is_negated();
1268
1269 for (int i = 0; i <= last_valid_range; i++) {
1270 CharacterRange& range = ranges->at(i);
1271 if (range.from() == 0) {
1272 DCHECK_EQ(i, 0);
1273 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1274 } else {
1275 range_boundaries->Add(range.from(), zone);
1276 }
1277 range_boundaries->Add(range.to() + 1, zone);
1278 }
1279 int end_index = range_boundaries->length() - 1;
1280 if (static_cast<uc32>(range_boundaries->at(end_index)) > max_char) {
1281 end_index--;
1282 }
1283
1284 Label fall_through;
1285 GenerateBranches(macro_assembler, range_boundaries,
1286 0, // start_index.
1287 end_index,
1288 0, // min_char.
1289 max_char, &fall_through,
1290 zeroth_entry_is_failure ? &fall_through : on_failure,
1291 zeroth_entry_is_failure ? on_failure : &fall_through);
1292 macro_assembler->Bind(&fall_through);
1293 }
1294
1295 RegExpNode::~RegExpNode() = default;
1296
LimitVersions(RegExpCompiler * compiler,Trace * trace)1297 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1298 Trace* trace) {
1299 // If we are generating a greedy loop then don't stop and don't reuse code.
1300 if (trace->stop_node() != nullptr) {
1301 return CONTINUE;
1302 }
1303
1304 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1305 if (trace->is_trivial()) {
1306 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
1307 // If a generic version is already scheduled to be generated or we have
1308 // recursed too deeply then just generate a jump to that code.
1309 macro_assembler->GoTo(&label_);
1310 // This will queue it up for generation of a generic version if it hasn't
1311 // already been queued.
1312 compiler->AddWork(this);
1313 return DONE;
1314 }
1315 // Generate generic version of the node and bind the label for later use.
1316 macro_assembler->Bind(&label_);
1317 return CONTINUE;
1318 }
1319
1320 // We are being asked to make a non-generic version. Keep track of how many
1321 // non-generic versions we generate so as not to overdo it.
1322 trace_count_++;
1323 if (KeepRecursing(compiler) && compiler->optimize() &&
1324 trace_count_ < kMaxCopiesCodeGenerated) {
1325 return CONTINUE;
1326 }
1327
1328 // If we get here code has been generated for this node too many times or
1329 // recursion is too deep. Time to switch to a generic version. The code for
1330 // generic versions above can handle deep recursion properly.
1331 bool was_limiting = compiler->limiting_recursion();
1332 compiler->set_limiting_recursion(true);
1333 trace->Flush(compiler, this);
1334 compiler->set_limiting_recursion(was_limiting);
1335 return DONE;
1336 }
1337
KeepRecursing(RegExpCompiler * compiler)1338 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
1339 return !compiler->limiting_recursion() &&
1340 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
1341 }
1342
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1343 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1344 BoyerMooreLookahead* bm, bool not_at_start) {
1345 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) {
1346 // Anything may follow a positive submatch success, thus we need to accept
1347 // all characters from this position onwards.
1348 bm->SetRest(offset);
1349 } else {
1350 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1351 }
1352 SaveBMInfo(bm, not_at_start, offset);
1353 }
1354
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1355 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1356 RegExpCompiler* compiler, int filled_in,
1357 bool not_at_start) {
1358 if (action_type_ == SET_REGISTER_FOR_LOOP) {
1359 on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
1360 filled_in, not_at_start);
1361 } else {
1362 on_success()->GetQuickCheckDetails(details, compiler, filled_in,
1363 not_at_start);
1364 }
1365 }
1366
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1367 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1368 BoyerMooreLookahead* bm, bool not_at_start) {
1369 // Match the behaviour of EatsAtLeast on this node.
1370 if (assertion_type() == AT_START && not_at_start) return;
1371 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1372 SaveBMInfo(bm, not_at_start, offset);
1373 }
1374
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1375 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1376 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
1377 bool not_at_start) {
1378 RegExpNode* node = continue_node();
1379 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1380 }
1381
1382 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1383 static inline uint32_t SmearBitsRight(uint32_t v) {
1384 v |= v >> 1;
1385 v |= v >> 2;
1386 v |= v >> 4;
1387 v |= v >> 8;
1388 v |= v >> 16;
1389 return v;
1390 }
1391
Rationalize(bool asc)1392 bool QuickCheckDetails::Rationalize(bool asc) {
1393 bool found_useful_op = false;
1394 const uint32_t char_mask = CharMask(asc);
1395 mask_ = 0;
1396 value_ = 0;
1397 int char_shift = 0;
1398 for (int i = 0; i < characters_; i++) {
1399 Position* pos = &positions_[i];
1400 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
1401 found_useful_op = true;
1402 }
1403 mask_ |= (pos->mask & char_mask) << char_shift;
1404 value_ |= (pos->value & char_mask) << char_shift;
1405 char_shift += asc ? 8 : 16;
1406 }
1407 return found_useful_op;
1408 }
1409
EatsAtLeast(bool not_at_start)1410 int RegExpNode::EatsAtLeast(bool not_at_start) {
1411 return not_at_start ? eats_at_least_.eats_at_least_from_not_start
1412 : eats_at_least_.eats_at_least_from_possibly_start;
1413 }
1414
EatsAtLeastFromLoopEntry()1415 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
1416 // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
1417 // implies that the following node must be a LoopChoiceNode. If we need to
1418 // set registers to constant values for other reasons, we could introduce a
1419 // new action type SET_REGISTER that doesn't imply anything about its
1420 // successor.
1421 UNREACHABLE();
1422 }
1423
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1424 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
1425 RegExpCompiler* compiler,
1426 int characters_filled_in,
1427 bool not_at_start) {
1428 // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
1429 UNREACHABLE();
1430 }
1431
EatsAtLeastFromLoopEntry()1432 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
1433 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
1434
1435 if (read_backward()) {
1436 // Can't do anything special for a backward loop, so return the basic values
1437 // that we got during analysis.
1438 return *eats_at_least_info();
1439 }
1440
1441 // Figure out how much the loop body itself eats, not including anything in
1442 // the continuation case. In general, the nodes in the loop body should report
1443 // that they eat at least the number eaten by the continuation node, since any
1444 // successful match in the loop body must also include the continuation node.
1445 // However, in some cases involving positive lookaround, the loop body under-
1446 // reports its appetite, so use saturated math here to avoid negative numbers.
1447 uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
1448 loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
1449 uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
1450 loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
1451
1452 // Limit the number of loop iterations to avoid overflow in subsequent steps.
1453 int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
1454
1455 EatsAtLeastInfo result;
1456 result.eats_at_least_from_not_start =
1457 base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
1458 continue_node_->EatsAtLeast(true));
1459 if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
1460 // First loop iteration eats at least one, so all subsequent iterations
1461 // and the after-loop chunk are guaranteed to not be at the start.
1462 result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
1463 loop_body_from_possibly_start +
1464 (loop_iterations - 1) * loop_body_from_not_start +
1465 continue_node_->EatsAtLeast(true));
1466 } else {
1467 // Loop body might eat nothing, so only continue node contributes.
1468 result.eats_at_least_from_possibly_start =
1469 continue_node_->EatsAtLeast(false);
1470 }
1471 return result;
1472 }
1473
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure,ChoiceNode * predecessor)1474 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1475 Trace* bounds_check_trace, Trace* trace,
1476 bool preload_has_checked_bounds,
1477 Label* on_possible_success,
1478 QuickCheckDetails* details,
1479 bool fall_through_on_failure,
1480 ChoiceNode* predecessor) {
1481 DCHECK_NOT_NULL(predecessor);
1482 if (details->characters() == 0) return false;
1483 GetQuickCheckDetails(details, compiler, 0,
1484 trace->at_start() == Trace::FALSE_VALUE);
1485 if (details->cannot_match()) return false;
1486 if (!details->Rationalize(compiler->one_byte())) return false;
1487 DCHECK(details->characters() == 1 ||
1488 compiler->macro_assembler()->CanReadUnaligned());
1489 uint32_t mask = details->mask();
1490 uint32_t value = details->value();
1491
1492 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1493
1494 if (trace->characters_preloaded() != details->characters()) {
1495 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
1496 // The bounds check is performed using the minimum number of characters
1497 // any choice would eat, so if the bounds check fails, then none of the
1498 // choices can succeed, so we can just immediately backtrack, rather
1499 // than go to the next choice. The number of characters preloaded may be
1500 // less than the number used for the bounds check.
1501 int eats_at_least = predecessor->EatsAtLeast(
1502 bounds_check_trace->at_start() == Trace::FALSE_VALUE);
1503 DCHECK_GE(eats_at_least, details->characters());
1504 assembler->LoadCurrentCharacter(
1505 trace->cp_offset(), bounds_check_trace->backtrack(),
1506 !preload_has_checked_bounds, details->characters(), eats_at_least);
1507 }
1508
1509 bool need_mask = true;
1510
1511 if (details->characters() == 1) {
1512 // If number of characters preloaded is 1 then we used a byte or 16 bit
1513 // load so the value is already masked down.
1514 const uint32_t char_mask = CharMask(compiler->one_byte());
1515 if ((mask & char_mask) == char_mask) need_mask = false;
1516 mask &= char_mask;
1517 } else {
1518 // For 2-character preloads in one-byte mode or 1-character preloads in
1519 // two-byte mode we also use a 16 bit load with zero extend.
1520 static const uint32_t kTwoByteMask = 0xFFFF;
1521 static const uint32_t kFourByteMask = 0xFFFFFFFF;
1522 if (details->characters() == 2 && compiler->one_byte()) {
1523 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1524 } else if (details->characters() == 1 && !compiler->one_byte()) {
1525 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1526 } else {
1527 if (mask == kFourByteMask) need_mask = false;
1528 }
1529 }
1530
1531 if (fall_through_on_failure) {
1532 if (need_mask) {
1533 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1534 } else {
1535 assembler->CheckCharacter(value, on_possible_success);
1536 }
1537 } else {
1538 if (need_mask) {
1539 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1540 } else {
1541 assembler->CheckNotCharacter(value, trace->backtrack());
1542 }
1543 }
1544 return true;
1545 }
1546
1547 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1548 // super-class in the .h file).
1549 //
1550 // We iterate along the text object, building up for each character a
1551 // mask and value that can be used to test for a quick failure to match.
1552 // The masks and values for the positions will be combined into a single
1553 // machine word for the current character width in order to be used in
1554 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1555 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1556 RegExpCompiler* compiler,
1557 int characters_filled_in,
1558 bool not_at_start) {
1559 // Do not collect any quick check details if the text node reads backward,
1560 // since it reads in the opposite direction than we use for quick checks.
1561 if (read_backward()) return;
1562 Isolate* isolate = compiler->macro_assembler()->isolate();
1563 DCHECK(characters_filled_in < details->characters());
1564 int characters = details->characters();
1565 const uint32_t char_mask = CharMask(compiler->one_byte());
1566 for (int k = 0; k < elements()->length(); k++) {
1567 TextElement elm = elements()->at(k);
1568 if (elm.text_type() == TextElement::ATOM) {
1569 Vector<const uc16> quarks = elm.atom()->data();
1570 for (int i = 0; i < characters && i < quarks.length(); i++) {
1571 QuickCheckDetails::Position* pos =
1572 details->positions(characters_filled_in);
1573 uc16 c = quarks[i];
1574 if (elm.atom()->ignore_case()) {
1575 unibrow::uchar chars[4];
1576 int length = GetCaseIndependentLetters(
1577 isolate, c, compiler->one_byte(), chars, 4);
1578 if (length == 0) {
1579 // This can happen because all case variants are non-Latin1, but we
1580 // know the input is Latin1.
1581 details->set_cannot_match();
1582 pos->determines_perfectly = false;
1583 return;
1584 }
1585 if (length == 1) {
1586 // This letter has no case equivalents, so it's nice and simple
1587 // and the mask-compare will determine definitely whether we have
1588 // a match at this character position.
1589 pos->mask = char_mask;
1590 pos->value = chars[0];
1591 pos->determines_perfectly = true;
1592 } else {
1593 uint32_t common_bits = char_mask;
1594 uint32_t bits = chars[0];
1595 for (int j = 1; j < length; j++) {
1596 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1597 common_bits ^= differing_bits;
1598 bits &= common_bits;
1599 }
1600 // If length is 2 and common bits has only one zero in it then
1601 // our mask and compare instruction will determine definitely
1602 // whether we have a match at this character position. Otherwise
1603 // it can only be an approximate check.
1604 uint32_t one_zero = (common_bits | ~char_mask);
1605 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1606 pos->determines_perfectly = true;
1607 }
1608 pos->mask = common_bits;
1609 pos->value = bits;
1610 }
1611 } else {
1612 // Don't ignore case. Nice simple case where the mask-compare will
1613 // determine definitely whether we have a match at this character
1614 // position.
1615 if (c > char_mask) {
1616 details->set_cannot_match();
1617 pos->determines_perfectly = false;
1618 return;
1619 }
1620 pos->mask = char_mask;
1621 pos->value = c;
1622 pos->determines_perfectly = true;
1623 }
1624 characters_filled_in++;
1625 DCHECK(characters_filled_in <= details->characters());
1626 if (characters_filled_in == details->characters()) {
1627 return;
1628 }
1629 }
1630 } else {
1631 QuickCheckDetails::Position* pos =
1632 details->positions(characters_filled_in);
1633 RegExpCharacterClass* tree = elm.char_class();
1634 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
1635 DCHECK(!ranges->is_empty());
1636 if (tree->is_negated()) {
1637 // A quick check uses multi-character mask and compare. There is no
1638 // useful way to incorporate a negative char class into this scheme
1639 // so we just conservatively create a mask and value that will always
1640 // succeed.
1641 pos->mask = 0;
1642 pos->value = 0;
1643 } else {
1644 int first_range = 0;
1645 while (ranges->at(first_range).from() > char_mask) {
1646 first_range++;
1647 if (first_range == ranges->length()) {
1648 details->set_cannot_match();
1649 pos->determines_perfectly = false;
1650 return;
1651 }
1652 }
1653 CharacterRange range = ranges->at(first_range);
1654 const uc32 first_from = range.from();
1655 const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to();
1656 const uint32_t differing_bits = (first_from ^ first_to);
1657 // A mask and compare is only perfect if the differing bits form a
1658 // number like 00011111 with one single block of trailing 1s.
1659 if ((differing_bits & (differing_bits + 1)) == 0 &&
1660 first_from + differing_bits == first_to) {
1661 pos->determines_perfectly = true;
1662 }
1663 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1664 uint32_t bits = (first_from & common_bits);
1665 for (int i = first_range + 1; i < ranges->length(); i++) {
1666 CharacterRange range = ranges->at(i);
1667 const uc32 from = range.from();
1668 if (from > char_mask) continue;
1669 const uc32 to = (range.to() > char_mask) ? char_mask : range.to();
1670 // Here we are combining more ranges into the mask and compare
1671 // value. With each new range the mask becomes more sparse and
1672 // so the chances of a false positive rise. A character class
1673 // with multiple ranges is assumed never to be equivalent to a
1674 // mask and compare operation.
1675 pos->determines_perfectly = false;
1676 uint32_t new_common_bits = (from ^ to);
1677 new_common_bits = ~SmearBitsRight(new_common_bits);
1678 common_bits &= new_common_bits;
1679 bits &= new_common_bits;
1680 uint32_t differing_bits = (from & common_bits) ^ bits;
1681 common_bits ^= differing_bits;
1682 bits &= common_bits;
1683 }
1684 pos->mask = common_bits;
1685 pos->value = bits;
1686 }
1687 characters_filled_in++;
1688 DCHECK(characters_filled_in <= details->characters());
1689 if (characters_filled_in == details->characters()) return;
1690 }
1691 }
1692 DCHECK(characters_filled_in != details->characters());
1693 if (!details->cannot_match()) {
1694 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1695 true);
1696 }
1697 }
1698
Clear()1699 void QuickCheckDetails::Clear() {
1700 for (int i = 0; i < characters_; i++) {
1701 positions_[i].mask = 0;
1702 positions_[i].value = 0;
1703 positions_[i].determines_perfectly = false;
1704 }
1705 characters_ = 0;
1706 }
1707
Advance(int by,bool one_byte)1708 void QuickCheckDetails::Advance(int by, bool one_byte) {
1709 if (by >= characters_ || by < 0) {
1710 DCHECK_IMPLIES(by < 0, characters_ == 0);
1711 Clear();
1712 return;
1713 }
1714 DCHECK_LE(characters_ - by, 4);
1715 DCHECK_LE(characters_, 4);
1716 for (int i = 0; i < characters_ - by; i++) {
1717 positions_[i] = positions_[by + i];
1718 }
1719 for (int i = characters_ - by; i < characters_; i++) {
1720 positions_[i].mask = 0;
1721 positions_[i].value = 0;
1722 positions_[i].determines_perfectly = false;
1723 }
1724 characters_ -= by;
1725 // We could change mask_ and value_ here but we would never advance unless
1726 // they had already been used in a check and they won't be used again because
1727 // it would gain us nothing. So there's no point.
1728 }
1729
Merge(QuickCheckDetails * other,int from_index)1730 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1731 DCHECK(characters_ == other->characters_);
1732 if (other->cannot_match_) {
1733 return;
1734 }
1735 if (cannot_match_) {
1736 *this = *other;
1737 return;
1738 }
1739 for (int i = from_index; i < characters_; i++) {
1740 QuickCheckDetails::Position* pos = positions(i);
1741 QuickCheckDetails::Position* other_pos = other->positions(i);
1742 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1743 !other_pos->determines_perfectly) {
1744 // Our mask-compare operation will be approximate unless we have the
1745 // exact same operation on both sides of the alternation.
1746 pos->determines_perfectly = false;
1747 }
1748 pos->mask &= other_pos->mask;
1749 pos->value &= pos->mask;
1750 other_pos->value &= pos->mask;
1751 uint32_t differing_bits = (pos->value ^ other_pos->value);
1752 pos->mask &= ~differing_bits;
1753 pos->value &= pos->mask;
1754 }
1755 }
1756
1757 class VisitMarker {
1758 public:
VisitMarker(NodeInfo * info)1759 explicit VisitMarker(NodeInfo* info) : info_(info) {
1760 DCHECK(!info->visited);
1761 info->visited = true;
1762 }
~VisitMarker()1763 ~VisitMarker() { info_->visited = false; }
1764
1765 private:
1766 NodeInfo* info_;
1767 };
1768
1769 // Temporarily sets traversed_loop_initialization_node_.
1770 class LoopInitializationMarker {
1771 public:
LoopInitializationMarker(LoopChoiceNode * node)1772 explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
1773 DCHECK(!node_->traversed_loop_initialization_node_);
1774 node_->traversed_loop_initialization_node_ = true;
1775 }
~LoopInitializationMarker()1776 ~LoopInitializationMarker() {
1777 DCHECK(node_->traversed_loop_initialization_node_);
1778 node_->traversed_loop_initialization_node_ = false;
1779 }
1780 LoopInitializationMarker(const LoopInitializationMarker&) = delete;
1781 LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
1782
1783 private:
1784 LoopChoiceNode* node_;
1785 };
1786
1787 // Temporarily decrements min_loop_iterations_.
1788 class IterationDecrementer {
1789 public:
IterationDecrementer(LoopChoiceNode * node)1790 explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
1791 DCHECK_GT(node_->min_loop_iterations_, 0);
1792 --node_->min_loop_iterations_;
1793 }
~IterationDecrementer()1794 ~IterationDecrementer() { ++node_->min_loop_iterations_; }
1795 IterationDecrementer(const IterationDecrementer&) = delete;
1796 IterationDecrementer& operator=(const IterationDecrementer&) = delete;
1797
1798 private:
1799 LoopChoiceNode* node_;
1800 };
1801
FilterOneByte(int depth)1802 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
1803 if (info()->replacement_calculated) return replacement();
1804 if (depth < 0) return this;
1805 DCHECK(!info()->visited);
1806 VisitMarker marker(info());
1807 return FilterSuccessor(depth - 1);
1808 }
1809
FilterSuccessor(int depth)1810 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
1811 RegExpNode* next = on_success_->FilterOneByte(depth - 1);
1812 if (next == nullptr) return set_replacement(nullptr);
1813 on_success_ = next;
1814 return set_replacement(this);
1815 }
1816
1817 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)1818 bool RangeContainsLatin1Equivalents(CharacterRange range) {
1819 // TODO(dcarney): this could be a lot more efficient.
1820 return range.Contains(0x039C) || range.Contains(0x03BC) ||
1821 range.Contains(0x0178);
1822 }
1823
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)1824 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
1825 for (int i = 0; i < ranges->length(); i++) {
1826 // TODO(dcarney): this could be a lot more efficient.
1827 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
1828 }
1829 return false;
1830 }
1831
FilterOneByte(int depth)1832 RegExpNode* TextNode::FilterOneByte(int depth) {
1833 if (info()->replacement_calculated) return replacement();
1834 if (depth < 0) return this;
1835 DCHECK(!info()->visited);
1836 VisitMarker marker(info());
1837 int element_count = elements()->length();
1838 for (int i = 0; i < element_count; i++) {
1839 TextElement elm = elements()->at(i);
1840 if (elm.text_type() == TextElement::ATOM) {
1841 Vector<const uc16> quarks = elm.atom()->data();
1842 for (int j = 0; j < quarks.length(); j++) {
1843 uc16 c = quarks[j];
1844 if (elm.atom()->ignore_case()) {
1845 c = unibrow::Latin1::TryConvertToLatin1(c);
1846 }
1847 if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
1848 // Replace quark in case we converted to Latin-1.
1849 uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
1850 writable_quarks[j] = c;
1851 }
1852 } else {
1853 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
1854 RegExpCharacterClass* cc = elm.char_class();
1855 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1856 CharacterRange::Canonicalize(ranges);
1857 // Now they are in order so we only need to look at the first.
1858 int range_count = ranges->length();
1859 if (cc->is_negated()) {
1860 if (range_count != 0 && ranges->at(0).from() == 0 &&
1861 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
1862 // This will be handled in a later filter.
1863 if (IgnoreCase(cc->flags()) &&
1864 RangesContainLatin1Equivalents(ranges)) {
1865 continue;
1866 }
1867 return set_replacement(nullptr);
1868 }
1869 } else {
1870 if (range_count == 0 ||
1871 ranges->at(0).from() > String::kMaxOneByteCharCode) {
1872 // This will be handled in a later filter.
1873 if (IgnoreCase(cc->flags()) &&
1874 RangesContainLatin1Equivalents(ranges)) {
1875 continue;
1876 }
1877 return set_replacement(nullptr);
1878 }
1879 }
1880 }
1881 }
1882 return FilterSuccessor(depth - 1);
1883 }
1884
FilterOneByte(int depth)1885 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
1886 if (info()->replacement_calculated) return replacement();
1887 if (depth < 0) return this;
1888 if (info()->visited) return this;
1889 {
1890 VisitMarker marker(info());
1891
1892 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
1893 // If we can't continue after the loop then there is no sense in doing the
1894 // loop.
1895 if (continue_replacement == nullptr) return set_replacement(nullptr);
1896 }
1897
1898 return ChoiceNode::FilterOneByte(depth - 1);
1899 }
1900
FilterOneByte(int depth)1901 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
1902 if (info()->replacement_calculated) return replacement();
1903 if (depth < 0) return this;
1904 if (info()->visited) return this;
1905 VisitMarker marker(info());
1906 int choice_count = alternatives_->length();
1907
1908 for (int i = 0; i < choice_count; i++) {
1909 GuardedAlternative alternative = alternatives_->at(i);
1910 if (alternative.guards() != nullptr &&
1911 alternative.guards()->length() != 0) {
1912 set_replacement(this);
1913 return this;
1914 }
1915 }
1916
1917 int surviving = 0;
1918 RegExpNode* survivor = nullptr;
1919 for (int i = 0; i < choice_count; i++) {
1920 GuardedAlternative alternative = alternatives_->at(i);
1921 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
1922 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
1923 if (replacement != nullptr) {
1924 alternatives_->at(i).set_node(replacement);
1925 surviving++;
1926 survivor = replacement;
1927 }
1928 }
1929 if (surviving < 2) return set_replacement(survivor);
1930
1931 set_replacement(this);
1932 if (surviving == choice_count) {
1933 return this;
1934 }
1935 // Only some of the nodes survived the filtering. We need to rebuild the
1936 // alternatives list.
1937 ZoneList<GuardedAlternative>* new_alternatives =
1938 zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
1939 for (int i = 0; i < choice_count; i++) {
1940 RegExpNode* replacement =
1941 alternatives_->at(i).node()->FilterOneByte(depth - 1);
1942 if (replacement != nullptr) {
1943 alternatives_->at(i).set_node(replacement);
1944 new_alternatives->Add(alternatives_->at(i), zone());
1945 }
1946 }
1947 alternatives_ = new_alternatives;
1948 return this;
1949 }
1950
FilterOneByte(int depth)1951 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
1952 if (info()->replacement_calculated) return replacement();
1953 if (depth < 0) return this;
1954 if (info()->visited) return this;
1955 VisitMarker marker(info());
1956 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1957 // afterwards.
1958 RegExpNode* node = continue_node();
1959 RegExpNode* replacement = node->FilterOneByte(depth - 1);
1960 if (replacement == nullptr) return set_replacement(nullptr);
1961 alternatives_->at(kContinueIndex).set_node(replacement);
1962
1963 RegExpNode* neg_node = lookaround_node();
1964 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
1965 // If the negative lookahead is always going to fail then
1966 // we don't need to check it.
1967 if (neg_replacement == nullptr) return set_replacement(replacement);
1968 alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
1969 return set_replacement(this);
1970 }
1971
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1972 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1973 RegExpCompiler* compiler,
1974 int characters_filled_in,
1975 bool not_at_start) {
1976 if (body_can_be_zero_length_ || info()->visited) return;
1977 not_at_start = not_at_start || this->not_at_start();
1978 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
1979 if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
1980 loop_node_->EatsAtLeast(not_at_start) >
1981 continue_node_->EatsAtLeast(true)) {
1982 // Loop body is guaranteed to execute at least once, and consume characters
1983 // when it does, meaning the only possible quick checks from this point
1984 // begin with the loop body. We may recursively visit this LoopChoiceNode,
1985 // but we temporarily decrease its minimum iteration counter so we know when
1986 // to check the continue case.
1987 IterationDecrementer next_iteration(this);
1988 loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
1989 not_at_start);
1990 } else {
1991 // Might not consume anything in the loop body, so treat it like a normal
1992 // ChoiceNode (and don't recursively visit this node again).
1993 VisitMarker marker(info());
1994 ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
1995 not_at_start);
1996 }
1997 }
1998
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1999 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
2000 QuickCheckDetails* details, RegExpCompiler* compiler,
2001 int characters_filled_in, bool not_at_start) {
2002 if (traversed_loop_initialization_node_) {
2003 // We already entered this loop once, exited via its continuation node, and
2004 // followed an outer loop's back-edge to before the loop entry point. We
2005 // could try to reset the minimum iteration count to its starting value at
2006 // this point, but that seems like more trouble than it's worth. It's safe
2007 // to keep going with the current (possibly reduced) minimum iteration
2008 // count.
2009 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2010 } else {
2011 // We are entering a loop via its counter initialization action, meaning we
2012 // are guaranteed to run the loop body at least some minimum number of times
2013 // before running the continuation node. Set a flag so that this node knows
2014 // (now and any times we visit it again recursively) that it was entered
2015 // from the top.
2016 LoopInitializationMarker marker(this);
2017 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2018 }
2019 }
2020
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2021 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2022 BoyerMooreLookahead* bm, bool not_at_start) {
2023 if (body_can_be_zero_length_ || budget <= 0) {
2024 bm->SetRest(offset);
2025 SaveBMInfo(bm, not_at_start, offset);
2026 return;
2027 }
2028 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2029 SaveBMInfo(bm, not_at_start, offset);
2030 }
2031
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2032 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2033 RegExpCompiler* compiler,
2034 int characters_filled_in,
2035 bool not_at_start) {
2036 not_at_start = (not_at_start || not_at_start_);
2037 int choice_count = alternatives_->length();
2038 DCHECK_LT(0, choice_count);
2039 alternatives_->at(0).node()->GetQuickCheckDetails(
2040 details, compiler, characters_filled_in, not_at_start);
2041 for (int i = 1; i < choice_count; i++) {
2042 QuickCheckDetails new_details(details->characters());
2043 RegExpNode* node = alternatives_->at(i).node();
2044 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2045 not_at_start);
2046 // Here we merge the quick match details of the two branches.
2047 details->Merge(&new_details, characters_filled_in);
2048 }
2049 }
2050
2051 namespace {
2052
2053 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2054 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
2055 Label* non_word, bool fall_through_on_word) {
2056 if (assembler->CheckSpecialCharacterClass(
2057 fall_through_on_word ? 'w' : 'W',
2058 fall_through_on_word ? non_word : word)) {
2059 // Optimized implementation available.
2060 return;
2061 }
2062 assembler->CheckCharacterGT('z', non_word);
2063 assembler->CheckCharacterLT('0', non_word);
2064 assembler->CheckCharacterGT('a' - 1, word);
2065 assembler->CheckCharacterLT('9' + 1, word);
2066 assembler->CheckCharacterLT('A', non_word);
2067 assembler->CheckCharacterLT('Z' + 1, word);
2068 if (fall_through_on_word) {
2069 assembler->CheckNotCharacter('_', non_word);
2070 } else {
2071 assembler->CheckCharacter('_', word);
2072 }
2073 }
2074
2075 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2076 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2077 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
2078 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2079
2080 // We will load the previous character into the current character register.
2081 Trace new_trace(*trace);
2082 new_trace.InvalidateCurrentCharacter();
2083
2084 // A positive (> 0) cp_offset means we've already successfully matched a
2085 // non-empty-width part of the pattern, and thus cannot be at or before the
2086 // start of the subject string. We can thus skip both at-start and
2087 // bounds-checks when loading the one-character lookbehind.
2088 const bool may_be_at_or_before_subject_string_start =
2089 new_trace.cp_offset() <= 0;
2090
2091 Label ok;
2092 if (may_be_at_or_before_subject_string_start) {
2093 // The start of input counts as a newline in this context, so skip to ok if
2094 // we are at the start.
2095 assembler->CheckAtStart(new_trace.cp_offset(), &ok);
2096 }
2097
2098 // If we've already checked that we are not at the start of input, it's okay
2099 // to load the previous character without bounds checks.
2100 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2101 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2102 new_trace.backtrack(), can_skip_bounds_check);
2103 if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2104 // Newline means \n, \r, 0x2028 or 0x2029.
2105 if (!compiler->one_byte()) {
2106 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2107 }
2108 assembler->CheckCharacter('\n', &ok);
2109 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2110 }
2111 assembler->Bind(&ok);
2112 on_success->Emit(compiler, &new_trace);
2113 }
2114
2115 } // namespace
2116
2117 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2118 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2119 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2120 Isolate* isolate = assembler->isolate();
2121 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2122 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2123 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2124 if (lookahead == nullptr) {
2125 int eats_at_least =
2126 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
2127 if (eats_at_least >= 1) {
2128 BoyerMooreLookahead* bm =
2129 zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
2130 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2131 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2132 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2133 }
2134 } else {
2135 if (lookahead->at(0)->is_non_word())
2136 next_is_word_character = Trace::FALSE_VALUE;
2137 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2138 }
2139 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2140 if (next_is_word_character == Trace::UNKNOWN) {
2141 Label before_non_word;
2142 Label before_word;
2143 if (trace->characters_preloaded() != 1) {
2144 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2145 }
2146 // Fall through on non-word.
2147 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2148 // Next character is not a word character.
2149 assembler->Bind(&before_non_word);
2150 Label ok;
2151 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2152 assembler->GoTo(&ok);
2153
2154 assembler->Bind(&before_word);
2155 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2156 assembler->Bind(&ok);
2157 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2158 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2159 } else {
2160 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
2161 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2162 }
2163 }
2164
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)2165 void AssertionNode::BacktrackIfPrevious(
2166 RegExpCompiler* compiler, Trace* trace,
2167 AssertionNode::IfPrevious backtrack_if_previous) {
2168 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2169 Trace new_trace(*trace);
2170 new_trace.InvalidateCurrentCharacter();
2171
2172 Label fall_through;
2173 Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
2174 : &fall_through;
2175 Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
2176 : new_trace.backtrack();
2177
2178 // A positive (> 0) cp_offset means we've already successfully matched a
2179 // non-empty-width part of the pattern, and thus cannot be at or before the
2180 // start of the subject string. We can thus skip both at-start and
2181 // bounds-checks when loading the one-character lookbehind.
2182 const bool may_be_at_or_before_subject_string_start =
2183 new_trace.cp_offset() <= 0;
2184
2185 if (may_be_at_or_before_subject_string_start) {
2186 // The start of input counts as a non-word character, so the question is
2187 // decided if we are at the start.
2188 assembler->CheckAtStart(new_trace.cp_offset(), non_word);
2189 }
2190
2191 // If we've already checked that we are not at the start of input, it's okay
2192 // to load the previous character without bounds checks.
2193 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2194 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
2195 can_skip_bounds_check);
2196 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2197
2198 assembler->Bind(&fall_through);
2199 on_success()->Emit(compiler, &new_trace);
2200 }
2201
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2202 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2203 RegExpCompiler* compiler,
2204 int filled_in, bool not_at_start) {
2205 if (assertion_type_ == AT_START && not_at_start) {
2206 details->set_cannot_match();
2207 return;
2208 }
2209 return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2210 not_at_start);
2211 }
2212
Emit(RegExpCompiler * compiler,Trace * trace)2213 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2214 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2215 switch (assertion_type_) {
2216 case AT_END: {
2217 Label ok;
2218 assembler->CheckPosition(trace->cp_offset(), &ok);
2219 assembler->GoTo(trace->backtrack());
2220 assembler->Bind(&ok);
2221 break;
2222 }
2223 case AT_START: {
2224 if (trace->at_start() == Trace::FALSE_VALUE) {
2225 assembler->GoTo(trace->backtrack());
2226 return;
2227 }
2228 if (trace->at_start() == Trace::UNKNOWN) {
2229 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2230 Trace at_start_trace = *trace;
2231 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2232 on_success()->Emit(compiler, &at_start_trace);
2233 return;
2234 }
2235 } break;
2236 case AFTER_NEWLINE:
2237 EmitHat(compiler, on_success(), trace);
2238 return;
2239 case AT_BOUNDARY:
2240 case AT_NON_BOUNDARY: {
2241 EmitBoundaryCheck(compiler, trace);
2242 return;
2243 }
2244 }
2245 on_success()->Emit(compiler, trace);
2246 }
2247
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2248 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2249 if (quick_check == nullptr) return false;
2250 if (offset >= quick_check->characters()) return false;
2251 return quick_check->positions(offset)->determines_perfectly;
2252 }
2253
UpdateBoundsCheck(int index,int * checked_up_to)2254 static void UpdateBoundsCheck(int index, int* checked_up_to) {
2255 if (index > *checked_up_to) {
2256 *checked_up_to = index;
2257 }
2258 }
2259
2260 // We call this repeatedly to generate code for each pass over the text node.
2261 // The passes are in increasing order of difficulty because we hope one
2262 // of the first passes will fail in which case we are saved the work of the
2263 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2264 // we will check the '%' in the first pass, the case independent 'a' in the
2265 // second pass and the character class in the last pass.
2266 //
2267 // The passes are done from right to left, so for example to test for /bar/
2268 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2269 // and then a 'b' with offset 0. This means we can avoid the end-of-input
2270 // bounds check most of the time. In the example we only need to check for
2271 // end-of-input when loading the putative 'r'.
2272 //
2273 // A slight complication involves the fact that the first character may already
2274 // be fetched into a register by the previous node. In this case we want to
2275 // do the test for that character first. We do this in separate passes. The
2276 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2277 // pass has been performed then subsequent passes will have true in
2278 // first_element_checked to indicate that that character does not need to be
2279 // checked again.
2280 //
2281 // In addition to all this we are passed a Trace, which can
2282 // contain an AlternativeGeneration object. In this AlternativeGeneration
2283 // object we can see details of any quick check that was already passed in
2284 // order to get to the code we are now generating. The quick check can involve
2285 // loading characters, which means we do not need to recheck the bounds
2286 // up to the limit the quick check already checked. In addition the quick
2287 // check can have involved a mask and compare operation which may simplify
2288 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2289 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
2290 bool preloaded, Trace* trace,
2291 bool first_element_checked, int* checked_up_to) {
2292 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2293 Isolate* isolate = assembler->isolate();
2294 bool one_byte = compiler->one_byte();
2295 Label* backtrack = trace->backtrack();
2296 QuickCheckDetails* quick_check = trace->quick_check_performed();
2297 int element_count = elements()->length();
2298 int backward_offset = read_backward() ? -Length() : 0;
2299 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2300 TextElement elm = elements()->at(i);
2301 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2302 if (elm.text_type() == TextElement::ATOM) {
2303 if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2304 Vector<const uc16> quarks = elm.atom()->data();
2305 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2306 if (first_element_checked && i == 0 && j == 0) continue;
2307 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2308 uc16 quark = quarks[j];
2309 if (elm.atom()->ignore_case()) {
2310 // Everywhere else we assume that a non-Latin-1 character cannot match
2311 // a Latin-1 character. Avoid the cases where this is assumption is
2312 // invalid by using the Latin1 equivalent instead.
2313 quark = unibrow::Latin1::TryConvertToLatin1(quark);
2314 }
2315 bool needs_bounds_check =
2316 *checked_up_to < cp_offset + j || read_backward();
2317 bool bounds_checked = false;
2318 switch (pass) {
2319 case NON_LATIN1_MATCH:
2320 DCHECK(one_byte);
2321 if (quark > String::kMaxOneByteCharCode) {
2322 assembler->GoTo(backtrack);
2323 return;
2324 }
2325 break;
2326 case NON_LETTER_CHARACTER_MATCH:
2327 bounds_checked =
2328 EmitAtomNonLetter(isolate, compiler, quark, backtrack,
2329 cp_offset + j, needs_bounds_check, preloaded);
2330 break;
2331 case SIMPLE_CHARACTER_MATCH:
2332 bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
2333 backtrack, cp_offset + j,
2334 needs_bounds_check, preloaded);
2335 break;
2336 case CASE_CHARACTER_MATCH:
2337 bounds_checked =
2338 EmitAtomLetter(isolate, compiler, quark, backtrack,
2339 cp_offset + j, needs_bounds_check, preloaded);
2340 break;
2341 default:
2342 break;
2343 }
2344 if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2345 }
2346 } else {
2347 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
2348 if (pass == CHARACTER_CLASS_MATCH) {
2349 if (first_element_checked && i == 0) continue;
2350 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2351 RegExpCharacterClass* cc = elm.char_class();
2352 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2353 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2354 bounds_check, preloaded, zone());
2355 UpdateBoundsCheck(cp_offset, checked_up_to);
2356 }
2357 }
2358 }
2359 }
2360
Length()2361 int TextNode::Length() {
2362 TextElement elm = elements()->last();
2363 DCHECK_LE(0, elm.cp_offset());
2364 return elm.cp_offset() + elm.length();
2365 }
2366
SkipPass(TextEmitPassType pass,bool ignore_case)2367 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
2368 if (ignore_case) {
2369 return pass == SIMPLE_CHARACTER_MATCH;
2370 } else {
2371 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2372 }
2373 }
2374
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)2375 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
2376 ZoneList<CharacterRange>* ranges,
2377 bool read_backward,
2378 RegExpNode* on_success,
2379 JSRegExp::Flags flags) {
2380 DCHECK_NOT_NULL(ranges);
2381 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(1, zone);
2382 elms->Add(TextElement::CharClass(
2383 zone->New<RegExpCharacterClass>(zone, ranges, flags)),
2384 zone);
2385 return zone->New<TextNode>(elms, read_backward, on_success);
2386 }
2387
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)2388 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
2389 CharacterRange trail,
2390 bool read_backward,
2391 RegExpNode* on_success,
2392 JSRegExp::Flags flags) {
2393 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
2394 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
2395 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2396 elms->Add(TextElement::CharClass(
2397 zone->New<RegExpCharacterClass>(zone, lead_ranges, flags)),
2398 zone);
2399 elms->Add(TextElement::CharClass(
2400 zone->New<RegExpCharacterClass>(zone, trail_ranges, flags)),
2401 zone);
2402 return zone->New<TextNode>(elms, read_backward, on_success);
2403 }
2404
2405 // This generates the code to match a text node. A text node can contain
2406 // straight character sequences (possibly to be matched in a case-independent
2407 // way) and character classes. For efficiency we do not do this in a single
2408 // pass from left to right. Instead we pass over the text node several times,
2409 // emitting code for some character positions every time. See the comment on
2410 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2411 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2412 LimitResult limit_result = LimitVersions(compiler, trace);
2413 if (limit_result == DONE) return;
2414 DCHECK(limit_result == CONTINUE);
2415
2416 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2417 compiler->SetRegExpTooBig();
2418 return;
2419 }
2420
2421 if (compiler->one_byte()) {
2422 int dummy = 0;
2423 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2424 }
2425
2426 bool first_elt_done = false;
2427 int bound_checked_to = trace->cp_offset() - 1;
2428 bound_checked_to += trace->bound_checked_up_to();
2429
2430 // If a character is preloaded into the current character register then
2431 // check that now.
2432 if (trace->characters_preloaded() == 1) {
2433 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2434 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2435 false, &bound_checked_to);
2436 }
2437 first_elt_done = true;
2438 }
2439
2440 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2441 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2442 first_elt_done, &bound_checked_to);
2443 }
2444
2445 Trace successor_trace(*trace);
2446 // If we advance backward, we may end up at the start.
2447 successor_trace.AdvanceCurrentPositionInTrace(
2448 read_backward() ? -Length() : Length(), compiler);
2449 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2450 : Trace::FALSE_VALUE);
2451 RecursionCheck rc(compiler);
2452 on_success()->Emit(compiler, &successor_trace);
2453 }
2454
InvalidateCurrentCharacter()2455 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
2456
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2457 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2458 // We don't have an instruction for shifting the current character register
2459 // down or for using a shifted value for anything so lets just forget that
2460 // we preloaded any characters into it.
2461 characters_preloaded_ = 0;
2462 // Adjust the offsets of the quick check performed information. This
2463 // information is used to find out what we already determined about the
2464 // characters by means of mask and compare.
2465 quick_check_performed_.Advance(by, compiler->one_byte());
2466 cp_offset_ += by;
2467 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2468 compiler->SetRegExpTooBig();
2469 cp_offset_ = 0;
2470 }
2471 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2472 }
2473
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)2474 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
2475 int element_count = elements()->length();
2476 for (int i = 0; i < element_count; i++) {
2477 TextElement elm = elements()->at(i);
2478 if (elm.text_type() == TextElement::CHAR_CLASS) {
2479 RegExpCharacterClass* cc = elm.char_class();
2480 #ifdef V8_INTL_SUPPORT
2481 bool case_equivalents_already_added =
2482 NeedsUnicodeCaseEquivalents(cc->flags());
2483 #else
2484 bool case_equivalents_already_added = false;
2485 #endif
2486 if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
2487 // None of the standard character classes is different in the case
2488 // independent case and it slows us down if we don't know that.
2489 if (cc->is_standard(zone())) continue;
2490 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2491 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
2492 is_one_byte);
2493 }
2494 }
2495 }
2496 }
2497
GreedyLoopTextLength()2498 int TextNode::GreedyLoopTextLength() { return Length(); }
2499
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)2500 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2501 RegExpCompiler* compiler) {
2502 if (read_backward()) return nullptr;
2503 if (elements()->length() != 1) return nullptr;
2504 TextElement elm = elements()->at(0);
2505 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2506 RegExpCharacterClass* node = elm.char_class();
2507 ZoneList<CharacterRange>* ranges = node->ranges(zone());
2508 CharacterRange::Canonicalize(ranges);
2509 if (node->is_negated()) {
2510 return ranges->length() == 0 ? on_success() : nullptr;
2511 }
2512 if (ranges->length() != 1) return nullptr;
2513 const uc32 max_char = MaxCodeUnit(compiler->one_byte());
2514 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
2515 }
2516
2517 // Finds the fixed match length of a sequence of nodes that goes from
2518 // this alternative and back to this choice node. If there are variable
2519 // length nodes or other complications in the way then return a sentinel
2520 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)2521 int ChoiceNode::GreedyLoopTextLengthForAlternative(
2522 GuardedAlternative* alternative) {
2523 int length = 0;
2524 RegExpNode* node = alternative->node();
2525 // Later we will generate code for all these text nodes using recursion
2526 // so we have to limit the max number.
2527 int recursion_depth = 0;
2528 while (node != this) {
2529 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2530 return kNodeIsTooComplexForGreedyLoops;
2531 }
2532 int node_length = node->GreedyLoopTextLength();
2533 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2534 return kNodeIsTooComplexForGreedyLoops;
2535 }
2536 length += node_length;
2537 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2538 node = seq_node->on_success();
2539 }
2540 return read_backward() ? -length : length;
2541 }
2542
AddLoopAlternative(GuardedAlternative alt)2543 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2544 DCHECK_NULL(loop_node_);
2545 AddAlternative(alt);
2546 loop_node_ = alt.node();
2547 }
2548
AddContinueAlternative(GuardedAlternative alt)2549 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2550 DCHECK_NULL(continue_node_);
2551 AddAlternative(alt);
2552 continue_node_ = alt.node();
2553 }
2554
Emit(RegExpCompiler * compiler,Trace * trace)2555 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2556 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2557 if (trace->stop_node() == this) {
2558 // Back edge of greedy optimized loop node graph.
2559 int text_length =
2560 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2561 DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
2562 // Update the counter-based backtracking info on the stack. This is an
2563 // optimization for greedy loops (see below).
2564 DCHECK(trace->cp_offset() == text_length);
2565 macro_assembler->AdvanceCurrentPosition(text_length);
2566 macro_assembler->GoTo(trace->loop_label());
2567 return;
2568 }
2569 DCHECK_NULL(trace->stop_node());
2570 if (!trace->is_trivial()) {
2571 trace->Flush(compiler, this);
2572 return;
2573 }
2574 ChoiceNode::Emit(compiler, trace);
2575 }
2576
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)2577 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2578 int eats_at_least) {
2579 int preload_characters = Min(4, eats_at_least);
2580 DCHECK_LE(preload_characters, 4);
2581 if (compiler->macro_assembler()->CanReadUnaligned()) {
2582 bool one_byte = compiler->one_byte();
2583 if (one_byte) {
2584 // We can't preload 3 characters because there is no machine instruction
2585 // to do that. We can't just load 4 because we could be reading
2586 // beyond the end of the string, which could cause a memory fault.
2587 if (preload_characters == 3) preload_characters = 2;
2588 } else {
2589 if (preload_characters > 2) preload_characters = 2;
2590 }
2591 } else {
2592 if (preload_characters > 1) preload_characters = 1;
2593 }
2594 return preload_characters;
2595 }
2596
2597 // This class is used when generating the alternatives in a choice node. It
2598 // records the way the alternative is being code generated.
2599 class AlternativeGeneration : public Malloced {
2600 public:
AlternativeGeneration()2601 AlternativeGeneration()
2602 : possible_success(),
2603 expects_preload(false),
2604 after(),
2605 quick_check_details() {}
2606 Label possible_success;
2607 bool expects_preload;
2608 Label after;
2609 QuickCheckDetails quick_check_details;
2610 };
2611
2612 // Creates a list of AlternativeGenerations. If the list has a reasonable
2613 // size then it is on the stack, otherwise the excess is on the heap.
2614 class AlternativeGenerationList {
2615 public:
AlternativeGenerationList(int count,Zone * zone)2616 AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
2617 for (int i = 0; i < count && i < kAFew; i++) {
2618 alt_gens_.Add(a_few_alt_gens_ + i, zone);
2619 }
2620 for (int i = kAFew; i < count; i++) {
2621 alt_gens_.Add(new AlternativeGeneration(), zone);
2622 }
2623 }
~AlternativeGenerationList()2624 ~AlternativeGenerationList() {
2625 for (int i = kAFew; i < alt_gens_.length(); i++) {
2626 delete alt_gens_[i];
2627 alt_gens_[i] = nullptr;
2628 }
2629 }
2630
at(int i)2631 AlternativeGeneration* at(int i) { return alt_gens_[i]; }
2632
2633 private:
2634 static const int kAFew = 10;
2635 ZoneList<AlternativeGeneration*> alt_gens_;
2636 AlternativeGeneration a_few_alt_gens_[kAFew];
2637 };
2638
Set(int character)2639 void BoyerMoorePositionInfo::Set(int character) {
2640 SetInterval(Interval(character, character));
2641 }
2642
2643 namespace {
2644
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)2645 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
2646 int ranges_length, Interval new_range) {
2647 DCHECK_EQ(1, ranges_length & 1);
2648 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
2649 if (containment == kLatticeUnknown) return containment;
2650 bool inside = false;
2651 int last = 0;
2652 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2653 // Consider the range from last to ranges[i].
2654 // We haven't got to the new range yet.
2655 if (ranges[i] <= new_range.from()) continue;
2656 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
2657 // inclusive, but the values in ranges are not.
2658 if (last <= new_range.from() && new_range.to() < ranges[i]) {
2659 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2660 }
2661 return kLatticeUnknown;
2662 }
2663 return containment;
2664 }
2665
BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset)2666 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
2667 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
2668 2 * kInt64Size * kBitsPerByte);
2669
2670 // Slight fiddling is needed here, since the bitset is of length 128 while
2671 // CountTrailingZeros requires an integral type and std::bitset can only
2672 // convert to unsigned long long. So we handle the most- and least-significant
2673 // bits separately.
2674
2675 {
2676 static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
2677 BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
2678 STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
2679 uint64_t lsb = masked_bitset.to_ullong();
2680 if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
2681 }
2682
2683 {
2684 BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
2685 uint64_t msb = masked_bitset.to_ullong();
2686 if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
2687 }
2688
2689 return -1;
2690 }
2691
2692 } // namespace
2693
SetInterval(const Interval & interval)2694 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2695 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2696
2697 if (interval.size() >= kMapSize) {
2698 map_count_ = kMapSize;
2699 map_.set();
2700 return;
2701 }
2702
2703 for (int i = interval.from(); i <= interval.to(); i++) {
2704 int mod_character = (i & kMask);
2705 if (!map_[mod_character]) {
2706 map_count_++;
2707 map_.set(mod_character);
2708 }
2709 if (map_count_ == kMapSize) return;
2710 }
2711 }
2712
SetAll()2713 void BoyerMoorePositionInfo::SetAll() {
2714 w_ = kLatticeUnknown;
2715 if (map_count_ != kMapSize) {
2716 map_count_ = kMapSize;
2717 map_.set();
2718 }
2719 }
2720
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)2721 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
2722 Zone* zone)
2723 : length_(length),
2724 compiler_(compiler),
2725 max_char_(MaxCodeUnit(compiler->one_byte())) {
2726 bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
2727 for (int i = 0; i < length; i++) {
2728 bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
2729 }
2730 }
2731
2732 // Find the longest range of lookahead that has the fewest number of different
2733 // characters that can occur at a given position. Since we are optimizing two
2734 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)2735 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2736 int biggest_points = 0;
2737 // If more than 32 characters out of 128 can occur it is unlikely that we can
2738 // be lucky enough to step forwards much of the time.
2739 const int kMaxMax = 32;
2740 for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2741 max_number_of_chars *= 2) {
2742 biggest_points =
2743 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2744 }
2745 if (biggest_points == 0) return false;
2746 return true;
2747 }
2748
2749 // Find the highest-points range between 0 and length_ where the character
2750 // information is not too vague. 'Too vague' means that there are more than
2751 // max_number_of_chars that can occur at this position. Calculates the number
2752 // of points as the product of width-of-the-range and
2753 // probability-of-finding-one-of-the-characters, where the probability is
2754 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)2755 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
2756 int old_biggest_points, int* from,
2757 int* to) {
2758 int biggest_points = old_biggest_points;
2759 static const int kSize = RegExpMacroAssembler::kTableSize;
2760 for (int i = 0; i < length_;) {
2761 while (i < length_ && Count(i) > max_number_of_chars) i++;
2762 if (i == length_) break;
2763 int remembered_from = i;
2764
2765 BoyerMoorePositionInfo::Bitset union_bitset;
2766 for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
2767 union_bitset |= bitmaps_->at(i)->raw_bitset();
2768 }
2769
2770 int frequency = 0;
2771
2772 // Iterate only over set bits.
2773 int j;
2774 while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
2775 DCHECK(union_bitset[j]); // Sanity check.
2776 // Add 1 to the frequency to give a small per-character boost for
2777 // the cases where our sampling is not good enough and many
2778 // characters have a frequency of zero. This means the frequency
2779 // can theoretically be up to 2*kSize though we treat it mostly as
2780 // a fraction of kSize.
2781 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2782 union_bitset.reset(j);
2783 }
2784
2785 // We use the probability of skipping times the distance we are skipping to
2786 // judge the effectiveness of this. Actually we have a cut-off: By
2787 // dividing by 2 we switch off the skipping if the probability of skipping
2788 // is less than 50%. This is because the multibyte mask-and-compare
2789 // skipping in quickcheck is more likely to do well on this case.
2790 bool in_quickcheck_range =
2791 ((i - remembered_from < 4) ||
2792 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2793 // Called 'probability' but it is only a rough estimate and can actually
2794 // be outside the 0-kSize range.
2795 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2796 int points = (i - remembered_from) * probability;
2797 if (points > biggest_points) {
2798 *from = remembered_from;
2799 *to = i - 1;
2800 biggest_points = points;
2801 }
2802 }
2803 return biggest_points;
2804 }
2805
2806 // Take all the characters that will not prevent a successful match if they
2807 // occur in the subject string in the range between min_lookahead and
2808 // max_lookahead (inclusive) measured from the current position. If the
2809 // character at max_lookahead offset is not one of these characters, then we
2810 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)2811 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
2812 Handle<ByteArray> boolean_skip_table) {
2813 const int kSkipArrayEntry = 0;
2814 const int kDontSkipArrayEntry = 1;
2815
2816 std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
2817 boolean_skip_table->length());
2818
2819 for (int i = max_lookahead; i >= min_lookahead; i--) {
2820 BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
2821
2822 // Iterate only over set bits.
2823 int j;
2824 while ((j = BitsetFirstSetBit(bitset)) != -1) {
2825 DCHECK(bitset[j]); // Sanity check.
2826 boolean_skip_table->set(j, kDontSkipArrayEntry);
2827 bitset.reset(j);
2828 }
2829 }
2830
2831 const int skip = max_lookahead + 1 - min_lookahead;
2832 return skip;
2833 }
2834
2835 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)2836 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2837 const int kSize = RegExpMacroAssembler::kTableSize;
2838
2839 int min_lookahead = 0;
2840 int max_lookahead = 0;
2841
2842 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2843
2844 // Check if we only have a single non-empty position info, and that info
2845 // contains precisely one character.
2846 bool found_single_character = false;
2847 int single_character = 0;
2848 for (int i = max_lookahead; i >= min_lookahead; i--) {
2849 BoyerMoorePositionInfo* map = bitmaps_->at(i);
2850 if (map->map_count() == 0) continue;
2851
2852 if (found_single_character || map->map_count() > 1) {
2853 found_single_character = false;
2854 break;
2855 }
2856
2857 DCHECK(!found_single_character);
2858 DCHECK_EQ(map->map_count(), 1);
2859
2860 found_single_character = true;
2861 single_character = BitsetFirstSetBit(map->raw_bitset());
2862
2863 DCHECK_NE(single_character, -1);
2864 }
2865
2866 int lookahead_width = max_lookahead + 1 - min_lookahead;
2867
2868 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2869 // The mask-compare can probably handle this better.
2870 return;
2871 }
2872
2873 if (found_single_character) {
2874 Label cont, again;
2875 masm->Bind(&again);
2876 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2877 if (max_char_ > kSize) {
2878 masm->CheckCharacterAfterAnd(single_character,
2879 RegExpMacroAssembler::kTableMask, &cont);
2880 } else {
2881 masm->CheckCharacter(single_character, &cont);
2882 }
2883 masm->AdvanceCurrentPosition(lookahead_width);
2884 masm->GoTo(&again);
2885 masm->Bind(&cont);
2886 return;
2887 }
2888
2889 Factory* factory = masm->isolate()->factory();
2890 Handle<ByteArray> boolean_skip_table =
2891 factory->NewByteArray(kSize, AllocationType::kOld);
2892 int skip_distance =
2893 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2894 DCHECK_NE(0, skip_distance);
2895
2896 Label cont, again;
2897 masm->Bind(&again);
2898 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2899 masm->CheckBitInTable(boolean_skip_table, &cont);
2900 masm->AdvanceCurrentPosition(skip_distance);
2901 masm->GoTo(&again);
2902 masm->Bind(&cont);
2903 }
2904
2905 /* Code generation for choice nodes.
2906 *
2907 * We generate quick checks that do a mask and compare to eliminate a
2908 * choice. If the quick check succeeds then it jumps to the continuation to
2909 * do slow checks and check subsequent nodes. If it fails (the common case)
2910 * it falls through to the next choice.
2911 *
2912 * Here is the desired flow graph. Nodes directly below each other imply
2913 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2914 * 3 doesn't have a quick check so we have to call the slow check.
2915 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2916 * regexp continuation is generated directly after the Sn node, up to the
2917 * next GoTo if we decide to reuse some already generated code. Some
2918 * nodes expect preload_characters to be preloaded into the current
2919 * character register. R nodes do this preloading. Vertices are marked
2920 * F for failures and S for success (possible success in the case of quick
2921 * nodes). L, V, < and > are used as arrow heads.
2922 *
2923 * ----------> R
2924 * |
2925 * V
2926 * Q1 -----> S1
2927 * | S /
2928 * F| /
2929 * | F/
2930 * | /
2931 * | R
2932 * | /
2933 * V L
2934 * Q2 -----> S2
2935 * | S /
2936 * F| /
2937 * | F/
2938 * | /
2939 * | R
2940 * | /
2941 * V L
2942 * S3
2943 * |
2944 * F|
2945 * |
2946 * R
2947 * |
2948 * backtrack V
2949 * <----------Q4
2950 * \ F |
2951 * \ |S
2952 * \ F V
2953 * \-----S4
2954 *
2955 * For greedy loops we push the current position, then generate the code that
2956 * eats the input specially in EmitGreedyLoop. The other choice (the
2957 * continuation) is generated by the normal code in EmitChoices, and steps back
2958 * in the input to the starting position when it fails to match. The loop code
2959 * looks like this (U is the unwind code that steps back in the greedy loop).
2960 *
2961 * _____
2962 * / \
2963 * V |
2964 * ----------> S1 |
2965 * /| |
2966 * / |S |
2967 * F/ \_____/
2968 * /
2969 * |<-----
2970 * | \
2971 * V |S
2972 * Q2 ---> U----->backtrack
2973 * | F /
2974 * S| /
2975 * V F /
2976 * S2--/
2977 */
2978
GreedyLoopState(bool not_at_start)2979 GreedyLoopState::GreedyLoopState(bool not_at_start) {
2980 counter_backtrack_trace_.set_backtrack(&label_);
2981 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
2982 }
2983
AssertGuardsMentionRegisters(Trace * trace)2984 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
2985 #ifdef DEBUG
2986 int choice_count = alternatives_->length();
2987 for (int i = 0; i < choice_count - 1; i++) {
2988 GuardedAlternative alternative = alternatives_->at(i);
2989 ZoneList<Guard*>* guards = alternative.guards();
2990 int guard_count = (guards == nullptr) ? 0 : guards->length();
2991 for (int j = 0; j < guard_count; j++) {
2992 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
2993 }
2994 }
2995 #endif
2996 }
2997
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)2998 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
2999 PreloadState* state) {
3000 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3001 // Save some time by looking at most one machine word ahead.
3002 state->eats_at_least_ =
3003 EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
3004 }
3005 state->preload_characters_ =
3006 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3007
3008 state->preload_is_current_ =
3009 (current_trace->characters_preloaded() == state->preload_characters_);
3010 state->preload_has_checked_bounds_ = state->preload_is_current_;
3011 }
3012
Emit(RegExpCompiler * compiler,Trace * trace)3013 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3014 int choice_count = alternatives_->length();
3015
3016 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3017 alternatives_->at(0).node()->Emit(compiler, trace);
3018 return;
3019 }
3020
3021 AssertGuardsMentionRegisters(trace);
3022
3023 LimitResult limit_result = LimitVersions(compiler, trace);
3024 if (limit_result == DONE) return;
3025 DCHECK(limit_result == CONTINUE);
3026
3027 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3028 // other choice nodes we only flush if we are out of code size budget.
3029 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3030 trace->Flush(compiler, this);
3031 return;
3032 }
3033
3034 RecursionCheck rc(compiler);
3035
3036 PreloadState preload;
3037 preload.init();
3038 GreedyLoopState greedy_loop_state(not_at_start());
3039
3040 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3041 AlternativeGenerationList alt_gens(choice_count, zone());
3042
3043 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3044 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3045 &greedy_loop_state, text_length);
3046 } else {
3047 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3048 // match the traces produced pre-cleanup.
3049 Label second_choice;
3050 compiler->macro_assembler()->Bind(&second_choice);
3051
3052 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3053
3054 EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3055 }
3056
3057 // At this point we need to generate slow checks for the alternatives where
3058 // the quick check was inlined. We can recognize these because the associated
3059 // label was bound.
3060 int new_flush_budget = trace->flush_budget() / choice_count;
3061 for (int i = 0; i < choice_count; i++) {
3062 AlternativeGeneration* alt_gen = alt_gens.at(i);
3063 Trace new_trace(*trace);
3064 // If there are actions to be flushed we have to limit how many times
3065 // they are flushed. Take the budget of the parent trace and distribute
3066 // it fairly amongst the children.
3067 if (new_trace.actions() != nullptr) {
3068 new_trace.set_flush_budget(new_flush_budget);
3069 }
3070 bool next_expects_preload =
3071 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3072 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
3073 alt_gen, preload.preload_characters_,
3074 next_expects_preload);
3075 }
3076 }
3077
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3078 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
3079 AlternativeGenerationList* alt_gens,
3080 PreloadState* preload,
3081 GreedyLoopState* greedy_loop_state,
3082 int text_length) {
3083 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3084 // Here we have special handling for greedy loops containing only text nodes
3085 // and other simple nodes. These are handled by pushing the current
3086 // position on the stack and then incrementing the current position each
3087 // time around the switch. On backtrack we decrement the current position
3088 // and check it against the pushed value. This avoids pushing backtrack
3089 // information for each iteration of the loop, which could take up a lot of
3090 // space.
3091 DCHECK(trace->stop_node() == nullptr);
3092 macro_assembler->PushCurrentPosition();
3093 Label greedy_match_failed;
3094 Trace greedy_match_trace;
3095 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3096 greedy_match_trace.set_backtrack(&greedy_match_failed);
3097 Label loop_label;
3098 macro_assembler->Bind(&loop_label);
3099 greedy_match_trace.set_stop_node(this);
3100 greedy_match_trace.set_loop_label(&loop_label);
3101 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3102 macro_assembler->Bind(&greedy_match_failed);
3103
3104 Label second_choice; // For use in greedy matches.
3105 macro_assembler->Bind(&second_choice);
3106
3107 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3108
3109 EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3110
3111 macro_assembler->Bind(greedy_loop_state->label());
3112 // If we have unwound to the bottom then backtrack.
3113 macro_assembler->CheckGreedyLoop(trace->backtrack());
3114 // Otherwise try the second priority at an earlier position.
3115 macro_assembler->AdvanceCurrentPosition(-text_length);
3116 macro_assembler->GoTo(&second_choice);
3117 return new_trace;
3118 }
3119
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)3120 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3121 Trace* trace) {
3122 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3123 if (alternatives_->length() != 2) return eats_at_least;
3124
3125 GuardedAlternative alt1 = alternatives_->at(1);
3126 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3127 return eats_at_least;
3128 }
3129 RegExpNode* eats_anything_node = alt1.node();
3130 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3131 return eats_at_least;
3132 }
3133
3134 // Really we should be creating a new trace when we execute this function,
3135 // but there is no need, because the code it generates cannot backtrack, and
3136 // we always arrive here with a trivial trace (since it's the entry to a
3137 // loop. That also implies that there are no preloaded characters, which is
3138 // good, because it means we won't be violating any assumptions by
3139 // overwriting those characters with new load instructions.
3140 DCHECK(trace->is_trivial());
3141
3142 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3143 Isolate* isolate = macro_assembler->isolate();
3144 // At this point we know that we are at a non-greedy loop that will eat
3145 // any character one at a time. Any non-anchored regexp has such a
3146 // loop prepended to it in order to find where it starts. We look for
3147 // a pattern of the form ...abc... where we can look 6 characters ahead
3148 // and step forwards 3 if the character is not one of abc. Abc need
3149 // not be atoms, they can be any reasonably limited character class or
3150 // small alternation.
3151 BoyerMooreLookahead* bm = bm_info(false);
3152 if (bm == nullptr) {
3153 eats_at_least = Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
3154 if (eats_at_least >= 1) {
3155 bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
3156 GuardedAlternative alt0 = alternatives_->at(0);
3157 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
3158 }
3159 }
3160 if (bm != nullptr) {
3161 bm->EmitSkipInstructions(macro_assembler);
3162 }
3163 return eats_at_least;
3164 }
3165
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)3166 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3167 AlternativeGenerationList* alt_gens,
3168 int first_choice, Trace* trace,
3169 PreloadState* preload) {
3170 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3171 SetUpPreLoad(compiler, trace, preload);
3172
3173 // For now we just call all choices one after the other. The idea ultimately
3174 // is to use the Dispatch table to try only the relevant ones.
3175 int choice_count = alternatives_->length();
3176
3177 int new_flush_budget = trace->flush_budget() / choice_count;
3178
3179 for (int i = first_choice; i < choice_count; i++) {
3180 bool is_last = i == choice_count - 1;
3181 bool fall_through_on_failure = !is_last;
3182 GuardedAlternative alternative = alternatives_->at(i);
3183 AlternativeGeneration* alt_gen = alt_gens->at(i);
3184 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3185 ZoneList<Guard*>* guards = alternative.guards();
3186 int guard_count = (guards == nullptr) ? 0 : guards->length();
3187 Trace new_trace(*trace);
3188 new_trace.set_characters_preloaded(
3189 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3190 if (preload->preload_has_checked_bounds_) {
3191 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3192 }
3193 new_trace.quick_check_performed()->Clear();
3194 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3195 if (!is_last) {
3196 new_trace.set_backtrack(&alt_gen->after);
3197 }
3198 alt_gen->expects_preload = preload->preload_is_current_;
3199 bool generate_full_check_inline = false;
3200 if (compiler->optimize() &&
3201 try_to_emit_quick_check_for_alternative(i == 0) &&
3202 alternative.node()->EmitQuickCheck(
3203 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3204 &alt_gen->possible_success, &alt_gen->quick_check_details,
3205 fall_through_on_failure, this)) {
3206 // Quick check was generated for this choice.
3207 preload->preload_is_current_ = true;
3208 preload->preload_has_checked_bounds_ = true;
3209 // If we generated the quick check to fall through on possible success,
3210 // we now need to generate the full check inline.
3211 if (!fall_through_on_failure) {
3212 macro_assembler->Bind(&alt_gen->possible_success);
3213 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3214 new_trace.set_characters_preloaded(preload->preload_characters_);
3215 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3216 generate_full_check_inline = true;
3217 }
3218 } else if (alt_gen->quick_check_details.cannot_match()) {
3219 if (!fall_through_on_failure) {
3220 macro_assembler->GoTo(trace->backtrack());
3221 }
3222 continue;
3223 } else {
3224 // No quick check was generated. Put the full code here.
3225 // If this is not the first choice then there could be slow checks from
3226 // previous cases that go here when they fail. There's no reason to
3227 // insist that they preload characters since the slow check we are about
3228 // to generate probably can't use it.
3229 if (i != first_choice) {
3230 alt_gen->expects_preload = false;
3231 new_trace.InvalidateCurrentCharacter();
3232 }
3233 generate_full_check_inline = true;
3234 }
3235 if (generate_full_check_inline) {
3236 if (new_trace.actions() != nullptr) {
3237 new_trace.set_flush_budget(new_flush_budget);
3238 }
3239 for (int j = 0; j < guard_count; j++) {
3240 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3241 }
3242 alternative.node()->Emit(compiler, &new_trace);
3243 preload->preload_is_current_ = false;
3244 }
3245 macro_assembler->Bind(&alt_gen->after);
3246 }
3247 }
3248
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)3249 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3250 Trace* trace,
3251 GuardedAlternative alternative,
3252 AlternativeGeneration* alt_gen,
3253 int preload_characters,
3254 bool next_expects_preload) {
3255 if (!alt_gen->possible_success.is_linked()) return;
3256
3257 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3258 macro_assembler->Bind(&alt_gen->possible_success);
3259 Trace out_of_line_trace(*trace);
3260 out_of_line_trace.set_characters_preloaded(preload_characters);
3261 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3262 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3263 ZoneList<Guard*>* guards = alternative.guards();
3264 int guard_count = (guards == nullptr) ? 0 : guards->length();
3265 if (next_expects_preload) {
3266 Label reload_current_char;
3267 out_of_line_trace.set_backtrack(&reload_current_char);
3268 for (int j = 0; j < guard_count; j++) {
3269 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3270 }
3271 alternative.node()->Emit(compiler, &out_of_line_trace);
3272 macro_assembler->Bind(&reload_current_char);
3273 // Reload the current character, since the next quick check expects that.
3274 // We don't need to check bounds here because we only get into this
3275 // code through a quick check which already did the checked load.
3276 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3277 preload_characters);
3278 macro_assembler->GoTo(&(alt_gen->after));
3279 } else {
3280 out_of_line_trace.set_backtrack(&(alt_gen->after));
3281 for (int j = 0; j < guard_count; j++) {
3282 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3283 }
3284 alternative.node()->Emit(compiler, &out_of_line_trace);
3285 }
3286 }
3287
Emit(RegExpCompiler * compiler,Trace * trace)3288 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3289 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3290 LimitResult limit_result = LimitVersions(compiler, trace);
3291 if (limit_result == DONE) return;
3292 DCHECK(limit_result == CONTINUE);
3293
3294 RecursionCheck rc(compiler);
3295
3296 switch (action_type_) {
3297 case STORE_POSITION: {
3298 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3299 data_.u_position_register.is_capture,
3300 trace);
3301 Trace new_trace = *trace;
3302 new_trace.add_action(&new_capture);
3303 on_success()->Emit(compiler, &new_trace);
3304 break;
3305 }
3306 case INCREMENT_REGISTER: {
3307 Trace::DeferredIncrementRegister new_increment(
3308 data_.u_increment_register.reg);
3309 Trace new_trace = *trace;
3310 new_trace.add_action(&new_increment);
3311 on_success()->Emit(compiler, &new_trace);
3312 break;
3313 }
3314 case SET_REGISTER_FOR_LOOP: {
3315 Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
3316 data_.u_store_register.value);
3317 Trace new_trace = *trace;
3318 new_trace.add_action(&new_set);
3319 on_success()->Emit(compiler, &new_trace);
3320 break;
3321 }
3322 case CLEAR_CAPTURES: {
3323 Trace::DeferredClearCaptures new_capture(Interval(
3324 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3325 Trace new_trace = *trace;
3326 new_trace.add_action(&new_capture);
3327 on_success()->Emit(compiler, &new_trace);
3328 break;
3329 }
3330 case BEGIN_SUBMATCH:
3331 if (!trace->is_trivial()) {
3332 trace->Flush(compiler, this);
3333 } else {
3334 assembler->WriteCurrentPositionToRegister(
3335 data_.u_submatch.current_position_register, 0);
3336 assembler->WriteStackPointerToRegister(
3337 data_.u_submatch.stack_pointer_register);
3338 on_success()->Emit(compiler, trace);
3339 }
3340 break;
3341 case EMPTY_MATCH_CHECK: {
3342 int start_pos_reg = data_.u_empty_match_check.start_register;
3343 int stored_pos = 0;
3344 int rep_reg = data_.u_empty_match_check.repetition_register;
3345 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3346 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3347 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3348 // If we know we haven't advanced and there is no minimum we
3349 // can just backtrack immediately.
3350 assembler->GoTo(trace->backtrack());
3351 } else if (know_dist && stored_pos < trace->cp_offset()) {
3352 // If we know we've advanced we can generate the continuation
3353 // immediately.
3354 on_success()->Emit(compiler, trace);
3355 } else if (!trace->is_trivial()) {
3356 trace->Flush(compiler, this);
3357 } else {
3358 Label skip_empty_check;
3359 // If we have a minimum number of repetitions we check the current
3360 // number first and skip the empty check if it's not enough.
3361 if (has_minimum) {
3362 int limit = data_.u_empty_match_check.repetition_limit;
3363 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3364 }
3365 // If the match is empty we bail out, otherwise we fall through
3366 // to the on-success continuation.
3367 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3368 trace->backtrack());
3369 assembler->Bind(&skip_empty_check);
3370 on_success()->Emit(compiler, trace);
3371 }
3372 break;
3373 }
3374 case POSITIVE_SUBMATCH_SUCCESS: {
3375 if (!trace->is_trivial()) {
3376 trace->Flush(compiler, this);
3377 return;
3378 }
3379 assembler->ReadCurrentPositionFromRegister(
3380 data_.u_submatch.current_position_register);
3381 assembler->ReadStackPointerFromRegister(
3382 data_.u_submatch.stack_pointer_register);
3383 int clear_register_count = data_.u_submatch.clear_register_count;
3384 if (clear_register_count == 0) {
3385 on_success()->Emit(compiler, trace);
3386 return;
3387 }
3388 int clear_registers_from = data_.u_submatch.clear_register_from;
3389 Label clear_registers_backtrack;
3390 Trace new_trace = *trace;
3391 new_trace.set_backtrack(&clear_registers_backtrack);
3392 on_success()->Emit(compiler, &new_trace);
3393
3394 assembler->Bind(&clear_registers_backtrack);
3395 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3396 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3397
3398 DCHECK(trace->backtrack() == nullptr);
3399 assembler->Backtrack();
3400 return;
3401 }
3402 default:
3403 UNREACHABLE();
3404 }
3405 }
3406
Emit(RegExpCompiler * compiler,Trace * trace)3407 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3408 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3409 if (!trace->is_trivial()) {
3410 trace->Flush(compiler, this);
3411 return;
3412 }
3413
3414 LimitResult limit_result = LimitVersions(compiler, trace);
3415 if (limit_result == DONE) return;
3416 DCHECK(limit_result == CONTINUE);
3417
3418 RecursionCheck rc(compiler);
3419
3420 DCHECK_EQ(start_reg_ + 1, end_reg_);
3421 if (IgnoreCase(flags_)) {
3422 bool unicode = IsUnicode(flags_);
3423 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
3424 unicode, trace->backtrack());
3425 } else {
3426 assembler->CheckNotBackReference(start_reg_, read_backward(),
3427 trace->backtrack());
3428 }
3429 // We are going to advance backward, so we may end up at the start.
3430 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3431
3432 // Check that the back reference does not end inside a surrogate pair.
3433 if (IsUnicode(flags_) && !compiler->one_byte()) {
3434 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3435 }
3436 on_success()->Emit(compiler, trace);
3437 }
3438
CalculateOffsets()3439 void TextNode::CalculateOffsets() {
3440 int element_count = elements()->length();
3441 // Set up the offsets of the elements relative to the start. This is a fixed
3442 // quantity since a TextNode can only contain fixed-width things.
3443 int cp_offset = 0;
3444 for (int i = 0; i < element_count; i++) {
3445 TextElement& elm = elements()->at(i);
3446 elm.set_cp_offset(cp_offset);
3447 cp_offset += elm.length();
3448 }
3449 }
3450
3451 namespace {
3452
3453 // Assertion propagation moves information about assertions such as
3454 // \b to the affected nodes. For instance, in /.\b./ information must
3455 // be propagated to the first '.' that whatever follows needs to know
3456 // if it matched a word or a non-word, and to the second '.' that it
3457 // has to check if it succeeds a word or non-word. In this case the
3458 // result will be something like:
3459 //
3460 // +-------+ +------------+
3461 // | . | | . |
3462 // +-------+ ---> +------------+
3463 // | word? | | check word |
3464 // +-------+ +------------+
3465 class AssertionPropagator : public AllStatic {
3466 public:
VisitText(TextNode * that)3467 static void VisitText(TextNode* that) {}
3468
VisitAction(ActionNode * that)3469 static void VisitAction(ActionNode* that) {
3470 // If the next node is interested in what it follows then this node
3471 // has to be interested too so it can pass the information on.
3472 that->info()->AddFromFollowing(that->on_success()->info());
3473 }
3474
VisitChoice(ChoiceNode * that,int i)3475 static void VisitChoice(ChoiceNode* that, int i) {
3476 // Anything the following nodes need to know has to be known by
3477 // this node also, so it can pass it on.
3478 that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
3479 }
3480
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3481 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3482 that->info()->AddFromFollowing(that->continue_node()->info());
3483 }
3484
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3485 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
3486 that->info()->AddFromFollowing(that->loop_node()->info());
3487 }
3488
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3489 static void VisitNegativeLookaroundChoiceLookaroundNode(
3490 NegativeLookaroundChoiceNode* that) {
3491 VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
3492 }
3493
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3494 static void VisitNegativeLookaroundChoiceContinueNode(
3495 NegativeLookaroundChoiceNode* that) {
3496 VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
3497 }
3498
VisitBackReference(BackReferenceNode * that)3499 static void VisitBackReference(BackReferenceNode* that) {}
3500
VisitAssertion(AssertionNode * that)3501 static void VisitAssertion(AssertionNode* that) {}
3502 };
3503
3504 // Propagates information about the minimum size of successful matches from
3505 // successor nodes to their predecessors. Note that all eats_at_least values
3506 // are initialized to zero before analysis.
3507 class EatsAtLeastPropagator : public AllStatic {
3508 public:
VisitText(TextNode * that)3509 static void VisitText(TextNode* that) {
3510 // The eats_at_least value is not used if reading backward.
3511 if (!that->read_backward()) {
3512 // We are not at the start after this node, and thus we can use the
3513 // successor's eats_at_least_from_not_start value.
3514 uint8_t eats_at_least = base::saturated_cast<uint8_t>(
3515 that->Length() + that->on_success()
3516 ->eats_at_least_info()
3517 ->eats_at_least_from_not_start);
3518 that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
3519 }
3520 }
3521
VisitAction(ActionNode * that)3522 static void VisitAction(ActionNode* that) {
3523 // POSITIVE_SUBMATCH_SUCCESS rewinds input, so we must not consider
3524 // successor nodes for eats_at_least. SET_REGISTER_FOR_LOOP indicates a loop
3525 // entry point, which means the loop body will run at least the minimum
3526 // number of times before the continuation case can run. Otherwise the
3527 // current node eats at least as much as its successor.
3528 switch (that->action_type()) {
3529 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3530 break; // Was already initialized to zero.
3531 case ActionNode::SET_REGISTER_FOR_LOOP:
3532 that->set_eats_at_least_info(
3533 that->on_success()->EatsAtLeastFromLoopEntry());
3534 break;
3535 default:
3536 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3537 break;
3538 }
3539 }
3540
VisitChoice(ChoiceNode * that,int i)3541 static void VisitChoice(ChoiceNode* that, int i) {
3542 // The minimum possible match from a choice node is the minimum of its
3543 // successors.
3544 EatsAtLeastInfo eats_at_least =
3545 i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
3546 eats_at_least.SetMin(
3547 *that->alternatives()->at(i).node()->eats_at_least_info());
3548 that->set_eats_at_least_info(eats_at_least);
3549 }
3550
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3551 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3552 that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3553 }
3554
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3555 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
3556
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3557 static void VisitNegativeLookaroundChoiceLookaroundNode(
3558 NegativeLookaroundChoiceNode* that) {}
3559
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3560 static void VisitNegativeLookaroundChoiceContinueNode(
3561 NegativeLookaroundChoiceNode* that) {
3562 that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3563 }
3564
VisitBackReference(BackReferenceNode * that)3565 static void VisitBackReference(BackReferenceNode* that) {
3566 if (!that->read_backward()) {
3567 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3568 }
3569 }
3570
VisitAssertion(AssertionNode * that)3571 static void VisitAssertion(AssertionNode* that) {
3572 EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
3573 if (that->assertion_type() == AssertionNode::AT_START) {
3574 // If we know we are not at the start and we are asked "how many
3575 // characters will you match if you succeed?" then we can answer anything
3576 // since false implies false. So let's just set the max answer
3577 // (UINT8_MAX) since that won't prevent us from preloading a lot of
3578 // characters for the other branches in the node graph.
3579 eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
3580 }
3581 that->set_eats_at_least_info(eats_at_least);
3582 }
3583 };
3584
3585 } // namespace
3586
3587 // -------------------------------------------------------------------
3588 // Analysis
3589
3590 // Iterates the node graph and provides the opportunity for propagators to set
3591 // values that depend on successor nodes.
3592 template <typename... Propagators>
3593 class Analysis : public NodeVisitor {
3594 public:
Analysis(Isolate * isolate,bool is_one_byte)3595 Analysis(Isolate* isolate, bool is_one_byte)
3596 : isolate_(isolate),
3597 is_one_byte_(is_one_byte),
3598 error_(RegExpError::kNone) {}
3599
EnsureAnalyzed(RegExpNode * that)3600 void EnsureAnalyzed(RegExpNode* that) {
3601 StackLimitCheck check(isolate());
3602 if (check.HasOverflowed()) {
3603 if (FLAG_correctness_fuzzer_suppressions) {
3604 FATAL("Analysis: Aborting on stack overflow");
3605 }
3606 fail(RegExpError::kAnalysisStackOverflow);
3607 return;
3608 }
3609 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
3610 that->info()->being_analyzed = true;
3611 that->Accept(this);
3612 that->info()->being_analyzed = false;
3613 that->info()->been_analyzed = true;
3614 }
3615
has_failed()3616 bool has_failed() { return error_ != RegExpError::kNone; }
error()3617 RegExpError error() {
3618 DCHECK(error_ != RegExpError::kNone);
3619 return error_;
3620 }
fail(RegExpError error)3621 void fail(RegExpError error) { error_ = error; }
3622
isolate() const3623 Isolate* isolate() const { return isolate_; }
3624
VisitEnd(EndNode * that)3625 void VisitEnd(EndNode* that) override {
3626 // nothing to do
3627 }
3628
3629 // Used to call the given static function on each propagator / variadic template
3630 // argument.
3631 #define STATIC_FOR_EACH(expr) \
3632 do { \
3633 int dummy[] = {((expr), 0)...}; \
3634 USE(dummy); \
3635 } while (false)
3636
VisitText(TextNode * that)3637 void VisitText(TextNode* that) override {
3638 that->MakeCaseIndependent(isolate(), is_one_byte_);
3639 EnsureAnalyzed(that->on_success());
3640 if (has_failed()) return;
3641 that->CalculateOffsets();
3642 STATIC_FOR_EACH(Propagators::VisitText(that));
3643 }
3644
VisitAction(ActionNode * that)3645 void VisitAction(ActionNode* that) override {
3646 EnsureAnalyzed(that->on_success());
3647 if (has_failed()) return;
3648 STATIC_FOR_EACH(Propagators::VisitAction(that));
3649 }
3650
VisitChoice(ChoiceNode * that)3651 void VisitChoice(ChoiceNode* that) override {
3652 for (int i = 0; i < that->alternatives()->length(); i++) {
3653 EnsureAnalyzed(that->alternatives()->at(i).node());
3654 if (has_failed()) return;
3655 STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
3656 }
3657 }
3658
VisitLoopChoice(LoopChoiceNode * that)3659 void VisitLoopChoice(LoopChoiceNode* that) override {
3660 DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue.
3661
3662 // First propagate all information from the continuation node.
3663 EnsureAnalyzed(that->continue_node());
3664 if (has_failed()) return;
3665 STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
3666
3667 // Check the loop last since it may need the value of this node
3668 // to get a correct result.
3669 EnsureAnalyzed(that->loop_node());
3670 if (has_failed()) return;
3671 STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
3672 }
3673
VisitNegativeLookaroundChoice(NegativeLookaroundChoiceNode * that)3674 void VisitNegativeLookaroundChoice(
3675 NegativeLookaroundChoiceNode* that) override {
3676 DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue.
3677
3678 EnsureAnalyzed(that->lookaround_node());
3679 if (has_failed()) return;
3680 STATIC_FOR_EACH(
3681 Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
3682
3683 EnsureAnalyzed(that->continue_node());
3684 if (has_failed()) return;
3685 STATIC_FOR_EACH(
3686 Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
3687 }
3688
VisitBackReference(BackReferenceNode * that)3689 void VisitBackReference(BackReferenceNode* that) override {
3690 EnsureAnalyzed(that->on_success());
3691 if (has_failed()) return;
3692 STATIC_FOR_EACH(Propagators::VisitBackReference(that));
3693 }
3694
VisitAssertion(AssertionNode * that)3695 void VisitAssertion(AssertionNode* that) override {
3696 EnsureAnalyzed(that->on_success());
3697 if (has_failed()) return;
3698 STATIC_FOR_EACH(Propagators::VisitAssertion(that));
3699 }
3700
3701 #undef STATIC_FOR_EACH
3702
3703 private:
3704 Isolate* isolate_;
3705 bool is_one_byte_;
3706 RegExpError error_;
3707
3708 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
3709 };
3710
AnalyzeRegExp(Isolate * isolate,bool is_one_byte,RegExpNode * node)3711 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
3712 RegExpNode* node) {
3713 Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
3714 is_one_byte);
3715 DCHECK_EQ(node->info()->been_analyzed, false);
3716 analysis.EnsureAnalyzed(node);
3717 DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
3718 return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
3719 }
3720
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3721 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3722 BoyerMooreLookahead* bm,
3723 bool not_at_start) {
3724 // Working out the set of characters that a backreference can match is too
3725 // hard, so we just say that any character can match.
3726 bm->SetRest(offset);
3727 SaveBMInfo(bm, not_at_start, offset);
3728 }
3729
3730 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
3731 RegExpMacroAssembler::kTableSize);
3732
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3733 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3734 BoyerMooreLookahead* bm, bool not_at_start) {
3735 ZoneList<GuardedAlternative>* alts = alternatives();
3736 budget = (budget - 1) / alts->length();
3737 for (int i = 0; i < alts->length(); i++) {
3738 GuardedAlternative& alt = alts->at(i);
3739 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
3740 bm->SetRest(offset); // Give up trying to fill in info.
3741 SaveBMInfo(bm, not_at_start, offset);
3742 return;
3743 }
3744 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
3745 }
3746 SaveBMInfo(bm, not_at_start, offset);
3747 }
3748
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3749 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
3750 BoyerMooreLookahead* bm, bool not_at_start) {
3751 if (initial_offset >= bm->length()) return;
3752 int offset = initial_offset;
3753 int max_char = bm->max_char();
3754 for (int i = 0; i < elements()->length(); i++) {
3755 if (offset >= bm->length()) {
3756 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3757 return;
3758 }
3759 TextElement text = elements()->at(i);
3760 if (text.text_type() == TextElement::ATOM) {
3761 RegExpAtom* atom = text.atom();
3762 for (int j = 0; j < atom->length(); j++, offset++) {
3763 if (offset >= bm->length()) {
3764 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3765 return;
3766 }
3767 uc16 character = atom->data()[j];
3768 if (IgnoreCase(atom->flags())) {
3769 unibrow::uchar chars[4];
3770 int length = GetCaseIndependentLetters(
3771 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
3772 chars, 4);
3773 for (int j = 0; j < length; j++) {
3774 bm->Set(offset, chars[j]);
3775 }
3776 } else {
3777 if (character <= max_char) bm->Set(offset, character);
3778 }
3779 }
3780 } else {
3781 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
3782 RegExpCharacterClass* char_class = text.char_class();
3783 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
3784 if (char_class->is_negated()) {
3785 bm->SetAll(offset);
3786 } else {
3787 for (int k = 0; k < ranges->length(); k++) {
3788 CharacterRange& range = ranges->at(k);
3789 if (static_cast<int>(range.from()) > max_char) continue;
3790 int to = Min(max_char, static_cast<int>(range.to()));
3791 bm->SetInterval(offset, Interval(range.from(), to));
3792 }
3793 }
3794 offset++;
3795 }
3796 }
3797 if (offset >= bm->length()) {
3798 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3799 return;
3800 }
3801 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
3802 true); // Not at start after a text node.
3803 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3804 }
3805
OptionallyStepBackToLeadSurrogate(RegExpNode * on_success,JSRegExp::Flags flags)3806 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
3807 RegExpNode* on_success, JSRegExp::Flags flags) {
3808 DCHECK(!read_backward());
3809 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3810 zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3811 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3812 zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3813
3814 ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
3815
3816 int stack_register = UnicodeLookaroundStackRegister();
3817 int position_register = UnicodeLookaroundPositionRegister();
3818 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
3819 zone(), lead_surrogates, true, on_success, flags);
3820 RegExpLookaround::Builder builder(true, step_back, stack_register,
3821 position_register);
3822 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
3823 zone(), trail_surrogates, false, builder.on_match_success(), flags);
3824
3825 optional_step_back->AddAlternative(
3826 GuardedAlternative(builder.ForMatch(match_trail)));
3827 optional_step_back->AddAlternative(GuardedAlternative(on_success));
3828
3829 return optional_step_back;
3830 }
3831
PreprocessRegExp(RegExpCompileData * data,JSRegExp::Flags flags,bool is_one_byte)3832 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
3833 JSRegExp::Flags flags,
3834 bool is_one_byte) {
3835 // Wrap the body of the regexp in capture #0.
3836 RegExpNode* captured_body =
3837 RegExpCapture::ToNode(data->tree, 0, this, accept());
3838 RegExpNode* node = captured_body;
3839 if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
3840 // Add a .*? at the beginning, outside the body capture, unless
3841 // this expression is anchored at the beginning or sticky.
3842 JSRegExp::Flags default_flags = JSRegExp::Flags();
3843 RegExpNode* loop_node = RegExpQuantifier::ToNode(
3844 0, RegExpTree::kInfinity, false,
3845 zone()->New<RegExpCharacterClass>('*', default_flags), this,
3846 captured_body, data->contains_anchor);
3847
3848 if (data->contains_anchor) {
3849 // Unroll loop once, to take care of the case that might start
3850 // at the start of input.
3851 ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
3852 first_step_node->AddAlternative(GuardedAlternative(captured_body));
3853 first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
3854 zone()->New<RegExpCharacterClass>('*', default_flags), false,
3855 loop_node)));
3856 node = first_step_node;
3857 } else {
3858 node = loop_node;
3859 }
3860 }
3861 if (is_one_byte) {
3862 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
3863 // Do it again to propagate the new nodes to places where they were not
3864 // put because they had not been calculated yet.
3865 if (node != nullptr) {
3866 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
3867 }
3868 } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
3869 node = OptionallyStepBackToLeadSurrogate(node, flags);
3870 }
3871
3872 if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
3873 return node;
3874 }
3875
3876 } // namespace internal
3877 } // namespace v8
3878