1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-compiler.h"
6
7 #include "src/base/safe_conversions.h"
8 #include "src/execution/isolate.h"
9 #include "src/objects/fixed-array-inl.h"
10 #include "src/regexp/regexp-macro-assembler-arch.h"
11 #include "src/strings/unicode-inl.h"
12 #include "src/zone/zone-list-inl.h"
13
14 #ifdef V8_INTL_SUPPORT
15 #include "src/regexp/special-case.h"
16 #include "unicode/locid.h"
17 #include "unicode/uniset.h"
18 #include "unicode/utypes.h"
19 #endif // V8_INTL_SUPPORT
20
21 namespace v8 {
22 namespace internal {
23
24 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
25
26 // -------------------------------------------------------------------
27 // Implementation of the Irregexp regular expression engine.
28 //
29 // The Irregexp regular expression engine is intended to be a complete
30 // implementation of ECMAScript regular expressions. It generates either
31 // bytecodes or native code.
32
33 // The Irregexp regexp engine is structured in three steps.
34 // 1) The parser generates an abstract syntax tree. See ast.cc.
35 // 2) From the AST a node network is created. The nodes are all
36 // subclasses of RegExpNode. The nodes represent states when
37 // executing a regular expression. Several optimizations are
38 // performed on the node network.
39 // 3) From the nodes we generate either byte codes or native code
40 // that can actually execute the regular expression (perform
41 // the search). The code generation step is described in more
42 // detail below.
43
44 // Code generation.
45 //
46 // The nodes are divided into four main categories.
47 // * Choice nodes
48 // These represent places where the regular expression can
49 // match in more than one way. For example on entry to an
50 // alternation (foo|bar) or a repetition (*, +, ? or {}).
51 // * Action nodes
52 // These represent places where some action should be
53 // performed. Examples include recording the current position
54 // in the input string to a register (in order to implement
55 // captures) or other actions on register for example in order
56 // to implement the counters needed for {} repetitions.
57 // * Matching nodes
58 // These attempt to match some element part of the input string.
59 // Examples of elements include character classes, plain strings
60 // or back references.
61 // * End nodes
62 // These are used to implement the actions required on finding
63 // a successful match or failing to find a match.
64 //
65 // The code generated (whether as byte codes or native code) maintains
66 // some state as it runs. This consists of the following elements:
67 //
68 // * The capture registers. Used for string captures.
69 // * Other registers. Used for counters etc.
70 // * The current position.
71 // * The stack of backtracking information. Used when a matching node
72 // fails to find a match and needs to try an alternative.
73 //
74 // Conceptual regular expression execution model:
75 //
76 // There is a simple conceptual model of regular expression execution
77 // which will be presented first. The actual code generated is a more
78 // efficient simulation of the simple conceptual model:
79 //
80 // * Choice nodes are implemented as follows:
81 // For each choice except the last {
82 // push current position
83 // push backtrack code location
84 // <generate code to test for choice>
85 // backtrack code location:
86 // pop current position
87 // }
88 // <generate code to test for last choice>
89 //
90 // * Actions nodes are generated as follows
91 // <push affected registers on backtrack stack>
92 // <generate code to perform action>
93 // push backtrack code location
94 // <generate code to test for following nodes>
95 // backtrack code location:
96 // <pop affected registers to restore their state>
97 // <pop backtrack location from stack and go to it>
98 //
99 // * Matching nodes are generated as follows:
100 // if input string matches at current position
101 // update current position
102 // <generate code to test for following nodes>
103 // else
104 // <pop backtrack location from stack and go to it>
105 //
106 // Thus it can be seen that the current position is saved and restored
107 // by the choice nodes, whereas the registers are saved and restored by
108 // by the action nodes that manipulate them.
109 //
110 // The other interesting aspect of this model is that nodes are generated
111 // at the point where they are needed by a recursive call to Emit(). If
112 // the node has already been code generated then the Emit() call will
113 // generate a jump to the previously generated code instead. In order to
114 // limit recursion it is possible for the Emit() function to put the node
115 // on a work list for later generation and instead generate a jump. The
116 // destination of the jump is resolved later when the code is generated.
117 //
118 // Actual regular expression code generation.
119 //
120 // Code generation is actually more complicated than the above. In order
121 // to improve the efficiency of the generated code some optimizations are
122 // performed
123 //
124 // * Choice nodes have 1-character lookahead.
125 // A choice node looks at the following character and eliminates some of
126 // the choices immediately based on that character. This is not yet
127 // implemented.
128 // * Simple greedy loops store reduced backtracking information.
129 // A quantifier like /.*foo/m will greedily match the whole input. It will
130 // then need to backtrack to a point where it can match "foo". The naive
131 // implementation of this would push each character position onto the
132 // backtracking stack, then pop them off one by one. This would use space
133 // proportional to the length of the input string. However since the "."
134 // can only match in one way and always has a constant length (in this case
135 // of 1) it suffices to store the current position on the top of the stack
136 // once. Matching now becomes merely incrementing the current position and
137 // backtracking becomes decrementing the current position and checking the
138 // result against the stored current position. This is faster and saves
139 // space.
140 // * The current state is virtualized.
141 // This is used to defer expensive operations until it is clear that they
142 // are needed and to generate code for a node more than once, allowing
143 // specialized an efficient versions of the code to be created. This is
144 // explained in the section below.
145 //
146 // Execution state virtualization.
147 //
148 // Instead of emitting code, nodes that manipulate the state can record their
149 // manipulation in an object called the Trace. The Trace object can record a
150 // current position offset, an optional backtrack code location on the top of
151 // the virtualized backtrack stack and some register changes. When a node is
152 // to be emitted it can flush the Trace or update it. Flushing the Trace
153 // will emit code to bring the actual state into line with the virtual state.
154 // Avoiding flushing the state can postpone some work (e.g. updates of capture
155 // registers). Postponing work can save time when executing the regular
156 // expression since it may be found that the work never has to be done as a
157 // failure to match can occur. In addition it is much faster to jump to a
158 // known backtrack code location than it is to pop an unknown backtrack
159 // location from the stack and jump there.
160 //
161 // The virtual state found in the Trace affects code generation. For example
162 // the virtual state contains the difference between the actual current
163 // position and the virtual current position, and matching code needs to use
164 // this offset to attempt a match in the correct location of the input
165 // string. Therefore code generated for a non-trivial trace is specialized
166 // to that trace. The code generator therefore has the ability to generate
167 // code for each node several times. In order to limit the size of the
168 // generated code there is an arbitrary limit on how many specialized sets of
169 // code may be generated for a given node. If the limit is reached, the
170 // trace is flushed and a generic version of the code for a node is emitted.
171 // This is subsequently used for that node. The code emitted for non-generic
172 // trace is not recorded in the node and so it cannot currently be reused in
173 // the event that code generation is requested for an identical trace.
174
175 namespace {
176
MaxCodeUnit(const bool one_byte)177 constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
178 STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
179 std::numeric_limits<uint16_t>::max());
180 STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
181 std::numeric_limits<uint16_t>::max());
182 return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
183 }
184
CharMask(const bool one_byte)185 constexpr uint32_t CharMask(const bool one_byte) {
186 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
187 STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
188 return MaxCodeUnit(one_byte);
189 }
190
191 } // namespace
192
AppendToText(RegExpText * text,Zone * zone)193 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); }
194
AppendToText(RegExpText * text,Zone * zone)195 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
196 text->AddElement(TextElement::Atom(this), zone);
197 }
198
AppendToText(RegExpText * text,Zone * zone)199 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
200 text->AddElement(TextElement::CharClass(this), zone);
201 }
202
AppendToText(RegExpText * text,Zone * zone)203 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
204 for (int i = 0; i < elements()->length(); i++)
205 text->AddElement(elements()->at(i), zone);
206 }
207
Atom(RegExpAtom * atom)208 TextElement TextElement::Atom(RegExpAtom* atom) {
209 return TextElement(ATOM, atom);
210 }
211
CharClass(RegExpCharacterClass * char_class)212 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
213 return TextElement(CHAR_CLASS, char_class);
214 }
215
length() const216 int TextElement::length() const {
217 switch (text_type()) {
218 case ATOM:
219 return atom()->length();
220
221 case CHAR_CLASS:
222 return 1;
223 }
224 UNREACHABLE();
225 }
226
227 class RecursionCheck {
228 public:
RecursionCheck(RegExpCompiler * compiler)229 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
230 compiler->IncrementRecursionDepth();
231 }
~RecursionCheck()232 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
233
234 private:
235 RegExpCompiler* compiler_;
236 };
237
238 // Attempts to compile the regexp using an Irregexp code generator. Returns
239 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,RegExpFlags flags,bool one_byte)240 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
241 RegExpFlags flags, bool one_byte)
242 : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
243 unicode_lookaround_stack_register_(kNoRegister),
244 unicode_lookaround_position_register_(kNoRegister),
245 work_list_(nullptr),
246 recursion_depth_(0),
247 flags_(flags),
248 one_byte_(one_byte),
249 reg_exp_too_big_(false),
250 limiting_recursion_(false),
251 optimize_(FLAG_regexp_optimization),
252 read_backward_(false),
253 current_expansion_factor_(1),
254 frequency_collator_(),
255 isolate_(isolate),
256 zone_(zone) {
257 accept_ = zone->New<EndNode>(EndNode::ACCEPT, zone);
258 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
259 }
260
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)261 RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
262 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
263 int capture_count, Handle<String> pattern) {
264 macro_assembler_ = macro_assembler;
265
266 ZoneVector<RegExpNode*> work_list(zone());
267 work_list_ = &work_list;
268 Label fail;
269 macro_assembler_->PushBacktrack(&fail);
270 Trace new_trace;
271 start->Emit(this, &new_trace);
272 macro_assembler_->BindJumpTarget(&fail);
273 macro_assembler_->Fail();
274 while (!work_list.empty()) {
275 RegExpNode* node = work_list.back();
276 work_list.pop_back();
277 node->set_on_work_list(false);
278 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
279 }
280 if (reg_exp_too_big_) {
281 if (FLAG_correctness_fuzzer_suppressions) {
282 FATAL("Aborting on excess zone allocation");
283 }
284 macro_assembler_->AbortedCodeGeneration();
285 return CompilationResult::RegExpTooBig();
286 }
287
288 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
289 isolate->IncreaseTotalRegexpCodeGenerated(code);
290 work_list_ = nullptr;
291
292 return {code, next_register_};
293 }
294
Mentions(int that)295 bool Trace::DeferredAction::Mentions(int that) {
296 if (action_type() == ActionNode::CLEAR_CAPTURES) {
297 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
298 return range.Contains(that);
299 } else {
300 return reg() == that;
301 }
302 }
303
mentions_reg(int reg)304 bool Trace::mentions_reg(int reg) {
305 for (DeferredAction* action = actions_; action != nullptr;
306 action = action->next()) {
307 if (action->Mentions(reg)) return true;
308 }
309 return false;
310 }
311
GetStoredPosition(int reg,int * cp_offset)312 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
313 DCHECK_EQ(0, *cp_offset);
314 for (DeferredAction* action = actions_; action != nullptr;
315 action = action->next()) {
316 if (action->Mentions(reg)) {
317 if (action->action_type() == ActionNode::STORE_POSITION) {
318 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
319 return true;
320 } else {
321 return false;
322 }
323 }
324 }
325 return false;
326 }
327
328 // A (dynamically-sized) set of unsigned integers that behaves especially well
329 // on small integers (< kFirstLimit). May do zone-allocation.
330 class DynamicBitSet : public ZoneObject {
331 public:
Get(unsigned value) const332 V8_EXPORT_PRIVATE bool Get(unsigned value) const {
333 if (value < kFirstLimit) {
334 return (first_ & (1 << value)) != 0;
335 } else if (remaining_ == nullptr) {
336 return false;
337 } else {
338 return remaining_->Contains(value);
339 }
340 }
341
342 // Destructively set a value in this set.
Set(unsigned value,Zone * zone)343 void Set(unsigned value, Zone* zone) {
344 if (value < kFirstLimit) {
345 first_ |= (1 << value);
346 } else {
347 if (remaining_ == nullptr)
348 remaining_ = zone->New<ZoneList<unsigned>>(1, zone);
349 if (remaining_->is_empty() || !remaining_->Contains(value))
350 remaining_->Add(value, zone);
351 }
352 }
353
354 private:
355 static constexpr unsigned kFirstLimit = 32;
356
357 uint32_t first_ = 0;
358 ZoneList<unsigned>* remaining_ = nullptr;
359 };
360
FindAffectedRegisters(DynamicBitSet * affected_registers,Zone * zone)361 int Trace::FindAffectedRegisters(DynamicBitSet* affected_registers,
362 Zone* zone) {
363 int max_register = RegExpCompiler::kNoRegister;
364 for (DeferredAction* action = actions_; action != nullptr;
365 action = action->next()) {
366 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
367 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
368 for (int i = range.from(); i <= range.to(); i++)
369 affected_registers->Set(i, zone);
370 if (range.to() > max_register) max_register = range.to();
371 } else {
372 affected_registers->Set(action->reg(), zone);
373 if (action->reg() > max_register) max_register = action->reg();
374 }
375 }
376 return max_register;
377 }
378
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & registers_to_pop,const DynamicBitSet & registers_to_clear)379 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
380 int max_register,
381 const DynamicBitSet& registers_to_pop,
382 const DynamicBitSet& registers_to_clear) {
383 for (int reg = max_register; reg >= 0; reg--) {
384 if (registers_to_pop.Get(reg)) {
385 assembler->PopRegister(reg);
386 } else if (registers_to_clear.Get(reg)) {
387 int clear_to = reg;
388 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
389 reg--;
390 }
391 assembler->ClearRegisters(reg, clear_to);
392 }
393 }
394 }
395
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const DynamicBitSet & affected_registers,DynamicBitSet * registers_to_pop,DynamicBitSet * registers_to_clear,Zone * zone)396 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
397 int max_register,
398 const DynamicBitSet& affected_registers,
399 DynamicBitSet* registers_to_pop,
400 DynamicBitSet* registers_to_clear,
401 Zone* zone) {
402 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
403 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
404
405 // Count pushes performed to force a stack limit check occasionally.
406 int pushes = 0;
407
408 for (int reg = 0; reg <= max_register; reg++) {
409 if (!affected_registers.Get(reg)) continue;
410
411 // The chronologically first deferred action in the trace
412 // is used to infer the action needed to restore a register
413 // to its previous state (or not, if it's safe to ignore it).
414 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
415 DeferredActionUndoType undo_action = IGNORE;
416
417 int value = 0;
418 bool absolute = false;
419 bool clear = false;
420 static const int kNoStore = kMinInt;
421 int store_position = kNoStore;
422 // This is a little tricky because we are scanning the actions in reverse
423 // historical order (newest first).
424 for (DeferredAction* action = actions_; action != nullptr;
425 action = action->next()) {
426 if (action->Mentions(reg)) {
427 switch (action->action_type()) {
428 case ActionNode::SET_REGISTER_FOR_LOOP: {
429 Trace::DeferredSetRegisterForLoop* psr =
430 static_cast<Trace::DeferredSetRegisterForLoop*>(action);
431 if (!absolute) {
432 value += psr->value();
433 absolute = true;
434 }
435 // SET_REGISTER_FOR_LOOP is only used for newly introduced loop
436 // counters. They can have a significant previous value if they
437 // occur in a loop. TODO(lrn): Propagate this information, so
438 // we can set undo_action to IGNORE if we know there is no value to
439 // restore.
440 undo_action = RESTORE;
441 DCHECK_EQ(store_position, kNoStore);
442 DCHECK(!clear);
443 break;
444 }
445 case ActionNode::INCREMENT_REGISTER:
446 if (!absolute) {
447 value++;
448 }
449 DCHECK_EQ(store_position, kNoStore);
450 DCHECK(!clear);
451 undo_action = RESTORE;
452 break;
453 case ActionNode::STORE_POSITION: {
454 Trace::DeferredCapture* pc =
455 static_cast<Trace::DeferredCapture*>(action);
456 if (!clear && store_position == kNoStore) {
457 store_position = pc->cp_offset();
458 }
459
460 // For captures we know that stores and clears alternate.
461 // Other register, are never cleared, and if the occur
462 // inside a loop, they might be assigned more than once.
463 if (reg <= 1) {
464 // Registers zero and one, aka "capture zero", is
465 // always set correctly if we succeed. There is no
466 // need to undo a setting on backtrack, because we
467 // will set it again or fail.
468 undo_action = IGNORE;
469 } else {
470 undo_action = pc->is_capture() ? CLEAR : RESTORE;
471 }
472 DCHECK(!absolute);
473 DCHECK_EQ(value, 0);
474 break;
475 }
476 case ActionNode::CLEAR_CAPTURES: {
477 // Since we're scanning in reverse order, if we've already
478 // set the position we have to ignore historically earlier
479 // clearing operations.
480 if (store_position == kNoStore) {
481 clear = true;
482 }
483 undo_action = RESTORE;
484 DCHECK(!absolute);
485 DCHECK_EQ(value, 0);
486 break;
487 }
488 default:
489 UNREACHABLE();
490 }
491 }
492 }
493 // Prepare for the undo-action (e.g., push if it's going to be popped).
494 if (undo_action == RESTORE) {
495 pushes++;
496 RegExpMacroAssembler::StackCheckFlag stack_check =
497 RegExpMacroAssembler::kNoStackLimitCheck;
498 if (pushes == push_limit) {
499 stack_check = RegExpMacroAssembler::kCheckStackLimit;
500 pushes = 0;
501 }
502
503 assembler->PushRegister(reg, stack_check);
504 registers_to_pop->Set(reg, zone);
505 } else if (undo_action == CLEAR) {
506 registers_to_clear->Set(reg, zone);
507 }
508 // Perform the chronologically last action (or accumulated increment)
509 // for the register.
510 if (store_position != kNoStore) {
511 assembler->WriteCurrentPositionToRegister(reg, store_position);
512 } else if (clear) {
513 assembler->ClearRegisters(reg, reg);
514 } else if (absolute) {
515 assembler->SetRegister(reg, value);
516 } else if (value != 0) {
517 assembler->AdvanceRegister(reg, value);
518 }
519 }
520 }
521
522 // This is called as we come into a loop choice node and some other tricky
523 // nodes. It normalizes the state of the code generator to ensure we can
524 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)525 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
526 RegExpMacroAssembler* assembler = compiler->macro_assembler();
527
528 DCHECK(!is_trivial());
529
530 if (actions_ == nullptr && backtrack() == nullptr) {
531 // Here we just have some deferred cp advances to fix and we are back to
532 // a normal situation. We may also have to forget some information gained
533 // through a quick check that was already performed.
534 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
535 // Create a new trivial state and generate the node with that.
536 Trace new_state;
537 successor->Emit(compiler, &new_state);
538 return;
539 }
540
541 // Generate deferred actions here along with code to undo them again.
542 DynamicBitSet affected_registers;
543
544 if (backtrack() != nullptr) {
545 // Here we have a concrete backtrack location. These are set up by choice
546 // nodes and so they indicate that we have a deferred save of the current
547 // position which we may need to emit here.
548 assembler->PushCurrentPosition();
549 }
550
551 int max_register =
552 FindAffectedRegisters(&affected_registers, compiler->zone());
553 DynamicBitSet registers_to_pop;
554 DynamicBitSet registers_to_clear;
555 PerformDeferredActions(assembler, max_register, affected_registers,
556 ®isters_to_pop, ®isters_to_clear,
557 compiler->zone());
558 if (cp_offset_ != 0) {
559 assembler->AdvanceCurrentPosition(cp_offset_);
560 }
561
562 // Create a new trivial state and generate the node with that.
563 Label undo;
564 assembler->PushBacktrack(&undo);
565 if (successor->KeepRecursing(compiler)) {
566 Trace new_state;
567 successor->Emit(compiler, &new_state);
568 } else {
569 compiler->AddWork(successor);
570 assembler->GoTo(successor->label());
571 }
572
573 // On backtrack we need to restore state.
574 assembler->BindJumpTarget(&undo);
575 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
576 registers_to_clear);
577 if (backtrack() == nullptr) {
578 assembler->Backtrack();
579 } else {
580 assembler->PopCurrentPosition();
581 assembler->GoTo(backtrack());
582 }
583 }
584
Emit(RegExpCompiler * compiler,Trace * trace)585 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
586 RegExpMacroAssembler* assembler = compiler->macro_assembler();
587
588 // Omit flushing the trace. We discard the entire stack frame anyway.
589
590 if (!label()->is_bound()) {
591 // We are completely independent of the trace, since we ignore it,
592 // so this code can be used as the generic version.
593 assembler->Bind(label());
594 }
595
596 // Throw away everything on the backtrack stack since the start
597 // of the negative submatch and restore the character position.
598 assembler->ReadCurrentPositionFromRegister(current_position_register_);
599 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
600 if (clear_capture_count_ > 0) {
601 // Clear any captures that might have been performed during the success
602 // of the body of the negative look-ahead.
603 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
604 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
605 }
606 // Now that we have unwound the stack we find at the top of the stack the
607 // backtrack that the BeginNegativeSubmatch node got.
608 assembler->Backtrack();
609 }
610
Emit(RegExpCompiler * compiler,Trace * trace)611 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
612 if (!trace->is_trivial()) {
613 trace->Flush(compiler, this);
614 return;
615 }
616 RegExpMacroAssembler* assembler = compiler->macro_assembler();
617 if (!label()->is_bound()) {
618 assembler->Bind(label());
619 }
620 switch (action_) {
621 case ACCEPT:
622 assembler->Succeed();
623 return;
624 case BACKTRACK:
625 assembler->GoTo(trace->backtrack());
626 return;
627 case NEGATIVE_SUBMATCH_SUCCESS:
628 // This case is handled in a different virtual method.
629 UNREACHABLE();
630 }
631 UNIMPLEMENTED();
632 }
633
AddGuard(Guard * guard,Zone * zone)634 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
635 if (guards_ == nullptr) guards_ = zone->New<ZoneList<Guard*>>(1, zone);
636 guards_->Add(guard, zone);
637 }
638
SetRegisterForLoop(int reg,int val,RegExpNode * on_success)639 ActionNode* ActionNode::SetRegisterForLoop(int reg, int val,
640 RegExpNode* on_success) {
641 ActionNode* result =
642 on_success->zone()->New<ActionNode>(SET_REGISTER_FOR_LOOP, on_success);
643 result->data_.u_store_register.reg = reg;
644 result->data_.u_store_register.value = val;
645 return result;
646 }
647
IncrementRegister(int reg,RegExpNode * on_success)648 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
649 ActionNode* result =
650 on_success->zone()->New<ActionNode>(INCREMENT_REGISTER, on_success);
651 result->data_.u_increment_register.reg = reg;
652 return result;
653 }
654
StorePosition(int reg,bool is_capture,RegExpNode * on_success)655 ActionNode* ActionNode::StorePosition(int reg, bool is_capture,
656 RegExpNode* on_success) {
657 ActionNode* result =
658 on_success->zone()->New<ActionNode>(STORE_POSITION, on_success);
659 result->data_.u_position_register.reg = reg;
660 result->data_.u_position_register.is_capture = is_capture;
661 return result;
662 }
663
ClearCaptures(Interval range,RegExpNode * on_success)664 ActionNode* ActionNode::ClearCaptures(Interval range, RegExpNode* on_success) {
665 ActionNode* result =
666 on_success->zone()->New<ActionNode>(CLEAR_CAPTURES, on_success);
667 result->data_.u_clear_captures.range_from = range.from();
668 result->data_.u_clear_captures.range_to = range.to();
669 return result;
670 }
671
BeginPositiveSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)672 ActionNode* ActionNode::BeginPositiveSubmatch(int stack_reg, int position_reg,
673 RegExpNode* on_success) {
674 ActionNode* result =
675 on_success->zone()->New<ActionNode>(BEGIN_POSITIVE_SUBMATCH, on_success);
676 result->data_.u_submatch.stack_pointer_register = stack_reg;
677 result->data_.u_submatch.current_position_register = position_reg;
678 return result;
679 }
680
BeginNegativeSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)681 ActionNode* ActionNode::BeginNegativeSubmatch(int stack_reg, int position_reg,
682 RegExpNode* on_success) {
683 ActionNode* result =
684 on_success->zone()->New<ActionNode>(BEGIN_NEGATIVE_SUBMATCH, on_success);
685 result->data_.u_submatch.stack_pointer_register = stack_reg;
686 result->data_.u_submatch.current_position_register = position_reg;
687 return result;
688 }
689
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)690 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg, int position_reg,
691 int clear_register_count,
692 int clear_register_from,
693 RegExpNode* on_success) {
694 ActionNode* result = on_success->zone()->New<ActionNode>(
695 POSITIVE_SUBMATCH_SUCCESS, on_success);
696 result->data_.u_submatch.stack_pointer_register = stack_reg;
697 result->data_.u_submatch.current_position_register = position_reg;
698 result->data_.u_submatch.clear_register_count = clear_register_count;
699 result->data_.u_submatch.clear_register_from = clear_register_from;
700 return result;
701 }
702
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)703 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
704 int repetition_register,
705 int repetition_limit,
706 RegExpNode* on_success) {
707 ActionNode* result =
708 on_success->zone()->New<ActionNode>(EMPTY_MATCH_CHECK, on_success);
709 result->data_.u_empty_match_check.start_register = start_register;
710 result->data_.u_empty_match_check.repetition_register = repetition_register;
711 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
712 return result;
713 }
714
715 #define DEFINE_ACCEPT(Type) \
716 void Type##Node::Accept(NodeVisitor* visitor) { visitor->Visit##Type(this); }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)717 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
718 #undef DEFINE_ACCEPT
719
720 // -------------------------------------------------------------------
721 // Emit code.
722
723 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
724 Guard* guard, Trace* trace) {
725 switch (guard->op()) {
726 case Guard::LT:
727 DCHECK(!trace->mentions_reg(guard->reg()));
728 macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
729 trace->backtrack());
730 break;
731 case Guard::GEQ:
732 DCHECK(!trace->mentions_reg(guard->reg()));
733 macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
734 trace->backtrack());
735 break;
736 }
737 }
738
739 namespace {
740
741 #ifdef DEBUG
ContainsOnlyUtf16CodeUnits(unibrow::uchar * chars,int length)742 bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
743 STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
744 for (int i = 0; i < length; i++) {
745 if (chars[i] > String::kMaxUtf16CodeUnit) return false;
746 }
747 return true;
748 }
749 #endif // DEBUG
750
751 // Returns the number of characters in the equivalence class, omitting those
752 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,base::uc16 character,bool one_byte_subject,unibrow::uchar * letters,int letter_length)753 int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
754 bool one_byte_subject, unibrow::uchar* letters,
755 int letter_length) {
756 #ifdef V8_INTL_SUPPORT
757 if (RegExpCaseFolding::IgnoreSet().contains(character)) {
758 letters[0] = character;
759 DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
760 return 1;
761 }
762 bool in_special_add_set =
763 RegExpCaseFolding::SpecialAddSet().contains(character);
764
765 icu::UnicodeSet set;
766 set.add(character);
767 set = set.closeOver(USET_CASE_INSENSITIVE);
768
769 UChar32 canon = 0;
770 if (in_special_add_set) {
771 canon = RegExpCaseFolding::Canonicalize(character);
772 }
773
774 int32_t range_count = set.getRangeCount();
775 int items = 0;
776 for (int32_t i = 0; i < range_count; i++) {
777 UChar32 start = set.getRangeStart(i);
778 UChar32 end = set.getRangeEnd(i);
779 CHECK(end - start + items <= letter_length);
780 for (UChar32 cu = start; cu <= end; cu++) {
781 if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
782 if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
783 continue;
784 }
785 letters[items++] = static_cast<unibrow::uchar>(cu);
786 }
787 }
788 DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
789 return items;
790 #else
791 int length =
792 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
793 // Unibrow returns 0 or 1 for characters where case independence is
794 // trivial.
795 if (length == 0) {
796 letters[0] = character;
797 length = 1;
798 }
799
800 if (one_byte_subject) {
801 int new_length = 0;
802 for (int i = 0; i < length; i++) {
803 if (letters[i] <= String::kMaxOneByteCharCode) {
804 letters[new_length++] = letters[i];
805 }
806 }
807 length = new_length;
808 }
809
810 DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
811 return length;
812 #endif // V8_INTL_SUPPORT
813 }
814
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,base::uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)815 inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
816 base::uc16 c, Label* on_failure, int cp_offset,
817 bool check, bool preloaded) {
818 RegExpMacroAssembler* assembler = compiler->macro_assembler();
819 bool bound_checked = false;
820 if (!preloaded) {
821 assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
822 bound_checked = true;
823 }
824 assembler->CheckNotCharacter(c, on_failure);
825 return bound_checked;
826 }
827
828 // Only emits non-letters (things that don't have case). Only used for case
829 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,base::uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)830 inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
831 base::uc16 c, Label* on_failure, int cp_offset,
832 bool check, bool preloaded) {
833 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
834 bool one_byte = compiler->one_byte();
835 unibrow::uchar chars[4];
836 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
837 if (length < 1) {
838 // This can't match. Must be an one-byte subject and a non-one-byte
839 // character. We do not need to do anything since the one-byte pass
840 // already handled this.
841 return false; // Bounds not checked.
842 }
843 bool checked = false;
844 // We handle the length > 1 case in a later pass.
845 if (length == 1) {
846 if (one_byte && c > String::kMaxOneByteCharCodeU) {
847 // Can't match - see above.
848 return false; // Bounds not checked.
849 }
850 if (!preloaded) {
851 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
852 checked = check;
853 }
854 macro_assembler->CheckNotCharacter(c, on_failure);
855 }
856 return checked;
857 }
858
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,base::uc16 c1,base::uc16 c2,Label * on_failure)859 bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
860 bool one_byte, base::uc16 c1, base::uc16 c2,
861 Label* on_failure) {
862 const uint32_t char_mask = CharMask(one_byte);
863 base::uc16 exor = c1 ^ c2;
864 // Check whether exor has only one bit set.
865 if (((exor - 1) & exor) == 0) {
866 // If c1 and c2 differ only by one bit.
867 // Ecma262UnCanonicalize always gives the highest number last.
868 DCHECK(c2 > c1);
869 base::uc16 mask = char_mask ^ exor;
870 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
871 return true;
872 }
873 DCHECK(c2 > c1);
874 base::uc16 diff = c2 - c1;
875 if (((diff - 1) & diff) == 0 && c1 >= diff) {
876 // If the characters differ by 2^n but don't differ by one bit then
877 // subtract the difference from the found character, then do the or
878 // trick. We avoid the theoretical case where negative numbers are
879 // involved in order to simplify code generation.
880 base::uc16 mask = char_mask ^ diff;
881 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
882 on_failure);
883 return true;
884 }
885 return false;
886 }
887
888 // Only emits letters (things that have case). Only used for case independent
889 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,base::uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)890 inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
891 base::uc16 c, Label* on_failure, int cp_offset,
892 bool check, bool preloaded) {
893 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
894 bool one_byte = compiler->one_byte();
895 unibrow::uchar chars[4];
896 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
897 if (length <= 1) return false;
898 // We may not need to check against the end of the input string
899 // if this character lies before a character that matched.
900 if (!preloaded) {
901 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
902 }
903 Label ok;
904 switch (length) {
905 case 2: {
906 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
907 chars[1], on_failure)) {
908 } else {
909 macro_assembler->CheckCharacter(chars[0], &ok);
910 macro_assembler->CheckNotCharacter(chars[1], on_failure);
911 macro_assembler->Bind(&ok);
912 }
913 break;
914 }
915 case 4:
916 macro_assembler->CheckCharacter(chars[3], &ok);
917 V8_FALLTHROUGH;
918 case 3:
919 macro_assembler->CheckCharacter(chars[0], &ok);
920 macro_assembler->CheckCharacter(chars[1], &ok);
921 macro_assembler->CheckNotCharacter(chars[2], on_failure);
922 macro_assembler->Bind(&ok);
923 break;
924 default:
925 UNREACHABLE();
926 }
927 return true;
928 }
929
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)930 void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
931 Label* fall_through, Label* above_or_equal,
932 Label* below) {
933 if (below != fall_through) {
934 masm->CheckCharacterLT(border, below);
935 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
936 } else {
937 masm->CheckCharacterGT(border - 1, above_or_equal);
938 }
939 }
940
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)941 void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
942 Label* fall_through, Label* in_range,
943 Label* out_of_range) {
944 if (in_range == fall_through) {
945 if (first == last) {
946 masm->CheckNotCharacter(first, out_of_range);
947 } else {
948 masm->CheckCharacterNotInRange(first, last, out_of_range);
949 }
950 } else {
951 if (first == last) {
952 masm->CheckCharacter(first, in_range);
953 } else {
954 masm->CheckCharacterInRange(first, last, in_range);
955 }
956 if (out_of_range != fall_through) masm->GoTo(out_of_range);
957 }
958 }
959
960 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
961 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,base::uc32 min_char,Label * fall_through,Label * even_label,Label * odd_label)962 void EmitUseLookupTable(RegExpMacroAssembler* masm,
963 ZoneList<base::uc32>* ranges, uint32_t start_index,
964 uint32_t end_index, base::uc32 min_char,
965 Label* fall_through, Label* even_label,
966 Label* odd_label) {
967 static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
968 static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
969
970 base::uc32 base = (min_char & ~kMask);
971 USE(base);
972
973 // Assert that everything is on one kTableSize page.
974 for (uint32_t i = start_index; i <= end_index; i++) {
975 DCHECK_EQ(ranges->at(i) & ~kMask, base);
976 }
977 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
978
979 char templ[kSize];
980 Label* on_bit_set;
981 Label* on_bit_clear;
982 int bit;
983 if (even_label == fall_through) {
984 on_bit_set = odd_label;
985 on_bit_clear = even_label;
986 bit = 1;
987 } else {
988 on_bit_set = even_label;
989 on_bit_clear = odd_label;
990 bit = 0;
991 }
992 for (uint32_t i = 0; i < (ranges->at(start_index) & kMask) && i < kSize;
993 i++) {
994 templ[i] = bit;
995 }
996 uint32_t j = 0;
997 bit ^= 1;
998 for (uint32_t i = start_index; i < end_index; i++) {
999 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1000 templ[j] = bit;
1001 }
1002 bit ^= 1;
1003 }
1004 for (uint32_t i = j; i < kSize; i++) {
1005 templ[i] = bit;
1006 }
1007 Factory* factory = masm->isolate()->factory();
1008 // TODO(erikcorry): Cache these.
1009 Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1010 for (uint32_t i = 0; i < kSize; i++) {
1011 ba->set(i, templ[i]);
1012 }
1013 masm->CheckBitInTable(ba, on_bit_set);
1014 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1015 }
1016
CutOutRange(RegExpMacroAssembler * masm,ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t cut_index,Label * even_label,Label * odd_label)1017 void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
1018 uint32_t start_index, uint32_t end_index, uint32_t cut_index,
1019 Label* even_label, Label* odd_label) {
1020 bool odd = (((cut_index - start_index) & 1) == 1);
1021 Label* in_range_label = odd ? odd_label : even_label;
1022 Label dummy;
1023 EmitDoubleBoundaryTest(masm, ranges->at(cut_index),
1024 ranges->at(cut_index + 1) - 1, &dummy, in_range_label,
1025 &dummy);
1026 DCHECK(!dummy.is_linked());
1027 // Cut out the single range by rewriting the array. This creates a new
1028 // range that is a merger of the two ranges on either side of the one we
1029 // are cutting out. The oddity of the labels is preserved.
1030 for (uint32_t j = cut_index; j > start_index; j--) {
1031 ranges->at(j) = ranges->at(j - 1);
1032 }
1033 for (uint32_t j = cut_index + 1; j < end_index; j++) {
1034 ranges->at(j) = ranges->at(j + 1);
1035 }
1036 }
1037
1038 // Unicode case. Split the search space into kSize spaces that are handled
1039 // with recursion.
SplitSearchSpace(ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,uint32_t * new_start_index,uint32_t * new_end_index,base::uc32 * border)1040 void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
1041 uint32_t end_index, uint32_t* new_start_index,
1042 uint32_t* new_end_index, base::uc32* border) {
1043 static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
1044 static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
1045
1046 base::uc32 first = ranges->at(start_index);
1047 base::uc32 last = ranges->at(end_index) - 1;
1048
1049 *new_start_index = start_index;
1050 *border = (ranges->at(start_index) & ~kMask) + kSize;
1051 while (*new_start_index < end_index) {
1052 if (ranges->at(*new_start_index) > *border) break;
1053 (*new_start_index)++;
1054 }
1055 // new_start_index is the index of the first edge that is beyond the
1056 // current kSize space.
1057
1058 // For very large search spaces we do a binary chop search of the non-Latin1
1059 // space instead of just going to the end of the current kSize space. The
1060 // heuristics are complicated a little by the fact that any 128-character
1061 // encoding space can be quickly tested with a table lookup, so we don't
1062 // wish to do binary chop search at a smaller granularity than that. A
1063 // 128-character space can take up a lot of space in the ranges array if,
1064 // for example, we only want to match every second character (eg. the lower
1065 // case characters on some Unicode pages).
1066 uint32_t binary_chop_index = (end_index + start_index) / 2;
1067 // The first test ensures that we get to the code that handles the Latin1
1068 // range with a single not-taken branch, speeding up this important
1069 // character range (even non-Latin1 charset-based text has spaces and
1070 // punctuation).
1071 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1072 end_index - start_index > (*new_start_index - start_index) * 2 &&
1073 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1074 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1075 uint32_t scan_forward_for_section_border = binary_chop_index;
1076 uint32_t new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1077
1078 while (scan_forward_for_section_border < end_index) {
1079 if (ranges->at(scan_forward_for_section_border) > new_border) {
1080 *new_start_index = scan_forward_for_section_border;
1081 *border = new_border;
1082 break;
1083 }
1084 scan_forward_for_section_border++;
1085 }
1086 }
1087
1088 DCHECK(*new_start_index > start_index);
1089 *new_end_index = *new_start_index - 1;
1090 if (ranges->at(*new_end_index) == *border) {
1091 (*new_end_index)--;
1092 }
1093 if (*border >= ranges->at(end_index)) {
1094 *border = ranges->at(end_index);
1095 *new_start_index = end_index; // Won't be used.
1096 *new_end_index = end_index - 1;
1097 }
1098 }
1099
1100 // Gets a series of segment boundaries representing a character class. If the
1101 // character is in the range between an even and an odd boundary (counting from
1102 // start_index) then go to even_label, otherwise go to odd_label. We already
1103 // know that the character is in the range of min_char to max_char inclusive.
1104 // Either label can be nullptr indicating backtracking. Either label can also
1105 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<base::uc32> * ranges,uint32_t start_index,uint32_t end_index,base::uc32 min_char,base::uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1106 void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
1107 uint32_t start_index, uint32_t end_index,
1108 base::uc32 min_char, base::uc32 max_char,
1109 Label* fall_through, Label* even_label,
1110 Label* odd_label) {
1111 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1112 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1113
1114 base::uc32 first = ranges->at(start_index);
1115 base::uc32 last = ranges->at(end_index) - 1;
1116
1117 DCHECK_LT(min_char, first);
1118
1119 // Just need to test if the character is before or on-or-after
1120 // a particular character.
1121 if (start_index == end_index) {
1122 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1123 return;
1124 }
1125
1126 // Another almost trivial case: There is one interval in the middle that is
1127 // different from the end intervals.
1128 if (start_index + 1 == end_index) {
1129 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1130 odd_label);
1131 return;
1132 }
1133
1134 // It's not worth using table lookup if there are very few intervals in the
1135 // character class.
1136 if (end_index - start_index <= 6) {
1137 // It is faster to test for individual characters, so we look for those
1138 // first, then try arbitrary ranges in the second round.
1139 static uint32_t kNoCutIndex = -1;
1140 uint32_t cut = kNoCutIndex;
1141 for (uint32_t i = start_index; i < end_index; i++) {
1142 if (ranges->at(i) == ranges->at(i + 1) - 1) {
1143 cut = i;
1144 break;
1145 }
1146 }
1147 if (cut == kNoCutIndex) cut = start_index;
1148 CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1149 odd_label);
1150 DCHECK_GE(end_index - start_index, 2);
1151 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1152 max_char, fall_through, even_label, odd_label);
1153 return;
1154 }
1155
1156 // If there are a lot of intervals in the regexp, then we will use tables to
1157 // determine whether the character is inside or outside the character class.
1158 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1159
1160 if ((max_char >> kBits) == (min_char >> kBits)) {
1161 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1162 fall_through, even_label, odd_label);
1163 return;
1164 }
1165
1166 if ((min_char >> kBits) != first >> kBits) {
1167 masm->CheckCharacterLT(first, odd_label);
1168 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1169 fall_through, odd_label, even_label);
1170 return;
1171 }
1172
1173 uint32_t new_start_index = 0;
1174 uint32_t new_end_index = 0;
1175 base::uc32 border = 0;
1176
1177 SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1178 &new_end_index, &border);
1179
1180 Label handle_rest;
1181 Label* above = &handle_rest;
1182 if (border == last + 1) {
1183 // We didn't find any section that started after the limit, so everything
1184 // above the border is one of the terminal labels.
1185 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1186 DCHECK(new_end_index == end_index - 1);
1187 }
1188
1189 DCHECK_LE(start_index, new_end_index);
1190 DCHECK_LE(new_start_index, end_index);
1191 DCHECK_LT(start_index, new_start_index);
1192 DCHECK_LT(new_end_index, end_index);
1193 DCHECK(new_end_index + 1 == new_start_index ||
1194 (new_end_index + 2 == new_start_index &&
1195 border == ranges->at(new_end_index + 1)));
1196 DCHECK_LT(min_char, border - 1);
1197 DCHECK_LT(border, max_char);
1198 DCHECK_LT(ranges->at(new_end_index), border);
1199 DCHECK(border < ranges->at(new_start_index) ||
1200 (border == ranges->at(new_start_index) &&
1201 new_start_index == end_index && new_end_index == end_index - 1 &&
1202 border == last + 1));
1203 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
1204
1205 masm->CheckCharacterGT(border - 1, above);
1206 Label dummy;
1207 GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1208 border - 1, &dummy, even_label, odd_label);
1209 if (handle_rest.is_linked()) {
1210 masm->Bind(&handle_rest);
1211 bool flip = (new_start_index & 1) != (start_index & 1);
1212 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1213 &dummy, flip ? odd_label : even_label,
1214 flip ? even_label : odd_label);
1215 }
1216 }
1217
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)1218 void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1219 RegExpCharacterClass* cc, bool one_byte, Label* on_failure,
1220 int cp_offset, bool check_offset, bool preloaded,
1221 Zone* zone) {
1222 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
1223 CharacterRange::Canonicalize(ranges);
1224
1225 // Now that all processing (like case-insensitivity) is done, clamp the
1226 // ranges to the set of ranges that may actually occur in the subject string.
1227 if (one_byte) CharacterRange::ClampToOneByte(ranges);
1228
1229 const int ranges_length = ranges->length();
1230 if (ranges_length == 0) {
1231 if (!cc->is_negated()) {
1232 macro_assembler->GoTo(on_failure);
1233 }
1234 if (check_offset) {
1235 macro_assembler->CheckPosition(cp_offset, on_failure);
1236 }
1237 return;
1238 }
1239
1240 const base::uc32 max_char = MaxCodeUnit(one_byte);
1241 if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
1242 if (cc->is_negated()) {
1243 macro_assembler->GoTo(on_failure);
1244 } else {
1245 // This is a common case hit by non-anchored expressions.
1246 if (check_offset) {
1247 macro_assembler->CheckPosition(cp_offset, on_failure);
1248 }
1249 }
1250 return;
1251 }
1252
1253 if (!preloaded) {
1254 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1255 }
1256
1257 if (cc->is_standard(zone) && macro_assembler->CheckSpecialCharacterClass(
1258 cc->standard_type(), on_failure)) {
1259 return;
1260 }
1261
1262 static constexpr int kMaxRangesForInlineBranchGeneration = 16;
1263 if (ranges_length > kMaxRangesForInlineBranchGeneration) {
1264 // For large range sets, emit a more compact instruction sequence to avoid
1265 // a potentially problematic increase in code size.
1266 // Note the flipped logic below (we check InRange if negated, NotInRange if
1267 // not negated); this is necessary since the method falls through on
1268 // failure whereas we want to fall through on success.
1269 if (cc->is_negated()) {
1270 if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
1271 return;
1272 }
1273 } else {
1274 if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
1275 return;
1276 }
1277 }
1278 }
1279
1280 // Generate a flat list of range boundaries for consumption by
1281 // GenerateBranches. See the comment on that function for how the list should
1282 // be structured
1283 ZoneList<base::uc32>* range_boundaries =
1284 zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
1285
1286 bool zeroth_entry_is_failure = !cc->is_negated();
1287
1288 for (int i = 0; i < ranges_length; i++) {
1289 CharacterRange& range = ranges->at(i);
1290 if (range.from() == 0) {
1291 DCHECK_EQ(i, 0);
1292 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1293 } else {
1294 range_boundaries->Add(range.from(), zone);
1295 }
1296 // `+ 1` to convert from inclusive to exclusive `to`.
1297 // [from, to] == [from, to+1[.
1298 range_boundaries->Add(range.to() + 1, zone);
1299 }
1300 int end_index = range_boundaries->length() - 1;
1301 if (range_boundaries->at(end_index) > max_char) {
1302 end_index--;
1303 }
1304
1305 Label fall_through;
1306 GenerateBranches(macro_assembler, range_boundaries,
1307 0, // start_index.
1308 end_index,
1309 0, // min_char.
1310 max_char, &fall_through,
1311 zeroth_entry_is_failure ? &fall_through : on_failure,
1312 zeroth_entry_is_failure ? on_failure : &fall_through);
1313 macro_assembler->Bind(&fall_through);
1314 }
1315
1316 } // namespace
1317
1318 RegExpNode::~RegExpNode() = default;
1319
LimitVersions(RegExpCompiler * compiler,Trace * trace)1320 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1321 Trace* trace) {
1322 // If we are generating a greedy loop then don't stop and don't reuse code.
1323 if (trace->stop_node() != nullptr) {
1324 return CONTINUE;
1325 }
1326
1327 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1328 if (trace->is_trivial()) {
1329 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
1330 // If a generic version is already scheduled to be generated or we have
1331 // recursed too deeply then just generate a jump to that code.
1332 macro_assembler->GoTo(&label_);
1333 // This will queue it up for generation of a generic version if it hasn't
1334 // already been queued.
1335 compiler->AddWork(this);
1336 return DONE;
1337 }
1338 // Generate generic version of the node and bind the label for later use.
1339 macro_assembler->Bind(&label_);
1340 return CONTINUE;
1341 }
1342
1343 // We are being asked to make a non-generic version. Keep track of how many
1344 // non-generic versions we generate so as not to overdo it.
1345 trace_count_++;
1346 if (KeepRecursing(compiler) && compiler->optimize() &&
1347 trace_count_ < kMaxCopiesCodeGenerated) {
1348 return CONTINUE;
1349 }
1350
1351 // If we get here code has been generated for this node too many times or
1352 // recursion is too deep. Time to switch to a generic version. The code for
1353 // generic versions above can handle deep recursion properly.
1354 bool was_limiting = compiler->limiting_recursion();
1355 compiler->set_limiting_recursion(true);
1356 trace->Flush(compiler, this);
1357 compiler->set_limiting_recursion(was_limiting);
1358 return DONE;
1359 }
1360
KeepRecursing(RegExpCompiler * compiler)1361 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
1362 return !compiler->limiting_recursion() &&
1363 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
1364 }
1365
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1366 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1367 BoyerMooreLookahead* bm, bool not_at_start) {
1368 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) {
1369 // Anything may follow a positive submatch success, thus we need to accept
1370 // all characters from this position onwards.
1371 bm->SetRest(offset);
1372 } else {
1373 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1374 }
1375 SaveBMInfo(bm, not_at_start, offset);
1376 }
1377
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1378 void ActionNode::GetQuickCheckDetails(QuickCheckDetails* details,
1379 RegExpCompiler* compiler, int filled_in,
1380 bool not_at_start) {
1381 if (action_type_ == SET_REGISTER_FOR_LOOP) {
1382 on_success()->GetQuickCheckDetailsFromLoopEntry(details, compiler,
1383 filled_in, not_at_start);
1384 } else {
1385 on_success()->GetQuickCheckDetails(details, compiler, filled_in,
1386 not_at_start);
1387 }
1388 }
1389
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1390 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
1391 BoyerMooreLookahead* bm, bool not_at_start) {
1392 // Match the behaviour of EatsAtLeast on this node.
1393 if (assertion_type() == AT_START && not_at_start) return;
1394 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
1395 SaveBMInfo(bm, not_at_start, offset);
1396 }
1397
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1398 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
1399 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
1400 bool not_at_start) {
1401 RegExpNode* node = continue_node();
1402 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1403 }
1404
1405 namespace {
1406
1407 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1408 inline uint32_t SmearBitsRight(uint32_t v) {
1409 v |= v >> 1;
1410 v |= v >> 2;
1411 v |= v >> 4;
1412 v |= v >> 8;
1413 v |= v >> 16;
1414 return v;
1415 }
1416
1417 } // namespace
1418
Rationalize(bool asc)1419 bool QuickCheckDetails::Rationalize(bool asc) {
1420 bool found_useful_op = false;
1421 const uint32_t char_mask = CharMask(asc);
1422 mask_ = 0;
1423 value_ = 0;
1424 int char_shift = 0;
1425 for (int i = 0; i < characters_; i++) {
1426 Position* pos = &positions_[i];
1427 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
1428 found_useful_op = true;
1429 }
1430 mask_ |= (pos->mask & char_mask) << char_shift;
1431 value_ |= (pos->value & char_mask) << char_shift;
1432 char_shift += asc ? 8 : 16;
1433 }
1434 return found_useful_op;
1435 }
1436
EatsAtLeast(bool not_at_start)1437 int RegExpNode::EatsAtLeast(bool not_at_start) {
1438 return not_at_start ? eats_at_least_.eats_at_least_from_not_start
1439 : eats_at_least_.eats_at_least_from_possibly_start;
1440 }
1441
EatsAtLeastFromLoopEntry()1442 EatsAtLeastInfo RegExpNode::EatsAtLeastFromLoopEntry() {
1443 // SET_REGISTER_FOR_LOOP is only used to initialize loop counters, and it
1444 // implies that the following node must be a LoopChoiceNode. If we need to
1445 // set registers to constant values for other reasons, we could introduce a
1446 // new action type SET_REGISTER that doesn't imply anything about its
1447 // successor.
1448 UNREACHABLE();
1449 }
1450
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1451 void RegExpNode::GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
1452 RegExpCompiler* compiler,
1453 int characters_filled_in,
1454 bool not_at_start) {
1455 // See comment in RegExpNode::EatsAtLeastFromLoopEntry.
1456 UNREACHABLE();
1457 }
1458
EatsAtLeastFromLoopEntry()1459 EatsAtLeastInfo LoopChoiceNode::EatsAtLeastFromLoopEntry() {
1460 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
1461
1462 if (read_backward()) {
1463 // The eats_at_least value is not used if reading backward. The
1464 // EatsAtLeastPropagator should've zeroed it as well.
1465 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_possibly_start, 0);
1466 DCHECK_EQ(eats_at_least_info()->eats_at_least_from_not_start, 0);
1467 return {};
1468 }
1469
1470 // Figure out how much the loop body itself eats, not including anything in
1471 // the continuation case. In general, the nodes in the loop body should report
1472 // that they eat at least the number eaten by the continuation node, since any
1473 // successful match in the loop body must also include the continuation node.
1474 // However, in some cases involving positive lookaround, the loop body under-
1475 // reports its appetite, so use saturated math here to avoid negative numbers.
1476 uint8_t loop_body_from_not_start = base::saturated_cast<uint8_t>(
1477 loop_node_->EatsAtLeast(true) - continue_node_->EatsAtLeast(true));
1478 uint8_t loop_body_from_possibly_start = base::saturated_cast<uint8_t>(
1479 loop_node_->EatsAtLeast(false) - continue_node_->EatsAtLeast(true));
1480
1481 // Limit the number of loop iterations to avoid overflow in subsequent steps.
1482 int loop_iterations = base::saturated_cast<uint8_t>(min_loop_iterations());
1483
1484 EatsAtLeastInfo result;
1485 result.eats_at_least_from_not_start =
1486 base::saturated_cast<uint8_t>(loop_iterations * loop_body_from_not_start +
1487 continue_node_->EatsAtLeast(true));
1488 if (loop_iterations > 0 && loop_body_from_possibly_start > 0) {
1489 // First loop iteration eats at least one, so all subsequent iterations
1490 // and the after-loop chunk are guaranteed to not be at the start.
1491 result.eats_at_least_from_possibly_start = base::saturated_cast<uint8_t>(
1492 loop_body_from_possibly_start +
1493 (loop_iterations - 1) * loop_body_from_not_start +
1494 continue_node_->EatsAtLeast(true));
1495 } else {
1496 // Loop body might eat nothing, so only continue node contributes.
1497 result.eats_at_least_from_possibly_start =
1498 continue_node_->EatsAtLeast(false);
1499 }
1500 return result;
1501 }
1502
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure,ChoiceNode * predecessor)1503 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1504 Trace* bounds_check_trace, Trace* trace,
1505 bool preload_has_checked_bounds,
1506 Label* on_possible_success,
1507 QuickCheckDetails* details,
1508 bool fall_through_on_failure,
1509 ChoiceNode* predecessor) {
1510 DCHECK_NOT_NULL(predecessor);
1511 if (details->characters() == 0) return false;
1512 GetQuickCheckDetails(details, compiler, 0,
1513 trace->at_start() == Trace::FALSE_VALUE);
1514 if (details->cannot_match()) return false;
1515 if (!details->Rationalize(compiler->one_byte())) return false;
1516 DCHECK(details->characters() == 1 ||
1517 compiler->macro_assembler()->CanReadUnaligned());
1518 uint32_t mask = details->mask();
1519 uint32_t value = details->value();
1520
1521 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1522
1523 if (trace->characters_preloaded() != details->characters()) {
1524 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
1525 // The bounds check is performed using the minimum number of characters
1526 // any choice would eat, so if the bounds check fails, then none of the
1527 // choices can succeed, so we can just immediately backtrack, rather
1528 // than go to the next choice. The number of characters preloaded may be
1529 // less than the number used for the bounds check.
1530 int eats_at_least = predecessor->EatsAtLeast(
1531 bounds_check_trace->at_start() == Trace::FALSE_VALUE);
1532 DCHECK_GE(eats_at_least, details->characters());
1533 assembler->LoadCurrentCharacter(
1534 trace->cp_offset(), bounds_check_trace->backtrack(),
1535 !preload_has_checked_bounds, details->characters(), eats_at_least);
1536 }
1537
1538 bool need_mask = true;
1539
1540 if (details->characters() == 1) {
1541 // If number of characters preloaded is 1 then we used a byte or 16 bit
1542 // load so the value is already masked down.
1543 const uint32_t char_mask = CharMask(compiler->one_byte());
1544 if ((mask & char_mask) == char_mask) need_mask = false;
1545 mask &= char_mask;
1546 } else {
1547 // For 2-character preloads in one-byte mode or 1-character preloads in
1548 // two-byte mode we also use a 16 bit load with zero extend.
1549 static const uint32_t kTwoByteMask = 0xFFFF;
1550 static const uint32_t kFourByteMask = 0xFFFFFFFF;
1551 if (details->characters() == 2 && compiler->one_byte()) {
1552 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1553 } else if (details->characters() == 1 && !compiler->one_byte()) {
1554 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
1555 } else {
1556 if (mask == kFourByteMask) need_mask = false;
1557 }
1558 }
1559
1560 if (fall_through_on_failure) {
1561 if (need_mask) {
1562 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1563 } else {
1564 assembler->CheckCharacter(value, on_possible_success);
1565 }
1566 } else {
1567 if (need_mask) {
1568 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1569 } else {
1570 assembler->CheckNotCharacter(value, trace->backtrack());
1571 }
1572 }
1573 return true;
1574 }
1575
1576 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1577 // super-class in the .h file).
1578 //
1579 // We iterate along the text object, building up for each character a
1580 // mask and value that can be used to test for a quick failure to match.
1581 // The masks and values for the positions will be combined into a single
1582 // machine word for the current character width in order to be used in
1583 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1584 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1585 RegExpCompiler* compiler,
1586 int characters_filled_in,
1587 bool not_at_start) {
1588 // Do not collect any quick check details if the text node reads backward,
1589 // since it reads in the opposite direction than we use for quick checks.
1590 if (read_backward()) return;
1591 Isolate* isolate = compiler->macro_assembler()->isolate();
1592 DCHECK(characters_filled_in < details->characters());
1593 int characters = details->characters();
1594 const uint32_t char_mask = CharMask(compiler->one_byte());
1595 for (int k = 0; k < elements()->length(); k++) {
1596 TextElement elm = elements()->at(k);
1597 if (elm.text_type() == TextElement::ATOM) {
1598 base::Vector<const base::uc16> quarks = elm.atom()->data();
1599 for (int i = 0; i < characters && i < quarks.length(); i++) {
1600 QuickCheckDetails::Position* pos =
1601 details->positions(characters_filled_in);
1602 base::uc16 c = quarks[i];
1603 if (IsIgnoreCase(compiler->flags())) {
1604 unibrow::uchar chars[4];
1605 int length = GetCaseIndependentLetters(
1606 isolate, c, compiler->one_byte(), chars, 4);
1607 if (length == 0) {
1608 // This can happen because all case variants are non-Latin1, but we
1609 // know the input is Latin1.
1610 details->set_cannot_match();
1611 pos->determines_perfectly = false;
1612 return;
1613 }
1614 if (length == 1) {
1615 // This letter has no case equivalents, so it's nice and simple
1616 // and the mask-compare will determine definitely whether we have
1617 // a match at this character position.
1618 pos->mask = char_mask;
1619 pos->value = chars[0];
1620 pos->determines_perfectly = true;
1621 } else {
1622 uint32_t common_bits = char_mask;
1623 uint32_t bits = chars[0];
1624 for (int j = 1; j < length; j++) {
1625 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1626 common_bits ^= differing_bits;
1627 bits &= common_bits;
1628 }
1629 // If length is 2 and common bits has only one zero in it then
1630 // our mask and compare instruction will determine definitely
1631 // whether we have a match at this character position. Otherwise
1632 // it can only be an approximate check.
1633 uint32_t one_zero = (common_bits | ~char_mask);
1634 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1635 pos->determines_perfectly = true;
1636 }
1637 pos->mask = common_bits;
1638 pos->value = bits;
1639 }
1640 } else {
1641 // Don't ignore case. Nice simple case where the mask-compare will
1642 // determine definitely whether we have a match at this character
1643 // position.
1644 if (c > char_mask) {
1645 details->set_cannot_match();
1646 pos->determines_perfectly = false;
1647 return;
1648 }
1649 pos->mask = char_mask;
1650 pos->value = c;
1651 pos->determines_perfectly = true;
1652 }
1653 characters_filled_in++;
1654 DCHECK(characters_filled_in <= details->characters());
1655 if (characters_filled_in == details->characters()) {
1656 return;
1657 }
1658 }
1659 } else {
1660 QuickCheckDetails::Position* pos =
1661 details->positions(characters_filled_in);
1662 RegExpCharacterClass* tree = elm.char_class();
1663 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
1664 if (tree->is_negated() || ranges->is_empty()) {
1665 // A quick check uses multi-character mask and compare. There is no
1666 // useful way to incorporate a negative char class into this scheme
1667 // so we just conservatively create a mask and value that will always
1668 // succeed.
1669 // Likewise for empty ranges (empty ranges can occur e.g. when
1670 // compiling for one-byte subjects and impossible (non-one-byte) ranges
1671 // have been removed).
1672 pos->mask = 0;
1673 pos->value = 0;
1674 } else {
1675 int first_range = 0;
1676 while (ranges->at(first_range).from() > char_mask) {
1677 first_range++;
1678 if (first_range == ranges->length()) {
1679 details->set_cannot_match();
1680 pos->determines_perfectly = false;
1681 return;
1682 }
1683 }
1684 CharacterRange range = ranges->at(first_range);
1685 const base::uc32 first_from = range.from();
1686 const base::uc32 first_to =
1687 (range.to() > char_mask) ? char_mask : range.to();
1688 const uint32_t differing_bits = (first_from ^ first_to);
1689 // A mask and compare is only perfect if the differing bits form a
1690 // number like 00011111 with one single block of trailing 1s.
1691 if ((differing_bits & (differing_bits + 1)) == 0 &&
1692 first_from + differing_bits == first_to) {
1693 pos->determines_perfectly = true;
1694 }
1695 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1696 uint32_t bits = (first_from & common_bits);
1697 for (int i = first_range + 1; i < ranges->length(); i++) {
1698 range = ranges->at(i);
1699 const base::uc32 from = range.from();
1700 if (from > char_mask) continue;
1701 const base::uc32 to =
1702 (range.to() > char_mask) ? char_mask : range.to();
1703 // Here we are combining more ranges into the mask and compare
1704 // value. With each new range the mask becomes more sparse and
1705 // so the chances of a false positive rise. A character class
1706 // with multiple ranges is assumed never to be equivalent to a
1707 // mask and compare operation.
1708 pos->determines_perfectly = false;
1709 uint32_t new_common_bits = (from ^ to);
1710 new_common_bits = ~SmearBitsRight(new_common_bits);
1711 common_bits &= new_common_bits;
1712 bits &= new_common_bits;
1713 uint32_t new_differing_bits = (from & common_bits) ^ bits;
1714 common_bits ^= new_differing_bits;
1715 bits &= common_bits;
1716 }
1717 pos->mask = common_bits;
1718 pos->value = bits;
1719 }
1720 characters_filled_in++;
1721 DCHECK(characters_filled_in <= details->characters());
1722 if (characters_filled_in == details->characters()) return;
1723 }
1724 }
1725 DCHECK(characters_filled_in != details->characters());
1726 if (!details->cannot_match()) {
1727 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1728 true);
1729 }
1730 }
1731
Clear()1732 void QuickCheckDetails::Clear() {
1733 for (int i = 0; i < characters_; i++) {
1734 positions_[i].mask = 0;
1735 positions_[i].value = 0;
1736 positions_[i].determines_perfectly = false;
1737 }
1738 characters_ = 0;
1739 }
1740
Advance(int by,bool one_byte)1741 void QuickCheckDetails::Advance(int by, bool one_byte) {
1742 if (by >= characters_ || by < 0) {
1743 DCHECK_IMPLIES(by < 0, characters_ == 0);
1744 Clear();
1745 return;
1746 }
1747 DCHECK_LE(characters_ - by, 4);
1748 DCHECK_LE(characters_, 4);
1749 for (int i = 0; i < characters_ - by; i++) {
1750 positions_[i] = positions_[by + i];
1751 }
1752 for (int i = characters_ - by; i < characters_; i++) {
1753 positions_[i].mask = 0;
1754 positions_[i].value = 0;
1755 positions_[i].determines_perfectly = false;
1756 }
1757 characters_ -= by;
1758 // We could change mask_ and value_ here but we would never advance unless
1759 // they had already been used in a check and they won't be used again because
1760 // it would gain us nothing. So there's no point.
1761 }
1762
Merge(QuickCheckDetails * other,int from_index)1763 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1764 DCHECK(characters_ == other->characters_);
1765 if (other->cannot_match_) {
1766 return;
1767 }
1768 if (cannot_match_) {
1769 *this = *other;
1770 return;
1771 }
1772 for (int i = from_index; i < characters_; i++) {
1773 QuickCheckDetails::Position* pos = positions(i);
1774 QuickCheckDetails::Position* other_pos = other->positions(i);
1775 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1776 !other_pos->determines_perfectly) {
1777 // Our mask-compare operation will be approximate unless we have the
1778 // exact same operation on both sides of the alternation.
1779 pos->determines_perfectly = false;
1780 }
1781 pos->mask &= other_pos->mask;
1782 pos->value &= pos->mask;
1783 other_pos->value &= pos->mask;
1784 uint32_t differing_bits = (pos->value ^ other_pos->value);
1785 pos->mask &= ~differing_bits;
1786 pos->value &= pos->mask;
1787 }
1788 }
1789
1790 class VisitMarker {
1791 public:
VisitMarker(NodeInfo * info)1792 explicit VisitMarker(NodeInfo* info) : info_(info) {
1793 DCHECK(!info->visited);
1794 info->visited = true;
1795 }
~VisitMarker()1796 ~VisitMarker() { info_->visited = false; }
1797
1798 private:
1799 NodeInfo* info_;
1800 };
1801
1802 // Temporarily sets traversed_loop_initialization_node_.
1803 class LoopInitializationMarker {
1804 public:
LoopInitializationMarker(LoopChoiceNode * node)1805 explicit LoopInitializationMarker(LoopChoiceNode* node) : node_(node) {
1806 DCHECK(!node_->traversed_loop_initialization_node_);
1807 node_->traversed_loop_initialization_node_ = true;
1808 }
~LoopInitializationMarker()1809 ~LoopInitializationMarker() {
1810 DCHECK(node_->traversed_loop_initialization_node_);
1811 node_->traversed_loop_initialization_node_ = false;
1812 }
1813 LoopInitializationMarker(const LoopInitializationMarker&) = delete;
1814 LoopInitializationMarker& operator=(const LoopInitializationMarker&) = delete;
1815
1816 private:
1817 LoopChoiceNode* node_;
1818 };
1819
1820 // Temporarily decrements min_loop_iterations_.
1821 class IterationDecrementer {
1822 public:
IterationDecrementer(LoopChoiceNode * node)1823 explicit IterationDecrementer(LoopChoiceNode* node) : node_(node) {
1824 DCHECK_GT(node_->min_loop_iterations_, 0);
1825 --node_->min_loop_iterations_;
1826 }
~IterationDecrementer()1827 ~IterationDecrementer() { ++node_->min_loop_iterations_; }
1828 IterationDecrementer(const IterationDecrementer&) = delete;
1829 IterationDecrementer& operator=(const IterationDecrementer&) = delete;
1830
1831 private:
1832 LoopChoiceNode* node_;
1833 };
1834
FilterOneByte(int depth,RegExpFlags flags)1835 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
1836 if (info()->replacement_calculated) return replacement();
1837 if (depth < 0) return this;
1838 DCHECK(!info()->visited);
1839 VisitMarker marker(info());
1840 return FilterSuccessor(depth - 1, flags);
1841 }
1842
FilterSuccessor(int depth,RegExpFlags flags)1843 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
1844 RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags);
1845 if (next == nullptr) return set_replacement(nullptr);
1846 on_success_ = next;
1847 return set_replacement(this);
1848 }
1849
1850 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)1851 bool RangeContainsLatin1Equivalents(CharacterRange range) {
1852 // TODO(dcarney): this could be a lot more efficient.
1853 return range.Contains(0x039C) || range.Contains(0x03BC) ||
1854 range.Contains(0x0178);
1855 }
1856
1857 namespace {
1858
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)1859 bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
1860 for (int i = 0; i < ranges->length(); i++) {
1861 // TODO(dcarney): this could be a lot more efficient.
1862 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
1863 }
1864 return false;
1865 }
1866
1867 } // namespace
1868
FilterOneByte(int depth,RegExpFlags flags)1869 RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
1870 if (info()->replacement_calculated) return replacement();
1871 if (depth < 0) return this;
1872 DCHECK(!info()->visited);
1873 VisitMarker marker(info());
1874 int element_count = elements()->length();
1875 for (int i = 0; i < element_count; i++) {
1876 TextElement elm = elements()->at(i);
1877 if (elm.text_type() == TextElement::ATOM) {
1878 base::Vector<const base::uc16> quarks = elm.atom()->data();
1879 for (int j = 0; j < quarks.length(); j++) {
1880 base::uc16 c = quarks[j];
1881 if (IsIgnoreCase(flags)) {
1882 c = unibrow::Latin1::TryConvertToLatin1(c);
1883 }
1884 if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
1885 // Replace quark in case we converted to Latin-1.
1886 base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
1887 writable_quarks[j] = c;
1888 }
1889 } else {
1890 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
1891 RegExpCharacterClass* cc = elm.char_class();
1892 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1893 CharacterRange::Canonicalize(ranges);
1894 // Now they are in order so we only need to look at the first.
1895 int range_count = ranges->length();
1896 if (cc->is_negated()) {
1897 if (range_count != 0 && ranges->at(0).from() == 0 &&
1898 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
1899 // This will be handled in a later filter.
1900 if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
1901 continue;
1902 }
1903 return set_replacement(nullptr);
1904 }
1905 } else {
1906 if (range_count == 0 ||
1907 ranges->at(0).from() > String::kMaxOneByteCharCode) {
1908 // This will be handled in a later filter.
1909 if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
1910 continue;
1911 }
1912 return set_replacement(nullptr);
1913 }
1914 }
1915 }
1916 }
1917 return FilterSuccessor(depth - 1, flags);
1918 }
1919
FilterOneByte(int depth,RegExpFlags flags)1920 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
1921 if (info()->replacement_calculated) return replacement();
1922 if (depth < 0) return this;
1923 if (info()->visited) return this;
1924 {
1925 VisitMarker marker(info());
1926
1927 RegExpNode* continue_replacement =
1928 continue_node_->FilterOneByte(depth - 1, flags);
1929 // If we can't continue after the loop then there is no sense in doing the
1930 // loop.
1931 if (continue_replacement == nullptr) return set_replacement(nullptr);
1932 }
1933
1934 return ChoiceNode::FilterOneByte(depth - 1, flags);
1935 }
1936
FilterOneByte(int depth,RegExpFlags flags)1937 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
1938 if (info()->replacement_calculated) return replacement();
1939 if (depth < 0) return this;
1940 if (info()->visited) return this;
1941 VisitMarker marker(info());
1942 int choice_count = alternatives_->length();
1943
1944 for (int i = 0; i < choice_count; i++) {
1945 GuardedAlternative alternative = alternatives_->at(i);
1946 if (alternative.guards() != nullptr &&
1947 alternative.guards()->length() != 0) {
1948 set_replacement(this);
1949 return this;
1950 }
1951 }
1952
1953 int surviving = 0;
1954 RegExpNode* survivor = nullptr;
1955 for (int i = 0; i < choice_count; i++) {
1956 GuardedAlternative alternative = alternatives_->at(i);
1957 RegExpNode* replacement =
1958 alternative.node()->FilterOneByte(depth - 1, flags);
1959 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
1960 if (replacement != nullptr) {
1961 alternatives_->at(i).set_node(replacement);
1962 surviving++;
1963 survivor = replacement;
1964 }
1965 }
1966 if (surviving < 2) return set_replacement(survivor);
1967
1968 set_replacement(this);
1969 if (surviving == choice_count) {
1970 return this;
1971 }
1972 // Only some of the nodes survived the filtering. We need to rebuild the
1973 // alternatives list.
1974 ZoneList<GuardedAlternative>* new_alternatives =
1975 zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
1976 for (int i = 0; i < choice_count; i++) {
1977 RegExpNode* replacement =
1978 alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
1979 if (replacement != nullptr) {
1980 alternatives_->at(i).set_node(replacement);
1981 new_alternatives->Add(alternatives_->at(i), zone());
1982 }
1983 }
1984 alternatives_ = new_alternatives;
1985 return this;
1986 }
1987
FilterOneByte(int depth,RegExpFlags flags)1988 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
1989 RegExpFlags flags) {
1990 if (info()->replacement_calculated) return replacement();
1991 if (depth < 0) return this;
1992 if (info()->visited) return this;
1993 VisitMarker marker(info());
1994 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1995 // afterwards.
1996 RegExpNode* node = continue_node();
1997 RegExpNode* replacement = node->FilterOneByte(depth - 1, flags);
1998 if (replacement == nullptr) return set_replacement(nullptr);
1999 alternatives_->at(kContinueIndex).set_node(replacement);
2000
2001 RegExpNode* neg_node = lookaround_node();
2002 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags);
2003 // If the negative lookahead is always going to fail then
2004 // we don't need to check it.
2005 if (neg_replacement == nullptr) return set_replacement(replacement);
2006 alternatives_->at(kLookaroundIndex).set_node(neg_replacement);
2007 return set_replacement(this);
2008 }
2009
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2010 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2011 RegExpCompiler* compiler,
2012 int characters_filled_in,
2013 bool not_at_start) {
2014 if (body_can_be_zero_length_ || info()->visited) return;
2015 not_at_start = not_at_start || this->not_at_start();
2016 DCHECK_EQ(alternatives_->length(), 2); // There's just loop and continue.
2017 if (traversed_loop_initialization_node_ && min_loop_iterations_ > 0 &&
2018 loop_node_->EatsAtLeast(not_at_start) >
2019 continue_node_->EatsAtLeast(true)) {
2020 // Loop body is guaranteed to execute at least once, and consume characters
2021 // when it does, meaning the only possible quick checks from this point
2022 // begin with the loop body. We may recursively visit this LoopChoiceNode,
2023 // but we temporarily decrease its minimum iteration counter so we know when
2024 // to check the continue case.
2025 IterationDecrementer next_iteration(this);
2026 loop_node_->GetQuickCheckDetails(details, compiler, characters_filled_in,
2027 not_at_start);
2028 } else {
2029 // Might not consume anything in the loop body, so treat it like a normal
2030 // ChoiceNode (and don't recursively visit this node again).
2031 VisitMarker marker(info());
2032 ChoiceNode::GetQuickCheckDetails(details, compiler, characters_filled_in,
2033 not_at_start);
2034 }
2035 }
2036
GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2037 void LoopChoiceNode::GetQuickCheckDetailsFromLoopEntry(
2038 QuickCheckDetails* details, RegExpCompiler* compiler,
2039 int characters_filled_in, bool not_at_start) {
2040 if (traversed_loop_initialization_node_) {
2041 // We already entered this loop once, exited via its continuation node, and
2042 // followed an outer loop's back-edge to before the loop entry point. We
2043 // could try to reset the minimum iteration count to its starting value at
2044 // this point, but that seems like more trouble than it's worth. It's safe
2045 // to keep going with the current (possibly reduced) minimum iteration
2046 // count.
2047 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2048 } else {
2049 // We are entering a loop via its counter initialization action, meaning we
2050 // are guaranteed to run the loop body at least some minimum number of times
2051 // before running the continuation node. Set a flag so that this node knows
2052 // (now and any times we visit it again recursively) that it was entered
2053 // from the top.
2054 LoopInitializationMarker marker(this);
2055 GetQuickCheckDetails(details, compiler, characters_filled_in, not_at_start);
2056 }
2057 }
2058
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2059 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2060 BoyerMooreLookahead* bm, bool not_at_start) {
2061 if (body_can_be_zero_length_ || budget <= 0) {
2062 bm->SetRest(offset);
2063 SaveBMInfo(bm, not_at_start, offset);
2064 return;
2065 }
2066 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2067 SaveBMInfo(bm, not_at_start, offset);
2068 }
2069
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2070 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2071 RegExpCompiler* compiler,
2072 int characters_filled_in,
2073 bool not_at_start) {
2074 not_at_start = (not_at_start || not_at_start_);
2075 int choice_count = alternatives_->length();
2076 DCHECK_LT(0, choice_count);
2077 alternatives_->at(0).node()->GetQuickCheckDetails(
2078 details, compiler, characters_filled_in, not_at_start);
2079 for (int i = 1; i < choice_count; i++) {
2080 QuickCheckDetails new_details(details->characters());
2081 RegExpNode* node = alternatives_->at(i).node();
2082 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2083 not_at_start);
2084 // Here we merge the quick match details of the two branches.
2085 details->Merge(&new_details, characters_filled_in);
2086 }
2087 }
2088
2089 namespace {
2090
2091 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2092 void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
2093 Label* non_word, bool fall_through_on_word) {
2094 if (assembler->CheckSpecialCharacterClass(
2095 fall_through_on_word ? StandardCharacterSet::kWord
2096 : StandardCharacterSet::kNotWord,
2097 fall_through_on_word ? non_word : word)) {
2098 // Optimized implementation available.
2099 return;
2100 }
2101 assembler->CheckCharacterGT('z', non_word);
2102 assembler->CheckCharacterLT('0', non_word);
2103 assembler->CheckCharacterGT('a' - 1, word);
2104 assembler->CheckCharacterLT('9' + 1, word);
2105 assembler->CheckCharacterLT('A', non_word);
2106 assembler->CheckCharacterLT('Z' + 1, word);
2107 if (fall_through_on_word) {
2108 assembler->CheckNotCharacter('_', non_word);
2109 } else {
2110 assembler->CheckCharacter('_', word);
2111 }
2112 }
2113
2114 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2115 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2116 void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
2117 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2118
2119 // We will load the previous character into the current character register.
2120 Trace new_trace(*trace);
2121 new_trace.InvalidateCurrentCharacter();
2122
2123 // A positive (> 0) cp_offset means we've already successfully matched a
2124 // non-empty-width part of the pattern, and thus cannot be at or before the
2125 // start of the subject string. We can thus skip both at-start and
2126 // bounds-checks when loading the one-character lookbehind.
2127 const bool may_be_at_or_before_subject_string_start =
2128 new_trace.cp_offset() <= 0;
2129
2130 Label ok;
2131 if (may_be_at_or_before_subject_string_start) {
2132 // The start of input counts as a newline in this context, so skip to ok if
2133 // we are at the start.
2134 assembler->CheckAtStart(new_trace.cp_offset(), &ok);
2135 }
2136
2137 // If we've already checked that we are not at the start of input, it's okay
2138 // to load the previous character without bounds checks.
2139 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2140 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2141 new_trace.backtrack(), can_skip_bounds_check);
2142 if (!assembler->CheckSpecialCharacterClass(
2143 StandardCharacterSet::kLineTerminator, new_trace.backtrack())) {
2144 // Newline means \n, \r, 0x2028 or 0x2029.
2145 if (!compiler->one_byte()) {
2146 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2147 }
2148 assembler->CheckCharacter('\n', &ok);
2149 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2150 }
2151 assembler->Bind(&ok);
2152 on_success->Emit(compiler, &new_trace);
2153 }
2154
2155 } // namespace
2156
2157 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2158 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2159 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2160 Isolate* isolate = assembler->isolate();
2161 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2162 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2163 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2164 if (lookahead == nullptr) {
2165 int eats_at_least =
2166 std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(not_at_start));
2167 if (eats_at_least >= 1) {
2168 BoyerMooreLookahead* bm =
2169 zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
2170 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2171 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2172 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2173 }
2174 } else {
2175 if (lookahead->at(0)->is_non_word())
2176 next_is_word_character = Trace::FALSE_VALUE;
2177 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2178 }
2179 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2180 if (next_is_word_character == Trace::UNKNOWN) {
2181 Label before_non_word;
2182 Label before_word;
2183 if (trace->characters_preloaded() != 1) {
2184 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2185 }
2186 // Fall through on non-word.
2187 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2188 // Next character is not a word character.
2189 assembler->Bind(&before_non_word);
2190 Label ok;
2191 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2192 assembler->GoTo(&ok);
2193
2194 assembler->Bind(&before_word);
2195 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2196 assembler->Bind(&ok);
2197 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2198 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2199 } else {
2200 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
2201 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2202 }
2203 }
2204
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)2205 void AssertionNode::BacktrackIfPrevious(
2206 RegExpCompiler* compiler, Trace* trace,
2207 AssertionNode::IfPrevious backtrack_if_previous) {
2208 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2209 Trace new_trace(*trace);
2210 new_trace.InvalidateCurrentCharacter();
2211
2212 Label fall_through;
2213 Label* non_word = backtrack_if_previous == kIsNonWord ? new_trace.backtrack()
2214 : &fall_through;
2215 Label* word = backtrack_if_previous == kIsNonWord ? &fall_through
2216 : new_trace.backtrack();
2217
2218 // A positive (> 0) cp_offset means we've already successfully matched a
2219 // non-empty-width part of the pattern, and thus cannot be at or before the
2220 // start of the subject string. We can thus skip both at-start and
2221 // bounds-checks when loading the one-character lookbehind.
2222 const bool may_be_at_or_before_subject_string_start =
2223 new_trace.cp_offset() <= 0;
2224
2225 if (may_be_at_or_before_subject_string_start) {
2226 // The start of input counts as a non-word character, so the question is
2227 // decided if we are at the start.
2228 assembler->CheckAtStart(new_trace.cp_offset(), non_word);
2229 }
2230
2231 // If we've already checked that we are not at the start of input, it's okay
2232 // to load the previous character without bounds checks.
2233 const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
2234 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, non_word,
2235 can_skip_bounds_check);
2236 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2237
2238 assembler->Bind(&fall_through);
2239 on_success()->Emit(compiler, &new_trace);
2240 }
2241
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2242 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2243 RegExpCompiler* compiler,
2244 int filled_in, bool not_at_start) {
2245 if (assertion_type_ == AT_START && not_at_start) {
2246 details->set_cannot_match();
2247 return;
2248 }
2249 return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2250 not_at_start);
2251 }
2252
Emit(RegExpCompiler * compiler,Trace * trace)2253 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2254 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2255 switch (assertion_type_) {
2256 case AT_END: {
2257 Label ok;
2258 assembler->CheckPosition(trace->cp_offset(), &ok);
2259 assembler->GoTo(trace->backtrack());
2260 assembler->Bind(&ok);
2261 break;
2262 }
2263 case AT_START: {
2264 if (trace->at_start() == Trace::FALSE_VALUE) {
2265 assembler->GoTo(trace->backtrack());
2266 return;
2267 }
2268 if (trace->at_start() == Trace::UNKNOWN) {
2269 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2270 Trace at_start_trace = *trace;
2271 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2272 on_success()->Emit(compiler, &at_start_trace);
2273 return;
2274 }
2275 } break;
2276 case AFTER_NEWLINE:
2277 EmitHat(compiler, on_success(), trace);
2278 return;
2279 case AT_BOUNDARY:
2280 case AT_NON_BOUNDARY: {
2281 EmitBoundaryCheck(compiler, trace);
2282 return;
2283 }
2284 }
2285 on_success()->Emit(compiler, trace);
2286 }
2287
2288 namespace {
2289
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2290 bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2291 if (quick_check == nullptr) return false;
2292 if (offset >= quick_check->characters()) return false;
2293 return quick_check->positions(offset)->determines_perfectly;
2294 }
2295
UpdateBoundsCheck(int index,int * checked_up_to)2296 void UpdateBoundsCheck(int index, int* checked_up_to) {
2297 if (index > *checked_up_to) {
2298 *checked_up_to = index;
2299 }
2300 }
2301
2302 } // namespace
2303
2304 // We call this repeatedly to generate code for each pass over the text node.
2305 // The passes are in increasing order of difficulty because we hope one
2306 // of the first passes will fail in which case we are saved the work of the
2307 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2308 // we will check the '%' in the first pass, the case independent 'a' in the
2309 // second pass and the character class in the last pass.
2310 //
2311 // The passes are done from right to left, so for example to test for /bar/
2312 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2313 // and then a 'b' with offset 0. This means we can avoid the end-of-input
2314 // bounds check most of the time. In the example we only need to check for
2315 // end-of-input when loading the putative 'r'.
2316 //
2317 // A slight complication involves the fact that the first character may already
2318 // be fetched into a register by the previous node. In this case we want to
2319 // do the test for that character first. We do this in separate passes. The
2320 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2321 // pass has been performed then subsequent passes will have true in
2322 // first_element_checked to indicate that that character does not need to be
2323 // checked again.
2324 //
2325 // In addition to all this we are passed a Trace, which can
2326 // contain an AlternativeGeneration object. In this AlternativeGeneration
2327 // object we can see details of any quick check that was already passed in
2328 // order to get to the code we are now generating. The quick check can involve
2329 // loading characters, which means we do not need to recheck the bounds
2330 // up to the limit the quick check already checked. In addition the quick
2331 // check can have involved a mask and compare operation which may simplify
2332 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2333 void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
2334 bool preloaded, Trace* trace,
2335 bool first_element_checked, int* checked_up_to) {
2336 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2337 Isolate* isolate = assembler->isolate();
2338 bool one_byte = compiler->one_byte();
2339 Label* backtrack = trace->backtrack();
2340 QuickCheckDetails* quick_check = trace->quick_check_performed();
2341 int element_count = elements()->length();
2342 int backward_offset = read_backward() ? -Length() : 0;
2343 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2344 TextElement elm = elements()->at(i);
2345 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2346 if (elm.text_type() == TextElement::ATOM) {
2347 if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
2348 base::Vector<const base::uc16> quarks = elm.atom()->data();
2349 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2350 if (first_element_checked && i == 0 && j == 0) continue;
2351 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2352 base::uc16 quark = quarks[j];
2353 if (IsIgnoreCase(compiler->flags())) {
2354 // Everywhere else we assume that a non-Latin-1 character cannot match
2355 // a Latin-1 character. Avoid the cases where this is assumption is
2356 // invalid by using the Latin1 equivalent instead.
2357 quark = unibrow::Latin1::TryConvertToLatin1(quark);
2358 }
2359 bool needs_bounds_check =
2360 *checked_up_to < cp_offset + j || read_backward();
2361 bool bounds_checked = false;
2362 switch (pass) {
2363 case NON_LATIN1_MATCH:
2364 DCHECK(one_byte);
2365 if (quark > String::kMaxOneByteCharCode) {
2366 assembler->GoTo(backtrack);
2367 return;
2368 }
2369 break;
2370 case NON_LETTER_CHARACTER_MATCH:
2371 bounds_checked =
2372 EmitAtomNonLetter(isolate, compiler, quark, backtrack,
2373 cp_offset + j, needs_bounds_check, preloaded);
2374 break;
2375 case SIMPLE_CHARACTER_MATCH:
2376 bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
2377 backtrack, cp_offset + j,
2378 needs_bounds_check, preloaded);
2379 break;
2380 case CASE_CHARACTER_MATCH:
2381 bounds_checked =
2382 EmitAtomLetter(isolate, compiler, quark, backtrack,
2383 cp_offset + j, needs_bounds_check, preloaded);
2384 break;
2385 default:
2386 break;
2387 }
2388 if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2389 }
2390 } else {
2391 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
2392 if (pass == CHARACTER_CLASS_MATCH) {
2393 if (first_element_checked && i == 0) continue;
2394 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2395 RegExpCharacterClass* cc = elm.char_class();
2396 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2397 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2398 bounds_check, preloaded, zone());
2399 UpdateBoundsCheck(cp_offset, checked_up_to);
2400 }
2401 }
2402 }
2403 }
2404
Length()2405 int TextNode::Length() {
2406 TextElement elm = elements()->last();
2407 DCHECK_LE(0, elm.cp_offset());
2408 return elm.cp_offset() + elm.length();
2409 }
2410
SkipPass(TextEmitPassType pass,bool ignore_case)2411 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
2412 if (ignore_case) {
2413 return pass == SIMPLE_CHARACTER_MATCH;
2414 } else {
2415 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2416 }
2417 }
2418
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success)2419 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
2420 ZoneList<CharacterRange>* ranges,
2421 bool read_backward,
2422 RegExpNode* on_success) {
2423 DCHECK_NOT_NULL(ranges);
2424 // TODO(jgruber): There's no fundamental need to create this
2425 // RegExpCharacterClass; we could refactor to avoid the allocation.
2426 return zone->New<TextNode>(zone->New<RegExpCharacterClass>(zone, ranges),
2427 read_backward, on_success);
2428 }
2429
CreateForSurrogatePair(Zone * zone,CharacterRange lead,ZoneList<CharacterRange> * trail_ranges,bool read_backward,RegExpNode * on_success)2430 TextNode* TextNode::CreateForSurrogatePair(
2431 Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
2432 bool read_backward, RegExpNode* on_success) {
2433 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
2434 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2435 elms->Add(TextElement::CharClass(
2436 zone->New<RegExpCharacterClass>(zone, lead_ranges)),
2437 zone);
2438 elms->Add(TextElement::CharClass(
2439 zone->New<RegExpCharacterClass>(zone, trail_ranges)),
2440 zone);
2441 return zone->New<TextNode>(elms, read_backward, on_success);
2442 }
2443
CreateForSurrogatePair(Zone * zone,ZoneList<CharacterRange> * lead_ranges,CharacterRange trail,bool read_backward,RegExpNode * on_success)2444 TextNode* TextNode::CreateForSurrogatePair(
2445 Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
2446 bool read_backward, RegExpNode* on_success) {
2447 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
2448 ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
2449 elms->Add(TextElement::CharClass(
2450 zone->New<RegExpCharacterClass>(zone, lead_ranges)),
2451 zone);
2452 elms->Add(TextElement::CharClass(
2453 zone->New<RegExpCharacterClass>(zone, trail_ranges)),
2454 zone);
2455 return zone->New<TextNode>(elms, read_backward, on_success);
2456 }
2457
2458 // This generates the code to match a text node. A text node can contain
2459 // straight character sequences (possibly to be matched in a case-independent
2460 // way) and character classes. For efficiency we do not do this in a single
2461 // pass from left to right. Instead we pass over the text node several times,
2462 // emitting code for some character positions every time. See the comment on
2463 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2464 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2465 LimitResult limit_result = LimitVersions(compiler, trace);
2466 if (limit_result == DONE) return;
2467 DCHECK(limit_result == CONTINUE);
2468
2469 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2470 compiler->SetRegExpTooBig();
2471 return;
2472 }
2473
2474 if (compiler->one_byte()) {
2475 int dummy = 0;
2476 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2477 }
2478
2479 bool first_elt_done = false;
2480 int bound_checked_to = trace->cp_offset() - 1;
2481 bound_checked_to += trace->bound_checked_up_to();
2482
2483 // If a character is preloaded into the current character register then
2484 // check that now.
2485 if (trace->characters_preloaded() == 1) {
2486 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2487 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2488 false, &bound_checked_to);
2489 }
2490 first_elt_done = true;
2491 }
2492
2493 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2494 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2495 first_elt_done, &bound_checked_to);
2496 }
2497
2498 Trace successor_trace(*trace);
2499 // If we advance backward, we may end up at the start.
2500 successor_trace.AdvanceCurrentPositionInTrace(
2501 read_backward() ? -Length() : Length(), compiler);
2502 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2503 : Trace::FALSE_VALUE);
2504 RecursionCheck rc(compiler);
2505 on_success()->Emit(compiler, &successor_trace);
2506 }
2507
InvalidateCurrentCharacter()2508 void Trace::InvalidateCurrentCharacter() { characters_preloaded_ = 0; }
2509
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2510 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2511 // We don't have an instruction for shifting the current character register
2512 // down or for using a shifted value for anything so lets just forget that
2513 // we preloaded any characters into it.
2514 characters_preloaded_ = 0;
2515 // Adjust the offsets of the quick check performed information. This
2516 // information is used to find out what we already determined about the
2517 // characters by means of mask and compare.
2518 quick_check_performed_.Advance(by, compiler->one_byte());
2519 cp_offset_ += by;
2520 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2521 compiler->SetRegExpTooBig();
2522 cp_offset_ = 0;
2523 }
2524 bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
2525 }
2526
MakeCaseIndependent(Isolate * isolate,bool is_one_byte,RegExpFlags flags)2527 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
2528 RegExpFlags flags) {
2529 if (!IsIgnoreCase(flags)) return;
2530 #ifdef V8_INTL_SUPPORT
2531 if (NeedsUnicodeCaseEquivalents(flags)) return;
2532 #endif
2533
2534 int element_count = elements()->length();
2535 for (int i = 0; i < element_count; i++) {
2536 TextElement elm = elements()->at(i);
2537 if (elm.text_type() == TextElement::CHAR_CLASS) {
2538 RegExpCharacterClass* cc = elm.char_class();
2539 // None of the standard character classes is different in the case
2540 // independent case and it slows us down if we don't know that.
2541 if (cc->is_standard(zone())) continue;
2542 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2543 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
2544 }
2545 }
2546 }
2547
GreedyLoopTextLength()2548 int TextNode::GreedyLoopTextLength() { return Length(); }
2549
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)2550 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
2551 RegExpCompiler* compiler) {
2552 if (read_backward()) return nullptr;
2553 if (elements()->length() != 1) return nullptr;
2554 TextElement elm = elements()->at(0);
2555 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2556 RegExpCharacterClass* node = elm.char_class();
2557 ZoneList<CharacterRange>* ranges = node->ranges(zone());
2558 CharacterRange::Canonicalize(ranges);
2559 if (node->is_negated()) {
2560 return ranges->length() == 0 ? on_success() : nullptr;
2561 }
2562 if (ranges->length() != 1) return nullptr;
2563 const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
2564 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
2565 }
2566
2567 // Finds the fixed match length of a sequence of nodes that goes from
2568 // this alternative and back to this choice node. If there are variable
2569 // length nodes or other complications in the way then return a sentinel
2570 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)2571 int ChoiceNode::GreedyLoopTextLengthForAlternative(
2572 GuardedAlternative* alternative) {
2573 int length = 0;
2574 RegExpNode* node = alternative->node();
2575 // Later we will generate code for all these text nodes using recursion
2576 // so we have to limit the max number.
2577 int recursion_depth = 0;
2578 while (node != this) {
2579 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2580 return kNodeIsTooComplexForGreedyLoops;
2581 }
2582 int node_length = node->GreedyLoopTextLength();
2583 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2584 return kNodeIsTooComplexForGreedyLoops;
2585 }
2586 length += node_length;
2587 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2588 node = seq_node->on_success();
2589 }
2590 if (read_backward()) {
2591 length = -length;
2592 }
2593 // Check that we can jump by the whole text length. If not, return sentinel
2594 // to indicate the we can't construct a greedy loop.
2595 if (length < RegExpMacroAssembler::kMinCPOffset ||
2596 length > RegExpMacroAssembler::kMaxCPOffset) {
2597 return kNodeIsTooComplexForGreedyLoops;
2598 }
2599 return length;
2600 }
2601
AddLoopAlternative(GuardedAlternative alt)2602 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2603 DCHECK_NULL(loop_node_);
2604 AddAlternative(alt);
2605 loop_node_ = alt.node();
2606 }
2607
AddContinueAlternative(GuardedAlternative alt)2608 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2609 DCHECK_NULL(continue_node_);
2610 AddAlternative(alt);
2611 continue_node_ = alt.node();
2612 }
2613
Emit(RegExpCompiler * compiler,Trace * trace)2614 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2615 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2616 if (trace->stop_node() == this) {
2617 // Back edge of greedy optimized loop node graph.
2618 int text_length =
2619 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
2620 DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
2621 // Update the counter-based backtracking info on the stack. This is an
2622 // optimization for greedy loops (see below).
2623 DCHECK(trace->cp_offset() == text_length);
2624 macro_assembler->AdvanceCurrentPosition(text_length);
2625 macro_assembler->GoTo(trace->loop_label());
2626 return;
2627 }
2628 DCHECK_NULL(trace->stop_node());
2629 if (!trace->is_trivial()) {
2630 trace->Flush(compiler, this);
2631 return;
2632 }
2633 ChoiceNode::Emit(compiler, trace);
2634 }
2635
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)2636 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2637 int eats_at_least) {
2638 int preload_characters = std::min(4, eats_at_least);
2639 DCHECK_LE(preload_characters, 4);
2640 if (compiler->macro_assembler()->CanReadUnaligned()) {
2641 bool one_byte = compiler->one_byte();
2642 if (one_byte) {
2643 // We can't preload 3 characters because there is no machine instruction
2644 // to do that. We can't just load 4 because we could be reading
2645 // beyond the end of the string, which could cause a memory fault.
2646 if (preload_characters == 3) preload_characters = 2;
2647 } else {
2648 if (preload_characters > 2) preload_characters = 2;
2649 }
2650 } else {
2651 if (preload_characters > 1) preload_characters = 1;
2652 }
2653 return preload_characters;
2654 }
2655
2656 // This class is used when generating the alternatives in a choice node. It
2657 // records the way the alternative is being code generated.
2658 class AlternativeGeneration : public Malloced {
2659 public:
AlternativeGeneration()2660 AlternativeGeneration()
2661 : possible_success(),
2662 expects_preload(false),
2663 after(),
2664 quick_check_details() {}
2665 Label possible_success;
2666 bool expects_preload;
2667 Label after;
2668 QuickCheckDetails quick_check_details;
2669 };
2670
2671 // Creates a list of AlternativeGenerations. If the list has a reasonable
2672 // size then it is on the stack, otherwise the excess is on the heap.
2673 class AlternativeGenerationList {
2674 public:
AlternativeGenerationList(int count,Zone * zone)2675 AlternativeGenerationList(int count, Zone* zone) : alt_gens_(count, zone) {
2676 for (int i = 0; i < count && i < kAFew; i++) {
2677 alt_gens_.Add(a_few_alt_gens_ + i, zone);
2678 }
2679 for (int i = kAFew; i < count; i++) {
2680 alt_gens_.Add(new AlternativeGeneration(), zone);
2681 }
2682 }
~AlternativeGenerationList()2683 ~AlternativeGenerationList() {
2684 for (int i = kAFew; i < alt_gens_.length(); i++) {
2685 delete alt_gens_[i];
2686 alt_gens_[i] = nullptr;
2687 }
2688 }
2689
at(int i)2690 AlternativeGeneration* at(int i) { return alt_gens_[i]; }
2691
2692 private:
2693 static const int kAFew = 10;
2694 ZoneList<AlternativeGeneration*> alt_gens_;
2695 AlternativeGeneration a_few_alt_gens_[kAFew];
2696 };
2697
Set(int character)2698 void BoyerMoorePositionInfo::Set(int character) {
2699 SetInterval(Interval(character, character));
2700 }
2701
2702 namespace {
2703
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)2704 ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
2705 int ranges_length, Interval new_range) {
2706 DCHECK_EQ(1, ranges_length & 1);
2707 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
2708 if (containment == kLatticeUnknown) return containment;
2709 bool inside = false;
2710 int last = 0;
2711 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
2712 // Consider the range from last to ranges[i].
2713 // We haven't got to the new range yet.
2714 if (ranges[i] <= new_range.from()) continue;
2715 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
2716 // inclusive, but the values in ranges are not.
2717 if (last <= new_range.from() && new_range.to() < ranges[i]) {
2718 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
2719 }
2720 return kLatticeUnknown;
2721 }
2722 return containment;
2723 }
2724
BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset)2725 int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
2726 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
2727 2 * kInt64Size * kBitsPerByte);
2728
2729 // Slight fiddling is needed here, since the bitset is of length 128 while
2730 // CountTrailingZeros requires an integral type and std::bitset can only
2731 // convert to unsigned long long. So we handle the most- and least-significant
2732 // bits separately.
2733
2734 {
2735 static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
2736 BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
2737 STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
2738 uint64_t lsb = masked_bitset.to_ullong();
2739 if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
2740 }
2741
2742 {
2743 BoyerMoorePositionInfo::Bitset masked_bitset = bitset >> 64;
2744 uint64_t msb = masked_bitset.to_ullong();
2745 if (msb != 0) return 64 + base::bits::CountTrailingZeros(msb);
2746 }
2747
2748 return -1;
2749 }
2750
2751 } // namespace
2752
SetInterval(const Interval & interval)2753 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
2754 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2755
2756 if (interval.size() >= kMapSize) {
2757 map_count_ = kMapSize;
2758 map_.set();
2759 return;
2760 }
2761
2762 for (int i = interval.from(); i <= interval.to(); i++) {
2763 int mod_character = (i & kMask);
2764 if (!map_[mod_character]) {
2765 map_count_++;
2766 map_.set(mod_character);
2767 }
2768 if (map_count_ == kMapSize) return;
2769 }
2770 }
2771
SetAll()2772 void BoyerMoorePositionInfo::SetAll() {
2773 w_ = kLatticeUnknown;
2774 if (map_count_ != kMapSize) {
2775 map_count_ = kMapSize;
2776 map_.set();
2777 }
2778 }
2779
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)2780 BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
2781 Zone* zone)
2782 : length_(length),
2783 compiler_(compiler),
2784 max_char_(MaxCodeUnit(compiler->one_byte())) {
2785 bitmaps_ = zone->New<ZoneList<BoyerMoorePositionInfo*>>(length, zone);
2786 for (int i = 0; i < length; i++) {
2787 bitmaps_->Add(zone->New<BoyerMoorePositionInfo>(), zone);
2788 }
2789 }
2790
2791 // Find the longest range of lookahead that has the fewest number of different
2792 // characters that can occur at a given position. Since we are optimizing two
2793 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)2794 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
2795 int biggest_points = 0;
2796 // If more than 32 characters out of 128 can occur it is unlikely that we can
2797 // be lucky enough to step forwards much of the time.
2798 const int kMaxMax = 32;
2799 for (int max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2800 max_number_of_chars *= 2) {
2801 biggest_points =
2802 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2803 }
2804 if (biggest_points == 0) return false;
2805 return true;
2806 }
2807
2808 // Find the highest-points range between 0 and length_ where the character
2809 // information is not too vague. 'Too vague' means that there are more than
2810 // max_number_of_chars that can occur at this position. Calculates the number
2811 // of points as the product of width-of-the-range and
2812 // probability-of-finding-one-of-the-characters, where the probability is
2813 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)2814 int BoyerMooreLookahead::FindBestInterval(int max_number_of_chars,
2815 int old_biggest_points, int* from,
2816 int* to) {
2817 int biggest_points = old_biggest_points;
2818 static const int kSize = RegExpMacroAssembler::kTableSize;
2819 for (int i = 0; i < length_;) {
2820 while (i < length_ && Count(i) > max_number_of_chars) i++;
2821 if (i == length_) break;
2822 int remembered_from = i;
2823
2824 BoyerMoorePositionInfo::Bitset union_bitset;
2825 for (; i < length_ && Count(i) <= max_number_of_chars; i++) {
2826 union_bitset |= bitmaps_->at(i)->raw_bitset();
2827 }
2828
2829 int frequency = 0;
2830
2831 // Iterate only over set bits.
2832 int j;
2833 while ((j = BitsetFirstSetBit(union_bitset)) != -1) {
2834 DCHECK(union_bitset[j]); // Sanity check.
2835 // Add 1 to the frequency to give a small per-character boost for
2836 // the cases where our sampling is not good enough and many
2837 // characters have a frequency of zero. This means the frequency
2838 // can theoretically be up to 2*kSize though we treat it mostly as
2839 // a fraction of kSize.
2840 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2841 union_bitset.reset(j);
2842 }
2843
2844 // We use the probability of skipping times the distance we are skipping to
2845 // judge the effectiveness of this. Actually we have a cut-off: By
2846 // dividing by 2 we switch off the skipping if the probability of skipping
2847 // is less than 50%. This is because the multibyte mask-and-compare
2848 // skipping in quickcheck is more likely to do well on this case.
2849 bool in_quickcheck_range =
2850 ((i - remembered_from < 4) ||
2851 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2852 // Called 'probability' but it is only a rough estimate and can actually
2853 // be outside the 0-kSize range.
2854 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2855 int points = (i - remembered_from) * probability;
2856 if (points > biggest_points) {
2857 *from = remembered_from;
2858 *to = i - 1;
2859 biggest_points = points;
2860 }
2861 }
2862 return biggest_points;
2863 }
2864
2865 // Take all the characters that will not prevent a successful match if they
2866 // occur in the subject string in the range between min_lookahead and
2867 // max_lookahead (inclusive) measured from the current position. If the
2868 // character at max_lookahead offset is not one of these characters, then we
2869 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)2870 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int max_lookahead,
2871 Handle<ByteArray> boolean_skip_table) {
2872 const int kSkipArrayEntry = 0;
2873 const int kDontSkipArrayEntry = 1;
2874
2875 std::memset(boolean_skip_table->GetDataStartAddress(), kSkipArrayEntry,
2876 boolean_skip_table->length());
2877
2878 for (int i = max_lookahead; i >= min_lookahead; i--) {
2879 BoyerMoorePositionInfo::Bitset bitset = bitmaps_->at(i)->raw_bitset();
2880
2881 // Iterate only over set bits.
2882 int j;
2883 while ((j = BitsetFirstSetBit(bitset)) != -1) {
2884 DCHECK(bitset[j]); // Sanity check.
2885 boolean_skip_table->set(j, kDontSkipArrayEntry);
2886 bitset.reset(j);
2887 }
2888 }
2889
2890 const int skip = max_lookahead + 1 - min_lookahead;
2891 return skip;
2892 }
2893
2894 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)2895 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
2896 const int kSize = RegExpMacroAssembler::kTableSize;
2897
2898 int min_lookahead = 0;
2899 int max_lookahead = 0;
2900
2901 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2902
2903 // Check if we only have a single non-empty position info, and that info
2904 // contains precisely one character.
2905 bool found_single_character = false;
2906 int single_character = 0;
2907 for (int i = max_lookahead; i >= min_lookahead; i--) {
2908 BoyerMoorePositionInfo* map = bitmaps_->at(i);
2909 if (map->map_count() == 0) continue;
2910
2911 if (found_single_character || map->map_count() > 1) {
2912 found_single_character = false;
2913 break;
2914 }
2915
2916 DCHECK(!found_single_character);
2917 DCHECK_EQ(map->map_count(), 1);
2918
2919 found_single_character = true;
2920 single_character = BitsetFirstSetBit(map->raw_bitset());
2921
2922 DCHECK_NE(single_character, -1);
2923 }
2924
2925 int lookahead_width = max_lookahead + 1 - min_lookahead;
2926
2927 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2928 // The mask-compare can probably handle this better.
2929 return;
2930 }
2931
2932 if (found_single_character) {
2933 Label cont, again;
2934 masm->Bind(&again);
2935 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2936 if (max_char_ > kSize) {
2937 masm->CheckCharacterAfterAnd(single_character,
2938 RegExpMacroAssembler::kTableMask, &cont);
2939 } else {
2940 masm->CheckCharacter(single_character, &cont);
2941 }
2942 masm->AdvanceCurrentPosition(lookahead_width);
2943 masm->GoTo(&again);
2944 masm->Bind(&cont);
2945 return;
2946 }
2947
2948 Factory* factory = masm->isolate()->factory();
2949 Handle<ByteArray> boolean_skip_table =
2950 factory->NewByteArray(kSize, AllocationType::kOld);
2951 int skip_distance =
2952 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
2953 DCHECK_NE(0, skip_distance);
2954
2955 Label cont, again;
2956 masm->Bind(&again);
2957 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2958 masm->CheckBitInTable(boolean_skip_table, &cont);
2959 masm->AdvanceCurrentPosition(skip_distance);
2960 masm->GoTo(&again);
2961 masm->Bind(&cont);
2962 }
2963
2964 /* Code generation for choice nodes.
2965 *
2966 * We generate quick checks that do a mask and compare to eliminate a
2967 * choice. If the quick check succeeds then it jumps to the continuation to
2968 * do slow checks and check subsequent nodes. If it fails (the common case)
2969 * it falls through to the next choice.
2970 *
2971 * Here is the desired flow graph. Nodes directly below each other imply
2972 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2973 * 3 doesn't have a quick check so we have to call the slow check.
2974 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2975 * regexp continuation is generated directly after the Sn node, up to the
2976 * next GoTo if we decide to reuse some already generated code. Some
2977 * nodes expect preload_characters to be preloaded into the current
2978 * character register. R nodes do this preloading. Vertices are marked
2979 * F for failures and S for success (possible success in the case of quick
2980 * nodes). L, V, < and > are used as arrow heads.
2981 *
2982 * ----------> R
2983 * |
2984 * V
2985 * Q1 -----> S1
2986 * | S /
2987 * F| /
2988 * | F/
2989 * | /
2990 * | R
2991 * | /
2992 * V L
2993 * Q2 -----> S2
2994 * | S /
2995 * F| /
2996 * | F/
2997 * | /
2998 * | R
2999 * | /
3000 * V L
3001 * S3
3002 * |
3003 * F|
3004 * |
3005 * R
3006 * |
3007 * backtrack V
3008 * <----------Q4
3009 * \ F |
3010 * \ |S
3011 * \ F V
3012 * \-----S4
3013 *
3014 * For greedy loops we push the current position, then generate the code that
3015 * eats the input specially in EmitGreedyLoop. The other choice (the
3016 * continuation) is generated by the normal code in EmitChoices, and steps back
3017 * in the input to the starting position when it fails to match. The loop code
3018 * looks like this (U is the unwind code that steps back in the greedy loop).
3019 *
3020 * _____
3021 * / \
3022 * V |
3023 * ----------> S1 |
3024 * /| |
3025 * / |S |
3026 * F/ \_____/
3027 * /
3028 * |<-----
3029 * | \
3030 * V |S
3031 * Q2 ---> U----->backtrack
3032 * | F /
3033 * S| /
3034 * V F /
3035 * S2--/
3036 */
3037
GreedyLoopState(bool not_at_start)3038 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3039 counter_backtrack_trace_.set_backtrack(&label_);
3040 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3041 }
3042
AssertGuardsMentionRegisters(Trace * trace)3043 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3044 #ifdef DEBUG
3045 int choice_count = alternatives_->length();
3046 for (int i = 0; i < choice_count - 1; i++) {
3047 GuardedAlternative alternative = alternatives_->at(i);
3048 ZoneList<Guard*>* guards = alternative.guards();
3049 int guard_count = (guards == nullptr) ? 0 : guards->length();
3050 for (int j = 0; j < guard_count; j++) {
3051 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3052 }
3053 }
3054 #endif
3055 }
3056
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3057 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
3058 PreloadState* state) {
3059 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3060 // Save some time by looking at most one machine word ahead.
3061 state->eats_at_least_ =
3062 EatsAtLeast(current_trace->at_start() == Trace::FALSE_VALUE);
3063 }
3064 state->preload_characters_ =
3065 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3066
3067 state->preload_is_current_ =
3068 (current_trace->characters_preloaded() == state->preload_characters_);
3069 state->preload_has_checked_bounds_ = state->preload_is_current_;
3070 }
3071
Emit(RegExpCompiler * compiler,Trace * trace)3072 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3073 int choice_count = alternatives_->length();
3074
3075 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3076 alternatives_->at(0).node()->Emit(compiler, trace);
3077 return;
3078 }
3079
3080 AssertGuardsMentionRegisters(trace);
3081
3082 LimitResult limit_result = LimitVersions(compiler, trace);
3083 if (limit_result == DONE) return;
3084 DCHECK(limit_result == CONTINUE);
3085
3086 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3087 // other choice nodes we only flush if we are out of code size budget.
3088 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3089 trace->Flush(compiler, this);
3090 return;
3091 }
3092
3093 RecursionCheck rc(compiler);
3094
3095 PreloadState preload;
3096 preload.init();
3097 GreedyLoopState greedy_loop_state(not_at_start());
3098
3099 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3100 AlternativeGenerationList alt_gens(choice_count, zone());
3101
3102 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3103 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3104 &greedy_loop_state, text_length);
3105 } else {
3106 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3107 // match the traces produced pre-cleanup.
3108 Label second_choice;
3109 compiler->macro_assembler()->Bind(&second_choice);
3110
3111 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3112
3113 EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3114 }
3115
3116 // At this point we need to generate slow checks for the alternatives where
3117 // the quick check was inlined. We can recognize these because the associated
3118 // label was bound.
3119 int new_flush_budget = trace->flush_budget() / choice_count;
3120 for (int i = 0; i < choice_count; i++) {
3121 AlternativeGeneration* alt_gen = alt_gens.at(i);
3122 Trace new_trace(*trace);
3123 // If there are actions to be flushed we have to limit how many times
3124 // they are flushed. Take the budget of the parent trace and distribute
3125 // it fairly amongst the children.
3126 if (new_trace.actions() != nullptr) {
3127 new_trace.set_flush_budget(new_flush_budget);
3128 }
3129 bool next_expects_preload =
3130 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3131 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->at(i),
3132 alt_gen, preload.preload_characters_,
3133 next_expects_preload);
3134 }
3135 }
3136
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3137 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
3138 AlternativeGenerationList* alt_gens,
3139 PreloadState* preload,
3140 GreedyLoopState* greedy_loop_state,
3141 int text_length) {
3142 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3143 // Here we have special handling for greedy loops containing only text nodes
3144 // and other simple nodes. These are handled by pushing the current
3145 // position on the stack and then incrementing the current position each
3146 // time around the switch. On backtrack we decrement the current position
3147 // and check it against the pushed value. This avoids pushing backtrack
3148 // information for each iteration of the loop, which could take up a lot of
3149 // space.
3150 DCHECK(trace->stop_node() == nullptr);
3151 macro_assembler->PushCurrentPosition();
3152 Label greedy_match_failed;
3153 Trace greedy_match_trace;
3154 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3155 greedy_match_trace.set_backtrack(&greedy_match_failed);
3156 Label loop_label;
3157 macro_assembler->Bind(&loop_label);
3158 greedy_match_trace.set_stop_node(this);
3159 greedy_match_trace.set_loop_label(&loop_label);
3160 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3161 macro_assembler->Bind(&greedy_match_failed);
3162
3163 Label second_choice; // For use in greedy matches.
3164 macro_assembler->Bind(&second_choice);
3165
3166 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3167
3168 EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3169
3170 macro_assembler->Bind(greedy_loop_state->label());
3171 // If we have unwound to the bottom then backtrack.
3172 macro_assembler->CheckGreedyLoop(trace->backtrack());
3173 // Otherwise try the second priority at an earlier position.
3174 macro_assembler->AdvanceCurrentPosition(-text_length);
3175 macro_assembler->GoTo(&second_choice);
3176 return new_trace;
3177 }
3178
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)3179 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3180 Trace* trace) {
3181 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3182 if (alternatives_->length() != 2) return eats_at_least;
3183
3184 GuardedAlternative alt1 = alternatives_->at(1);
3185 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3186 return eats_at_least;
3187 }
3188 RegExpNode* eats_anything_node = alt1.node();
3189 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3190 return eats_at_least;
3191 }
3192
3193 // Really we should be creating a new trace when we execute this function,
3194 // but there is no need, because the code it generates cannot backtrack, and
3195 // we always arrive here with a trivial trace (since it's the entry to a
3196 // loop. That also implies that there are no preloaded characters, which is
3197 // good, because it means we won't be violating any assumptions by
3198 // overwriting those characters with new load instructions.
3199 DCHECK(trace->is_trivial());
3200
3201 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3202 Isolate* isolate = macro_assembler->isolate();
3203 // At this point we know that we are at a non-greedy loop that will eat
3204 // any character one at a time. Any non-anchored regexp has such a
3205 // loop prepended to it in order to find where it starts. We look for
3206 // a pattern of the form ...abc... where we can look 6 characters ahead
3207 // and step forwards 3 if the character is not one of abc. Abc need
3208 // not be atoms, they can be any reasonably limited character class or
3209 // small alternation.
3210 BoyerMooreLookahead* bm = bm_info(false);
3211 if (bm == nullptr) {
3212 eats_at_least = std::min(kMaxLookaheadForBoyerMoore, EatsAtLeast(false));
3213 if (eats_at_least >= 1) {
3214 bm = zone()->New<BoyerMooreLookahead>(eats_at_least, compiler, zone());
3215 GuardedAlternative alt0 = alternatives_->at(0);
3216 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
3217 }
3218 }
3219 if (bm != nullptr) {
3220 bm->EmitSkipInstructions(macro_assembler);
3221 }
3222 return eats_at_least;
3223 }
3224
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)3225 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3226 AlternativeGenerationList* alt_gens,
3227 int first_choice, Trace* trace,
3228 PreloadState* preload) {
3229 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3230 SetUpPreLoad(compiler, trace, preload);
3231
3232 // For now we just call all choices one after the other. The idea ultimately
3233 // is to use the Dispatch table to try only the relevant ones.
3234 int choice_count = alternatives_->length();
3235
3236 int new_flush_budget = trace->flush_budget() / choice_count;
3237
3238 for (int i = first_choice; i < choice_count; i++) {
3239 bool is_last = i == choice_count - 1;
3240 bool fall_through_on_failure = !is_last;
3241 GuardedAlternative alternative = alternatives_->at(i);
3242 AlternativeGeneration* alt_gen = alt_gens->at(i);
3243 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3244 ZoneList<Guard*>* guards = alternative.guards();
3245 int guard_count = (guards == nullptr) ? 0 : guards->length();
3246 Trace new_trace(*trace);
3247 new_trace.set_characters_preloaded(
3248 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3249 if (preload->preload_has_checked_bounds_) {
3250 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3251 }
3252 new_trace.quick_check_performed()->Clear();
3253 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3254 if (!is_last) {
3255 new_trace.set_backtrack(&alt_gen->after);
3256 }
3257 alt_gen->expects_preload = preload->preload_is_current_;
3258 bool generate_full_check_inline = false;
3259 if (compiler->optimize() &&
3260 try_to_emit_quick_check_for_alternative(i == 0) &&
3261 alternative.node()->EmitQuickCheck(
3262 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3263 &alt_gen->possible_success, &alt_gen->quick_check_details,
3264 fall_through_on_failure, this)) {
3265 // Quick check was generated for this choice.
3266 preload->preload_is_current_ = true;
3267 preload->preload_has_checked_bounds_ = true;
3268 // If we generated the quick check to fall through on possible success,
3269 // we now need to generate the full check inline.
3270 if (!fall_through_on_failure) {
3271 macro_assembler->Bind(&alt_gen->possible_success);
3272 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3273 new_trace.set_characters_preloaded(preload->preload_characters_);
3274 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3275 generate_full_check_inline = true;
3276 }
3277 } else if (alt_gen->quick_check_details.cannot_match()) {
3278 if (!fall_through_on_failure) {
3279 macro_assembler->GoTo(trace->backtrack());
3280 }
3281 continue;
3282 } else {
3283 // No quick check was generated. Put the full code here.
3284 // If this is not the first choice then there could be slow checks from
3285 // previous cases that go here when they fail. There's no reason to
3286 // insist that they preload characters since the slow check we are about
3287 // to generate probably can't use it.
3288 if (i != first_choice) {
3289 alt_gen->expects_preload = false;
3290 new_trace.InvalidateCurrentCharacter();
3291 }
3292 generate_full_check_inline = true;
3293 }
3294 if (generate_full_check_inline) {
3295 if (new_trace.actions() != nullptr) {
3296 new_trace.set_flush_budget(new_flush_budget);
3297 }
3298 for (int j = 0; j < guard_count; j++) {
3299 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
3300 }
3301 alternative.node()->Emit(compiler, &new_trace);
3302 preload->preload_is_current_ = false;
3303 }
3304 macro_assembler->Bind(&alt_gen->after);
3305 }
3306 }
3307
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)3308 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3309 Trace* trace,
3310 GuardedAlternative alternative,
3311 AlternativeGeneration* alt_gen,
3312 int preload_characters,
3313 bool next_expects_preload) {
3314 if (!alt_gen->possible_success.is_linked()) return;
3315
3316 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3317 macro_assembler->Bind(&alt_gen->possible_success);
3318 Trace out_of_line_trace(*trace);
3319 out_of_line_trace.set_characters_preloaded(preload_characters);
3320 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3321 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3322 ZoneList<Guard*>* guards = alternative.guards();
3323 int guard_count = (guards == nullptr) ? 0 : guards->length();
3324 if (next_expects_preload) {
3325 Label reload_current_char;
3326 out_of_line_trace.set_backtrack(&reload_current_char);
3327 for (int j = 0; j < guard_count; j++) {
3328 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3329 }
3330 alternative.node()->Emit(compiler, &out_of_line_trace);
3331 macro_assembler->Bind(&reload_current_char);
3332 // Reload the current character, since the next quick check expects that.
3333 // We don't need to check bounds here because we only get into this
3334 // code through a quick check which already did the checked load.
3335 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3336 preload_characters);
3337 macro_assembler->GoTo(&(alt_gen->after));
3338 } else {
3339 out_of_line_trace.set_backtrack(&(alt_gen->after));
3340 for (int j = 0; j < guard_count; j++) {
3341 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
3342 }
3343 alternative.node()->Emit(compiler, &out_of_line_trace);
3344 }
3345 }
3346
Emit(RegExpCompiler * compiler,Trace * trace)3347 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3348 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3349 LimitResult limit_result = LimitVersions(compiler, trace);
3350 if (limit_result == DONE) return;
3351 DCHECK(limit_result == CONTINUE);
3352
3353 RecursionCheck rc(compiler);
3354
3355 switch (action_type_) {
3356 case STORE_POSITION: {
3357 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3358 data_.u_position_register.is_capture,
3359 trace);
3360 Trace new_trace = *trace;
3361 new_trace.add_action(&new_capture);
3362 on_success()->Emit(compiler, &new_trace);
3363 break;
3364 }
3365 case INCREMENT_REGISTER: {
3366 Trace::DeferredIncrementRegister new_increment(
3367 data_.u_increment_register.reg);
3368 Trace new_trace = *trace;
3369 new_trace.add_action(&new_increment);
3370 on_success()->Emit(compiler, &new_trace);
3371 break;
3372 }
3373 case SET_REGISTER_FOR_LOOP: {
3374 Trace::DeferredSetRegisterForLoop new_set(data_.u_store_register.reg,
3375 data_.u_store_register.value);
3376 Trace new_trace = *trace;
3377 new_trace.add_action(&new_set);
3378 on_success()->Emit(compiler, &new_trace);
3379 break;
3380 }
3381 case CLEAR_CAPTURES: {
3382 Trace::DeferredClearCaptures new_capture(Interval(
3383 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3384 Trace new_trace = *trace;
3385 new_trace.add_action(&new_capture);
3386 on_success()->Emit(compiler, &new_trace);
3387 break;
3388 }
3389 case BEGIN_POSITIVE_SUBMATCH:
3390 case BEGIN_NEGATIVE_SUBMATCH:
3391 if (!trace->is_trivial()) {
3392 trace->Flush(compiler, this);
3393 } else {
3394 assembler->WriteCurrentPositionToRegister(
3395 data_.u_submatch.current_position_register, 0);
3396 assembler->WriteStackPointerToRegister(
3397 data_.u_submatch.stack_pointer_register);
3398 on_success()->Emit(compiler, trace);
3399 }
3400 break;
3401 case EMPTY_MATCH_CHECK: {
3402 int start_pos_reg = data_.u_empty_match_check.start_register;
3403 int stored_pos = 0;
3404 int rep_reg = data_.u_empty_match_check.repetition_register;
3405 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3406 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3407 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3408 // If we know we haven't advanced and there is no minimum we
3409 // can just backtrack immediately.
3410 assembler->GoTo(trace->backtrack());
3411 } else if (know_dist && stored_pos < trace->cp_offset()) {
3412 // If we know we've advanced we can generate the continuation
3413 // immediately.
3414 on_success()->Emit(compiler, trace);
3415 } else if (!trace->is_trivial()) {
3416 trace->Flush(compiler, this);
3417 } else {
3418 Label skip_empty_check;
3419 // If we have a minimum number of repetitions we check the current
3420 // number first and skip the empty check if it's not enough.
3421 if (has_minimum) {
3422 int limit = data_.u_empty_match_check.repetition_limit;
3423 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3424 }
3425 // If the match is empty we bail out, otherwise we fall through
3426 // to the on-success continuation.
3427 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3428 trace->backtrack());
3429 assembler->Bind(&skip_empty_check);
3430 on_success()->Emit(compiler, trace);
3431 }
3432 break;
3433 }
3434 case POSITIVE_SUBMATCH_SUCCESS: {
3435 if (!trace->is_trivial()) {
3436 trace->Flush(compiler, this);
3437 return;
3438 }
3439 assembler->ReadCurrentPositionFromRegister(
3440 data_.u_submatch.current_position_register);
3441 assembler->ReadStackPointerFromRegister(
3442 data_.u_submatch.stack_pointer_register);
3443 int clear_register_count = data_.u_submatch.clear_register_count;
3444 if (clear_register_count == 0) {
3445 on_success()->Emit(compiler, trace);
3446 return;
3447 }
3448 int clear_registers_from = data_.u_submatch.clear_register_from;
3449 Label clear_registers_backtrack;
3450 Trace new_trace = *trace;
3451 new_trace.set_backtrack(&clear_registers_backtrack);
3452 on_success()->Emit(compiler, &new_trace);
3453
3454 assembler->Bind(&clear_registers_backtrack);
3455 int clear_registers_to = clear_registers_from + clear_register_count - 1;
3456 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3457
3458 DCHECK(trace->backtrack() == nullptr);
3459 assembler->Backtrack();
3460 return;
3461 }
3462 default:
3463 UNREACHABLE();
3464 }
3465 }
3466
Emit(RegExpCompiler * compiler,Trace * trace)3467 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3468 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3469 if (!trace->is_trivial()) {
3470 trace->Flush(compiler, this);
3471 return;
3472 }
3473
3474 LimitResult limit_result = LimitVersions(compiler, trace);
3475 if (limit_result == DONE) return;
3476 DCHECK(limit_result == CONTINUE);
3477
3478 RecursionCheck rc(compiler);
3479
3480 DCHECK_EQ(start_reg_ + 1, end_reg_);
3481 if (IsIgnoreCase(flags_)) {
3482 bool unicode = IsUnicode(flags_);
3483 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
3484 unicode, trace->backtrack());
3485 } else {
3486 assembler->CheckNotBackReference(start_reg_, read_backward(),
3487 trace->backtrack());
3488 }
3489 // We are going to advance backward, so we may end up at the start.
3490 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
3491
3492 // Check that the back reference does not end inside a surrogate pair.
3493 if (IsUnicode(flags_) && !compiler->one_byte()) {
3494 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3495 }
3496 on_success()->Emit(compiler, trace);
3497 }
3498
CalculateOffsets()3499 void TextNode::CalculateOffsets() {
3500 int element_count = elements()->length();
3501 // Set up the offsets of the elements relative to the start. This is a fixed
3502 // quantity since a TextNode can only contain fixed-width things.
3503 int cp_offset = 0;
3504 for (int i = 0; i < element_count; i++) {
3505 TextElement& elm = elements()->at(i);
3506 elm.set_cp_offset(cp_offset);
3507 cp_offset += elm.length();
3508 }
3509 }
3510
3511 namespace {
3512
3513 // Assertion propagation moves information about assertions such as
3514 // \b to the affected nodes. For instance, in /.\b./ information must
3515 // be propagated to the first '.' that whatever follows needs to know
3516 // if it matched a word or a non-word, and to the second '.' that it
3517 // has to check if it succeeds a word or non-word. In this case the
3518 // result will be something like:
3519 //
3520 // +-------+ +------------+
3521 // | . | | . |
3522 // +-------+ ---> +------------+
3523 // | word? | | check word |
3524 // +-------+ +------------+
3525 class AssertionPropagator : public AllStatic {
3526 public:
VisitText(TextNode * that)3527 static void VisitText(TextNode* that) {}
3528
VisitAction(ActionNode * that)3529 static void VisitAction(ActionNode* that) {
3530 // If the next node is interested in what it follows then this node
3531 // has to be interested too so it can pass the information on.
3532 that->info()->AddFromFollowing(that->on_success()->info());
3533 }
3534
VisitChoice(ChoiceNode * that,int i)3535 static void VisitChoice(ChoiceNode* that, int i) {
3536 // Anything the following nodes need to know has to be known by
3537 // this node also, so it can pass it on.
3538 that->info()->AddFromFollowing(that->alternatives()->at(i).node()->info());
3539 }
3540
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3541 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3542 that->info()->AddFromFollowing(that->continue_node()->info());
3543 }
3544
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3545 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {
3546 that->info()->AddFromFollowing(that->loop_node()->info());
3547 }
3548
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3549 static void VisitNegativeLookaroundChoiceLookaroundNode(
3550 NegativeLookaroundChoiceNode* that) {
3551 VisitChoice(that, NegativeLookaroundChoiceNode::kLookaroundIndex);
3552 }
3553
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3554 static void VisitNegativeLookaroundChoiceContinueNode(
3555 NegativeLookaroundChoiceNode* that) {
3556 VisitChoice(that, NegativeLookaroundChoiceNode::kContinueIndex);
3557 }
3558
VisitBackReference(BackReferenceNode * that)3559 static void VisitBackReference(BackReferenceNode* that) {}
3560
VisitAssertion(AssertionNode * that)3561 static void VisitAssertion(AssertionNode* that) {}
3562 };
3563
3564 // Propagates information about the minimum size of successful matches from
3565 // successor nodes to their predecessors. Note that all eats_at_least values
3566 // are initialized to zero before analysis.
3567 class EatsAtLeastPropagator : public AllStatic {
3568 public:
VisitText(TextNode * that)3569 static void VisitText(TextNode* that) {
3570 // The eats_at_least value is not used if reading backward.
3571 if (!that->read_backward()) {
3572 // We are not at the start after this node, and thus we can use the
3573 // successor's eats_at_least_from_not_start value.
3574 uint8_t eats_at_least = base::saturated_cast<uint8_t>(
3575 that->Length() + that->on_success()
3576 ->eats_at_least_info()
3577 ->eats_at_least_from_not_start);
3578 that->set_eats_at_least_info(EatsAtLeastInfo(eats_at_least));
3579 }
3580 }
3581
VisitAction(ActionNode * that)3582 static void VisitAction(ActionNode* that) {
3583 switch (that->action_type()) {
3584 case ActionNode::BEGIN_POSITIVE_SUBMATCH:
3585 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3586 // We do not propagate eats_at_least data through positive lookarounds,
3587 // because they rewind input.
3588 // TODO(v8:11859) Potential approaches for fixing this include:
3589 // 1. Add a dedicated choice node for positive lookaround, similar to
3590 // NegativeLookaroundChoiceNode.
3591 // 2. Add an eats_at_least_inside_loop field to EatsAtLeastInfo, which
3592 // is <= eats_at_least_from_possibly_start, and use that value in
3593 // EatsAtLeastFromLoopEntry.
3594 DCHECK(that->eats_at_least_info()->IsZero());
3595 break;
3596 case ActionNode::SET_REGISTER_FOR_LOOP:
3597 // SET_REGISTER_FOR_LOOP indicates a loop entry point, which means the
3598 // loop body will run at least the minimum number of times before the
3599 // continuation case can run.
3600 that->set_eats_at_least_info(
3601 that->on_success()->EatsAtLeastFromLoopEntry());
3602 break;
3603 case ActionNode::BEGIN_NEGATIVE_SUBMATCH:
3604 default:
3605 // Otherwise, the current node eats at least as much as its successor.
3606 // Note: we can propagate eats_at_least data for BEGIN_NEGATIVE_SUBMATCH
3607 // because NegativeLookaroundChoiceNode ignores its lookaround successor
3608 // when computing eats-at-least and quick check information.
3609 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3610 break;
3611 }
3612 }
3613
VisitChoice(ChoiceNode * that,int i)3614 static void VisitChoice(ChoiceNode* that, int i) {
3615 // The minimum possible match from a choice node is the minimum of its
3616 // successors.
3617 EatsAtLeastInfo eats_at_least =
3618 i == 0 ? EatsAtLeastInfo(UINT8_MAX) : *that->eats_at_least_info();
3619 eats_at_least.SetMin(
3620 *that->alternatives()->at(i).node()->eats_at_least_info());
3621 that->set_eats_at_least_info(eats_at_least);
3622 }
3623
VisitLoopChoiceContinueNode(LoopChoiceNode * that)3624 static void VisitLoopChoiceContinueNode(LoopChoiceNode* that) {
3625 if (!that->read_backward()) {
3626 that->set_eats_at_least_info(
3627 *that->continue_node()->eats_at_least_info());
3628 }
3629 }
3630
VisitLoopChoiceLoopNode(LoopChoiceNode * that)3631 static void VisitLoopChoiceLoopNode(LoopChoiceNode* that) {}
3632
VisitNegativeLookaroundChoiceLookaroundNode(NegativeLookaroundChoiceNode * that)3633 static void VisitNegativeLookaroundChoiceLookaroundNode(
3634 NegativeLookaroundChoiceNode* that) {}
3635
VisitNegativeLookaroundChoiceContinueNode(NegativeLookaroundChoiceNode * that)3636 static void VisitNegativeLookaroundChoiceContinueNode(
3637 NegativeLookaroundChoiceNode* that) {
3638 that->set_eats_at_least_info(*that->continue_node()->eats_at_least_info());
3639 }
3640
VisitBackReference(BackReferenceNode * that)3641 static void VisitBackReference(BackReferenceNode* that) {
3642 if (!that->read_backward()) {
3643 that->set_eats_at_least_info(*that->on_success()->eats_at_least_info());
3644 }
3645 }
3646
VisitAssertion(AssertionNode * that)3647 static void VisitAssertion(AssertionNode* that) {
3648 EatsAtLeastInfo eats_at_least = *that->on_success()->eats_at_least_info();
3649 if (that->assertion_type() == AssertionNode::AT_START) {
3650 // If we know we are not at the start and we are asked "how many
3651 // characters will you match if you succeed?" then we can answer anything
3652 // since false implies false. So let's just set the max answer
3653 // (UINT8_MAX) since that won't prevent us from preloading a lot of
3654 // characters for the other branches in the node graph.
3655 eats_at_least.eats_at_least_from_not_start = UINT8_MAX;
3656 }
3657 that->set_eats_at_least_info(eats_at_least);
3658 }
3659 };
3660
3661 } // namespace
3662
3663 // -------------------------------------------------------------------
3664 // Analysis
3665
3666 // Iterates the node graph and provides the opportunity for propagators to set
3667 // values that depend on successor nodes.
3668 template <typename... Propagators>
3669 class Analysis : public NodeVisitor {
3670 public:
Analysis(Isolate * isolate,bool is_one_byte,RegExpFlags flags)3671 Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
3672 : isolate_(isolate),
3673 is_one_byte_(is_one_byte),
3674 flags_(flags),
3675 error_(RegExpError::kNone) {}
3676
EnsureAnalyzed(RegExpNode * that)3677 void EnsureAnalyzed(RegExpNode* that) {
3678 StackLimitCheck check(isolate());
3679 if (check.HasOverflowed()) {
3680 if (FLAG_correctness_fuzzer_suppressions) {
3681 FATAL("Analysis: Aborting on stack overflow");
3682 }
3683 fail(RegExpError::kAnalysisStackOverflow);
3684 return;
3685 }
3686 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
3687 that->info()->being_analyzed = true;
3688 that->Accept(this);
3689 that->info()->being_analyzed = false;
3690 that->info()->been_analyzed = true;
3691 }
3692
has_failed()3693 bool has_failed() { return error_ != RegExpError::kNone; }
error()3694 RegExpError error() {
3695 DCHECK(error_ != RegExpError::kNone);
3696 return error_;
3697 }
fail(RegExpError error)3698 void fail(RegExpError error) { error_ = error; }
3699
isolate() const3700 Isolate* isolate() const { return isolate_; }
3701
VisitEnd(EndNode * that)3702 void VisitEnd(EndNode* that) override {
3703 // nothing to do
3704 }
3705
3706 // Used to call the given static function on each propagator / variadic template
3707 // argument.
3708 #define STATIC_FOR_EACH(expr) \
3709 do { \
3710 int dummy[] = {((expr), 0)...}; \
3711 USE(dummy); \
3712 } while (false)
3713
VisitText(TextNode * that)3714 void VisitText(TextNode* that) override {
3715 that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
3716 EnsureAnalyzed(that->on_success());
3717 if (has_failed()) return;
3718 that->CalculateOffsets();
3719 STATIC_FOR_EACH(Propagators::VisitText(that));
3720 }
3721
VisitAction(ActionNode * that)3722 void VisitAction(ActionNode* that) override {
3723 EnsureAnalyzed(that->on_success());
3724 if (has_failed()) return;
3725 STATIC_FOR_EACH(Propagators::VisitAction(that));
3726 }
3727
VisitChoice(ChoiceNode * that)3728 void VisitChoice(ChoiceNode* that) override {
3729 for (int i = 0; i < that->alternatives()->length(); i++) {
3730 EnsureAnalyzed(that->alternatives()->at(i).node());
3731 if (has_failed()) return;
3732 STATIC_FOR_EACH(Propagators::VisitChoice(that, i));
3733 }
3734 }
3735
VisitLoopChoice(LoopChoiceNode * that)3736 void VisitLoopChoice(LoopChoiceNode* that) override {
3737 DCHECK_EQ(that->alternatives()->length(), 2); // Just loop and continue.
3738
3739 // First propagate all information from the continuation node.
3740 EnsureAnalyzed(that->continue_node());
3741 if (has_failed()) return;
3742 STATIC_FOR_EACH(Propagators::VisitLoopChoiceContinueNode(that));
3743
3744 // Check the loop last since it may need the value of this node
3745 // to get a correct result.
3746 EnsureAnalyzed(that->loop_node());
3747 if (has_failed()) return;
3748 STATIC_FOR_EACH(Propagators::VisitLoopChoiceLoopNode(that));
3749 }
3750
VisitNegativeLookaroundChoice(NegativeLookaroundChoiceNode * that)3751 void VisitNegativeLookaroundChoice(
3752 NegativeLookaroundChoiceNode* that) override {
3753 DCHECK_EQ(that->alternatives()->length(), 2); // Lookaround and continue.
3754
3755 EnsureAnalyzed(that->lookaround_node());
3756 if (has_failed()) return;
3757 STATIC_FOR_EACH(
3758 Propagators::VisitNegativeLookaroundChoiceLookaroundNode(that));
3759
3760 EnsureAnalyzed(that->continue_node());
3761 if (has_failed()) return;
3762 STATIC_FOR_EACH(
3763 Propagators::VisitNegativeLookaroundChoiceContinueNode(that));
3764 }
3765
VisitBackReference(BackReferenceNode * that)3766 void VisitBackReference(BackReferenceNode* that) override {
3767 EnsureAnalyzed(that->on_success());
3768 if (has_failed()) return;
3769 STATIC_FOR_EACH(Propagators::VisitBackReference(that));
3770 }
3771
VisitAssertion(AssertionNode * that)3772 void VisitAssertion(AssertionNode* that) override {
3773 EnsureAnalyzed(that->on_success());
3774 if (has_failed()) return;
3775 STATIC_FOR_EACH(Propagators::VisitAssertion(that));
3776 }
3777
3778 #undef STATIC_FOR_EACH
3779
3780 private:
3781 Isolate* isolate_;
3782 const bool is_one_byte_;
3783 const RegExpFlags flags_;
3784 RegExpError error_;
3785
3786 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
3787 };
3788
AnalyzeRegExp(Isolate * isolate,bool is_one_byte,RegExpFlags flags,RegExpNode * node)3789 RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
3790 RegExpNode* node) {
3791 Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(
3792 isolate, is_one_byte, flags);
3793 DCHECK_EQ(node->info()->been_analyzed, false);
3794 analysis.EnsureAnalyzed(node);
3795 DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
3796 return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
3797 }
3798
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3799 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3800 BoyerMooreLookahead* bm,
3801 bool not_at_start) {
3802 // Working out the set of characters that a backreference can match is too
3803 // hard, so we just say that any character can match.
3804 bm->SetRest(offset);
3805 SaveBMInfo(bm, not_at_start, offset);
3806 }
3807
3808 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
3809 RegExpMacroAssembler::kTableSize);
3810
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3811 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
3812 BoyerMooreLookahead* bm, bool not_at_start) {
3813 ZoneList<GuardedAlternative>* alts = alternatives();
3814 budget = (budget - 1) / alts->length();
3815 for (int i = 0; i < alts->length(); i++) {
3816 GuardedAlternative& alt = alts->at(i);
3817 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
3818 bm->SetRest(offset); // Give up trying to fill in info.
3819 SaveBMInfo(bm, not_at_start, offset);
3820 return;
3821 }
3822 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
3823 }
3824 SaveBMInfo(bm, not_at_start, offset);
3825 }
3826
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)3827 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
3828 BoyerMooreLookahead* bm, bool not_at_start) {
3829 if (initial_offset >= bm->length()) return;
3830 int offset = initial_offset;
3831 int max_char = bm->max_char();
3832 for (int i = 0; i < elements()->length(); i++) {
3833 if (offset >= bm->length()) {
3834 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3835 return;
3836 }
3837 TextElement text = elements()->at(i);
3838 if (text.text_type() == TextElement::ATOM) {
3839 RegExpAtom* atom = text.atom();
3840 for (int j = 0; j < atom->length(); j++, offset++) {
3841 if (offset >= bm->length()) {
3842 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3843 return;
3844 }
3845 base::uc16 character = atom->data()[j];
3846 if (IsIgnoreCase(bm->compiler()->flags())) {
3847 unibrow::uchar chars[4];
3848 int length = GetCaseIndependentLetters(
3849 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
3850 chars, 4);
3851 for (int k = 0; k < length; k++) {
3852 bm->Set(offset, chars[k]);
3853 }
3854 } else {
3855 if (character <= max_char) bm->Set(offset, character);
3856 }
3857 }
3858 } else {
3859 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
3860 RegExpCharacterClass* char_class = text.char_class();
3861 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
3862 if (char_class->is_negated()) {
3863 bm->SetAll(offset);
3864 } else {
3865 for (int k = 0; k < ranges->length(); k++) {
3866 CharacterRange& range = ranges->at(k);
3867 if (static_cast<int>(range.from()) > max_char) continue;
3868 int to = std::min(max_char, static_cast<int>(range.to()));
3869 bm->SetInterval(offset, Interval(range.from(), to));
3870 }
3871 }
3872 offset++;
3873 }
3874 }
3875 if (offset >= bm->length()) {
3876 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3877 return;
3878 }
3879 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
3880 true); // Not at start after a text node.
3881 if (initial_offset == 0) set_bm_info(not_at_start, bm);
3882 }
3883
OptionallyStepBackToLeadSurrogate(RegExpNode * on_success)3884 RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
3885 RegExpNode* on_success) {
3886 DCHECK(!read_backward());
3887 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
3888 zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
3889 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
3890 zone(), CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
3891
3892 ChoiceNode* optional_step_back = zone()->New<ChoiceNode>(2, zone());
3893
3894 int stack_register = UnicodeLookaroundStackRegister();
3895 int position_register = UnicodeLookaroundPositionRegister();
3896 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
3897 zone(), lead_surrogates, true, on_success);
3898 RegExpLookaround::Builder builder(true, step_back, stack_register,
3899 position_register);
3900 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
3901 zone(), trail_surrogates, false, builder.on_match_success());
3902
3903 optional_step_back->AddAlternative(
3904 GuardedAlternative(builder.ForMatch(match_trail)));
3905 optional_step_back->AddAlternative(GuardedAlternative(on_success));
3906
3907 return optional_step_back;
3908 }
3909
PreprocessRegExp(RegExpCompileData * data,RegExpFlags flags,bool is_one_byte)3910 RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
3911 RegExpFlags flags,
3912 bool is_one_byte) {
3913 // Wrap the body of the regexp in capture #0.
3914 RegExpNode* captured_body =
3915 RegExpCapture::ToNode(data->tree, 0, this, accept());
3916 RegExpNode* node = captured_body;
3917 if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
3918 // Add a .*? at the beginning, outside the body capture, unless
3919 // this expression is anchored at the beginning or sticky.
3920 RegExpNode* loop_node = RegExpQuantifier::ToNode(
3921 0, RegExpTree::kInfinity, false,
3922 zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
3923 this, captured_body, data->contains_anchor);
3924
3925 if (data->contains_anchor) {
3926 // Unroll loop once, to take care of the case that might start
3927 // at the start of input.
3928 ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
3929 first_step_node->AddAlternative(GuardedAlternative(captured_body));
3930 first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
3931 zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
3932 false, loop_node)));
3933 node = first_step_node;
3934 } else {
3935 node = loop_node;
3936 }
3937 }
3938 if (is_one_byte) {
3939 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
3940 // Do it again to propagate the new nodes to places where they were not
3941 // put because they had not been calculated yet.
3942 if (node != nullptr) {
3943 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
3944 }
3945 } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
3946 node = OptionallyStepBackToLeadSurrogate(node);
3947 }
3948
3949 if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
3950 return node;
3951 }
3952
ToNodeCheckForStackOverflow()3953 void RegExpCompiler::ToNodeCheckForStackOverflow() {
3954 if (StackLimitCheck{isolate()}.HasOverflowed()) {
3955 FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
3956 }
3957 }
3958
3959 } // namespace internal
3960 } // namespace v8
3961