1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_REGEXP_JSREGEXP_H_
6 #define V8_REGEXP_JSREGEXP_H_
7
8 #include "src/allocation.h"
9 #include "src/assembler.h"
10 #include "src/isolate.h"
11 #include "src/objects/js-regexp.h"
12 #include "src/regexp/regexp-ast.h"
13 #include "src/regexp/regexp-macro-assembler.h"
14
15 namespace v8 {
16 namespace internal {
17
18 class NodeVisitor;
19 class RegExpCompiler;
20 class RegExpMacroAssembler;
21 class RegExpNode;
22 class RegExpTree;
23 class BoyerMooreLookahead;
24
IgnoreCase(JSRegExp::Flags flags)25 inline bool IgnoreCase(JSRegExp::Flags flags) {
26 return (flags & JSRegExp::kIgnoreCase) != 0;
27 }
28
IsUnicode(JSRegExp::Flags flags)29 inline bool IsUnicode(JSRegExp::Flags flags) {
30 return (flags & JSRegExp::kUnicode) != 0;
31 }
32
IsSticky(JSRegExp::Flags flags)33 inline bool IsSticky(JSRegExp::Flags flags) {
34 return (flags & JSRegExp::kSticky) != 0;
35 }
36
IsGlobal(JSRegExp::Flags flags)37 inline bool IsGlobal(JSRegExp::Flags flags) {
38 return (flags & JSRegExp::kGlobal) != 0;
39 }
40
DotAll(JSRegExp::Flags flags)41 inline bool DotAll(JSRegExp::Flags flags) {
42 return (flags & JSRegExp::kDotAll) != 0;
43 }
44
Multiline(JSRegExp::Flags flags)45 inline bool Multiline(JSRegExp::Flags flags) {
46 return (flags & JSRegExp::kMultiline) != 0;
47 }
48
NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags)49 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
50 // Both unicode and ignore_case flags are set. We need to use ICU to find
51 // the closure over case equivalents.
52 return IsUnicode(flags) && IgnoreCase(flags);
53 }
54
55 class RegExpImpl {
56 public:
57 // Whether V8 is compiled with native regexp support or not.
UsesNativeRegExp()58 static bool UsesNativeRegExp() {
59 #ifdef V8_INTERPRETED_REGEXP
60 return false;
61 #else
62 return true;
63 #endif
64 }
65
66 // Returns a string representation of a regular expression.
67 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
68 // This function calls the garbage collector if necessary.
69 static Handle<String> ToString(Handle<Object> value);
70
71 // Parses the RegExp pattern and prepares the JSRegExp object with
72 // generic data and choice of implementation - as well as what
73 // the implementation wants to store in the data field.
74 // Returns false if compilation fails.
75 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
76 Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
77 JSRegExp::Flags flags);
78
79 // See ECMA-262 section 15.10.6.2.
80 // This function calls the garbage collector if necessary.
81 V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
82 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
83 int index, Handle<RegExpMatchInfo> last_match_info);
84
85 // Prepares a JSRegExp object with Irregexp-specific data.
86 static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
87 Handle<String> pattern, JSRegExp::Flags flags,
88 int capture_register_count);
89
90 static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
91 Handle<String> pattern, JSRegExp::Flags flags,
92 Handle<String> match_pattern);
93
94 static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
95 Handle<String> subject, int index, int32_t* output,
96 int output_size);
97
98 static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
99 Handle<String> subject, int index,
100 Handle<RegExpMatchInfo> last_match_info);
101
102 enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
103
104 // Prepare a RegExp for being executed one or more times (using
105 // IrregexpExecOnce) on the subject.
106 // This ensures that the regexp is compiled for the subject, and that
107 // the subject is flat.
108 // Returns the number of integer spaces required by IrregexpExecOnce
109 // as its "registers" argument. If the regexp cannot be compiled,
110 // an exception is set as pending, and this function returns negative.
111 static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
112 Handle<String> subject);
113
114 // Execute a regular expression on the subject, starting from index.
115 // If matching succeeds, return the number of matches. This can be larger
116 // than one in the case of global regular expressions.
117 // The captures and subcaptures are stored into the registers vector.
118 // If matching fails, returns RE_FAILURE.
119 // If execution fails, sets a pending exception and returns RE_EXCEPTION.
120 static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
121 Handle<String> subject, int index, int32_t* output,
122 int output_size);
123
124 // Execute an Irregexp bytecode pattern.
125 // On a successful match, the result is a JSArray containing
126 // captured positions. On a failure, the result is the null value.
127 // Returns an empty handle in case of an exception.
128 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
129 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
130 int index, Handle<RegExpMatchInfo> last_match_info);
131
132 // Set last match info. If match is nullptr, then setting captures is
133 // omitted.
134 static Handle<RegExpMatchInfo> SetLastMatchInfo(
135 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
136 Handle<String> subject, int capture_count, int32_t* match);
137
138 class GlobalCache {
139 public:
140 GlobalCache(Handle<JSRegExp> regexp,
141 Handle<String> subject,
142 Isolate* isolate);
143
144 V8_INLINE ~GlobalCache();
145
146 // Fetch the next entry in the cache for global regexp match results.
147 // This does not set the last match info. Upon failure, nullptr is
148 // returned. The cause can be checked with Result(). The previous result is
149 // still in available in memory when a failure happens.
150 V8_INLINE int32_t* FetchNext();
151
152 V8_INLINE int32_t* LastSuccessfulMatch();
153
HasException()154 V8_INLINE bool HasException() { return num_matches_ < 0; }
155
156 private:
157 int AdvanceZeroLength(int last_index);
158
159 int num_matches_;
160 int max_matches_;
161 int current_match_index_;
162 int registers_per_match_;
163 // Pointer to the last set of captures.
164 int32_t* register_array_;
165 int register_array_size_;
166 Handle<JSRegExp> regexp_;
167 Handle<String> subject_;
168 Isolate* isolate_;
169 };
170
171 // For acting on the JSRegExp data FixedArray.
172 static int IrregexpMaxRegisterCount(FixedArray* re);
173 static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
174 static void SetIrregexpCaptureNameMap(FixedArray* re,
175 Handle<FixedArray> value);
176 static int IrregexpNumberOfCaptures(FixedArray* re);
177 static int IrregexpNumberOfRegisters(FixedArray* re);
178 static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
179 static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
180
181 // Limit the space regexps take up on the heap. In order to limit this we
182 // would like to keep track of the amount of regexp code on the heap. This
183 // is not tracked, however. As a conservative approximation we track the
184 // total regexp code compiled including code that has subsequently been freed
185 // and the total executable memory at any point.
186 static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
187 static const size_t kRegExpCompiledLimit = 1 * MB;
188 static const int kRegExpTooLargeToOptimize = 20 * KB;
189
190 private:
191 static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
192 Handle<String> sample_subject, bool is_one_byte);
193 static inline bool EnsureCompiledIrregexp(Isolate* isolate,
194 Handle<JSRegExp> re,
195 Handle<String> sample_subject,
196 bool is_one_byte);
197 };
198
199
200 // Represents the location of one element relative to the intersection of
201 // two sets. Corresponds to the four areas of a Venn diagram.
202 enum ElementInSetsRelation {
203 kInsideNone = 0,
204 kInsideFirst = 1,
205 kInsideSecond = 2,
206 kInsideBoth = 3
207 };
208
209
210 // A set of unsigned integers that behaves especially well on small
211 // integers (< 32). May do zone-allocation.
212 class OutSet: public ZoneObject {
213 public:
OutSet()214 OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
215 OutSet* Extend(unsigned value, Zone* zone);
216 bool Get(unsigned value) const;
217 static const unsigned kFirstLimit = 32;
218
219 private:
220 // Destructively set a value in this set. In most cases you want
221 // to use Extend instead to ensure that only one instance exists
222 // that contains the same values.
223 void Set(unsigned value, Zone* zone);
224
225 // The successors are a list of sets that contain the same values
226 // as this set and the one more value that is not present in this
227 // set.
successors(Zone * zone)228 ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
229
OutSet(uint32_t first,ZoneList<unsigned> * remaining)230 OutSet(uint32_t first, ZoneList<unsigned>* remaining)
231 : first_(first), remaining_(remaining), successors_(nullptr) {}
232 uint32_t first_;
233 ZoneList<unsigned>* remaining_;
234 ZoneList<OutSet*>* successors_;
235 friend class Trace;
236 };
237
238
239 // A mapping from integers, specified as ranges, to a set of integers.
240 // Used for mapping character ranges to choices.
241 class DispatchTable : public ZoneObject {
242 public:
DispatchTable(Zone * zone)243 explicit DispatchTable(Zone* zone) : tree_(zone) { }
244
245 class Entry {
246 public:
Entry()247 Entry() : from_(0), to_(0), out_set_(nullptr) {}
Entry(uc32 from,uc32 to,OutSet * out_set)248 Entry(uc32 from, uc32 to, OutSet* out_set)
249 : from_(from), to_(to), out_set_(out_set) {
250 DCHECK(from <= to);
251 }
from()252 uc32 from() { return from_; }
to()253 uc32 to() { return to_; }
set_to(uc32 value)254 void set_to(uc32 value) { to_ = value; }
AddValue(int value,Zone * zone)255 void AddValue(int value, Zone* zone) {
256 out_set_ = out_set_->Extend(value, zone);
257 }
out_set()258 OutSet* out_set() { return out_set_; }
259 private:
260 uc32 from_;
261 uc32 to_;
262 OutSet* out_set_;
263 };
264
265 class Config {
266 public:
267 typedef uc32 Key;
268 typedef Entry Value;
269 static const uc32 kNoKey;
NoValue()270 static const Entry NoValue() { return Value(); }
Compare(uc32 a,uc32 b)271 static inline int Compare(uc32 a, uc32 b) {
272 if (a == b)
273 return 0;
274 else if (a < b)
275 return -1;
276 else
277 return 1;
278 }
279 };
280
281 void AddRange(CharacterRange range, int value, Zone* zone);
282 OutSet* Get(uc32 value);
283 void Dump();
284
285 template <typename Callback>
ForEach(Callback * callback)286 void ForEach(Callback* callback) {
287 return tree()->ForEach(callback);
288 }
289
290 private:
291 // There can't be a static empty set since it allocates its
292 // successors in a zone and caches them.
empty()293 OutSet* empty() { return &empty_; }
294 OutSet empty_;
tree()295 ZoneSplayTree<Config>* tree() { return &tree_; }
296 ZoneSplayTree<Config> tree_;
297 };
298
299
300 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
301 class UnicodeRangeSplitter {
302 public:
303 UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
304 void Call(uc32 from, DispatchTable::Entry entry);
305
bmp()306 ZoneList<CharacterRange>* bmp() { return bmp_; }
lead_surrogates()307 ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
trail_surrogates()308 ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
non_bmp()309 ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
310
311 private:
312 static const int kBase = 0;
313 // Separate ranges into
314 static const int kBmpCodePoints = 1;
315 static const int kLeadSurrogates = 2;
316 static const int kTrailSurrogates = 3;
317 static const int kNonBmpCodePoints = 4;
318
319 Zone* zone_;
320 DispatchTable table_;
321 ZoneList<CharacterRange>* bmp_;
322 ZoneList<CharacterRange>* lead_surrogates_;
323 ZoneList<CharacterRange>* trail_surrogates_;
324 ZoneList<CharacterRange>* non_bmp_;
325 };
326
327
328 #define FOR_EACH_NODE_TYPE(VISIT) \
329 VISIT(End) \
330 VISIT(Action) \
331 VISIT(Choice) \
332 VISIT(BackReference) \
333 VISIT(Assertion) \
334 VISIT(Text)
335
336
337 class Trace;
338 struct PreloadState;
339 class GreedyLoopState;
340 class AlternativeGenerationList;
341
342 struct NodeInfo {
NodeInfoNodeInfo343 NodeInfo()
344 : being_analyzed(false),
345 been_analyzed(false),
346 follows_word_interest(false),
347 follows_newline_interest(false),
348 follows_start_interest(false),
349 at_end(false),
350 visited(false),
351 replacement_calculated(false) { }
352
353 // Returns true if the interests and assumptions of this node
354 // matches the given one.
MatchesNodeInfo355 bool Matches(NodeInfo* that) {
356 return (at_end == that->at_end) &&
357 (follows_word_interest == that->follows_word_interest) &&
358 (follows_newline_interest == that->follows_newline_interest) &&
359 (follows_start_interest == that->follows_start_interest);
360 }
361
362 // Updates the interests of this node given the interests of the
363 // node preceding it.
AddFromPrecedingNodeInfo364 void AddFromPreceding(NodeInfo* that) {
365 at_end |= that->at_end;
366 follows_word_interest |= that->follows_word_interest;
367 follows_newline_interest |= that->follows_newline_interest;
368 follows_start_interest |= that->follows_start_interest;
369 }
370
HasLookbehindNodeInfo371 bool HasLookbehind() {
372 return follows_word_interest ||
373 follows_newline_interest ||
374 follows_start_interest;
375 }
376
377 // Sets the interests of this node to include the interests of the
378 // following node.
AddFromFollowingNodeInfo379 void AddFromFollowing(NodeInfo* that) {
380 follows_word_interest |= that->follows_word_interest;
381 follows_newline_interest |= that->follows_newline_interest;
382 follows_start_interest |= that->follows_start_interest;
383 }
384
ResetCompilationStateNodeInfo385 void ResetCompilationState() {
386 being_analyzed = false;
387 been_analyzed = false;
388 }
389
390 bool being_analyzed: 1;
391 bool been_analyzed: 1;
392
393 // These bits are set of this node has to know what the preceding
394 // character was.
395 bool follows_word_interest: 1;
396 bool follows_newline_interest: 1;
397 bool follows_start_interest: 1;
398
399 bool at_end: 1;
400 bool visited: 1;
401 bool replacement_calculated: 1;
402 };
403
404
405 // Details of a quick mask-compare check that can look ahead in the
406 // input stream.
407 class QuickCheckDetails {
408 public:
QuickCheckDetails()409 QuickCheckDetails()
410 : characters_(0),
411 mask_(0),
412 value_(0),
413 cannot_match_(false) { }
QuickCheckDetails(int characters)414 explicit QuickCheckDetails(int characters)
415 : characters_(characters),
416 mask_(0),
417 value_(0),
418 cannot_match_(false) { }
419 bool Rationalize(bool one_byte);
420 // Merge in the information from another branch of an alternation.
421 void Merge(QuickCheckDetails* other, int from_index);
422 // Advance the current position by some amount.
423 void Advance(int by, bool one_byte);
424 void Clear();
cannot_match()425 bool cannot_match() { return cannot_match_; }
set_cannot_match()426 void set_cannot_match() { cannot_match_ = true; }
427 struct Position {
PositionPosition428 Position() : mask(0), value(0), determines_perfectly(false) { }
429 uc16 mask;
430 uc16 value;
431 bool determines_perfectly;
432 };
characters()433 int characters() { return characters_; }
set_characters(int characters)434 void set_characters(int characters) { characters_ = characters; }
positions(int index)435 Position* positions(int index) {
436 DCHECK_LE(0, index);
437 DCHECK_GT(characters_, index);
438 return positions_ + index;
439 }
mask()440 uint32_t mask() { return mask_; }
value()441 uint32_t value() { return value_; }
442
443 private:
444 // How many characters do we have quick check information from. This is
445 // the same for all branches of a choice node.
446 int characters_;
447 Position positions_[4];
448 // These values are the condensate of the above array after Rationalize().
449 uint32_t mask_;
450 uint32_t value_;
451 // If set to true, there is no way this quick check can match at all.
452 // E.g., if it requires to be at the start of the input, and isn't.
453 bool cannot_match_;
454 };
455
456
457 extern int kUninitializedRegExpNodePlaceHolder;
458
459
460 class RegExpNode: public ZoneObject {
461 public:
RegExpNode(Zone * zone)462 explicit RegExpNode(Zone* zone)
463 : replacement_(nullptr),
464 on_work_list_(false),
465 trace_count_(0),
466 zone_(zone) {
467 bm_info_[0] = bm_info_[1] = nullptr;
468 }
469 virtual ~RegExpNode();
470 virtual void Accept(NodeVisitor* visitor) = 0;
471 // Generates a goto to this node or actually generates the code at this point.
472 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
473 // How many characters must this node consume at a minimum in order to
474 // succeed. If we have found at least 'still_to_find' characters that
475 // must be consumed there is no need to ask any following nodes whether
476 // they are sure to eat any more characters. The not_at_start argument is
477 // used to indicate that we know we are not at the start of the input. In
478 // this case anchored branches will always fail and can be ignored when
479 // determining how many characters are consumed on success.
480 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
481 // Emits some quick code that checks whether the preloaded characters match.
482 // Falls through on certain failure, jumps to the label on possible success.
483 // If the node cannot make a quick check it does nothing and returns false.
484 bool EmitQuickCheck(RegExpCompiler* compiler,
485 Trace* bounds_check_trace,
486 Trace* trace,
487 bool preload_has_checked_bounds,
488 Label* on_possible_success,
489 QuickCheckDetails* details_return,
490 bool fall_through_on_failure);
491 // For a given number of characters this returns a mask and a value. The
492 // next n characters are anded with the mask and compared with the value.
493 // A comparison failure indicates the node cannot match the next n characters.
494 // A comparison success indicates the node may match.
495 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
496 RegExpCompiler* compiler,
497 int characters_filled_in,
498 bool not_at_start) = 0;
499 static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
GreedyLoopTextLength()500 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
501 // Only returns the successor for a text node of length 1 that matches any
502 // character and that has no guards on it.
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)503 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
504 RegExpCompiler* compiler) {
505 return nullptr;
506 }
507
508 // Collects information on the possible code units (mod 128) that can match if
509 // we look forward. This is used for a Boyer-Moore-like string searching
510 // implementation. TODO(erikcorry): This should share more code with
511 // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
512 // the number of nodes we are willing to look at in order to create this data.
513 static const int kRecursionBudget = 200;
514 bool KeepRecursing(RegExpCompiler* compiler);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)515 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
516 BoyerMooreLookahead* bm, bool not_at_start) {
517 UNREACHABLE();
518 }
519
520 // If we know that the input is one-byte then there are some nodes that can
521 // never match. This method returns a node that can be substituted for
522 // itself, or nullptr if the node can never match.
FilterOneByte(int depth)523 virtual RegExpNode* FilterOneByte(int depth) { return this; }
524 // Helper for FilterOneByte.
replacement()525 RegExpNode* replacement() {
526 DCHECK(info()->replacement_calculated);
527 return replacement_;
528 }
set_replacement(RegExpNode * replacement)529 RegExpNode* set_replacement(RegExpNode* replacement) {
530 info()->replacement_calculated = true;
531 replacement_ = replacement;
532 return replacement; // For convenience.
533 }
534
535 // We want to avoid recalculating the lookahead info, so we store it on the
536 // node. Only info that is for this node is stored. We can tell that the
537 // info is for this node when offset == 0, so the information is calculated
538 // relative to this node.
SaveBMInfo(BoyerMooreLookahead * bm,bool not_at_start,int offset)539 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
540 if (offset == 0) set_bm_info(not_at_start, bm);
541 }
542
label()543 Label* label() { return &label_; }
544 // If non-generic code is generated for a node (i.e. the node is not at the
545 // start of the trace) then it cannot be reused. This variable sets a limit
546 // on how often we allow that to happen before we insist on starting a new
547 // trace and generating generic code for a node that can be reused by flushing
548 // the deferred actions in the current trace and generating a goto.
549 static const int kMaxCopiesCodeGenerated = 10;
550
on_work_list()551 bool on_work_list() { return on_work_list_; }
set_on_work_list(bool value)552 void set_on_work_list(bool value) { on_work_list_ = value; }
553
info()554 NodeInfo* info() { return &info_; }
555
bm_info(bool not_at_start)556 BoyerMooreLookahead* bm_info(bool not_at_start) {
557 return bm_info_[not_at_start ? 1 : 0];
558 }
559
zone()560 Zone* zone() const { return zone_; }
561
562 protected:
563 enum LimitResult { DONE, CONTINUE };
564 RegExpNode* replacement_;
565
566 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
567
set_bm_info(bool not_at_start,BoyerMooreLookahead * bm)568 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
569 bm_info_[not_at_start ? 1 : 0] = bm;
570 }
571
572 private:
573 static const int kFirstCharBudget = 10;
574 Label label_;
575 bool on_work_list_;
576 NodeInfo info_;
577 // This variable keeps track of how many times code has been generated for
578 // this node (in different traces). We don't keep track of where the
579 // generated code is located unless the code is generated at the start of
580 // a trace, in which case it is generic and can be reused by flushing the
581 // deferred operations in the current trace and generating a goto.
582 int trace_count_;
583 BoyerMooreLookahead* bm_info_[2];
584
585 Zone* zone_;
586 };
587
588
589 class SeqRegExpNode: public RegExpNode {
590 public:
SeqRegExpNode(RegExpNode * on_success)591 explicit SeqRegExpNode(RegExpNode* on_success)
592 : RegExpNode(on_success->zone()), on_success_(on_success) { }
on_success()593 RegExpNode* on_success() { return on_success_; }
set_on_success(RegExpNode * node)594 void set_on_success(RegExpNode* node) { on_success_ = node; }
595 virtual RegExpNode* FilterOneByte(int depth);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)596 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
597 BoyerMooreLookahead* bm, bool not_at_start) {
598 on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
599 if (offset == 0) set_bm_info(not_at_start, bm);
600 }
601
602 protected:
603 RegExpNode* FilterSuccessor(int depth);
604
605 private:
606 RegExpNode* on_success_;
607 };
608
609
610 class ActionNode: public SeqRegExpNode {
611 public:
612 enum ActionType {
613 SET_REGISTER,
614 INCREMENT_REGISTER,
615 STORE_POSITION,
616 BEGIN_SUBMATCH,
617 POSITIVE_SUBMATCH_SUCCESS,
618 EMPTY_MATCH_CHECK,
619 CLEAR_CAPTURES
620 };
621 static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
622 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
623 static ActionNode* StorePosition(int reg,
624 bool is_capture,
625 RegExpNode* on_success);
626 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
627 static ActionNode* BeginSubmatch(int stack_pointer_reg,
628 int position_reg,
629 RegExpNode* on_success);
630 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
631 int restore_reg,
632 int clear_capture_count,
633 int clear_capture_from,
634 RegExpNode* on_success);
635 static ActionNode* EmptyMatchCheck(int start_register,
636 int repetition_register,
637 int repetition_limit,
638 RegExpNode* on_success);
639 virtual void Accept(NodeVisitor* visitor);
640 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
641 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)642 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
643 RegExpCompiler* compiler,
644 int filled_in,
645 bool not_at_start) {
646 return on_success()->GetQuickCheckDetails(
647 details, compiler, filled_in, not_at_start);
648 }
649 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
650 BoyerMooreLookahead* bm, bool not_at_start);
action_type()651 ActionType action_type() { return action_type_; }
652 // TODO(erikcorry): We should allow some action nodes in greedy loops.
GreedyLoopTextLength()653 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
654
655 private:
656 union {
657 struct {
658 int reg;
659 int value;
660 } u_store_register;
661 struct {
662 int reg;
663 } u_increment_register;
664 struct {
665 int reg;
666 bool is_capture;
667 } u_position_register;
668 struct {
669 int stack_pointer_register;
670 int current_position_register;
671 int clear_register_count;
672 int clear_register_from;
673 } u_submatch;
674 struct {
675 int start_register;
676 int repetition_register;
677 int repetition_limit;
678 } u_empty_match_check;
679 struct {
680 int range_from;
681 int range_to;
682 } u_clear_captures;
683 } data_;
ActionNode(ActionType action_type,RegExpNode * on_success)684 ActionNode(ActionType action_type, RegExpNode* on_success)
685 : SeqRegExpNode(on_success),
686 action_type_(action_type) { }
687 ActionType action_type_;
688 friend class DotPrinter;
689 };
690
691
692 class TextNode: public SeqRegExpNode {
693 public:
TextNode(ZoneList<TextElement> * elms,bool read_backward,RegExpNode * on_success)694 TextNode(ZoneList<TextElement>* elms, bool read_backward,
695 RegExpNode* on_success)
696 : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
TextNode(RegExpCharacterClass * that,bool read_backward,RegExpNode * on_success)697 TextNode(RegExpCharacterClass* that, bool read_backward,
698 RegExpNode* on_success)
699 : SeqRegExpNode(on_success),
700 elms_(new (zone()) ZoneList<TextElement>(1, zone())),
701 read_backward_(read_backward) {
702 elms_->Add(TextElement::CharClass(that), zone());
703 }
704 // Create TextNode for a single character class for the given ranges.
705 static TextNode* CreateForCharacterRanges(Zone* zone,
706 ZoneList<CharacterRange>* ranges,
707 bool read_backward,
708 RegExpNode* on_success,
709 JSRegExp::Flags flags);
710 // Create TextNode for a surrogate pair with a range given for the
711 // lead and the trail surrogate each.
712 static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
713 CharacterRange trail,
714 bool read_backward,
715 RegExpNode* on_success,
716 JSRegExp::Flags flags);
717 virtual void Accept(NodeVisitor* visitor);
718 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
719 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
720 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
721 RegExpCompiler* compiler,
722 int characters_filled_in,
723 bool not_at_start);
elements()724 ZoneList<TextElement>* elements() { return elms_; }
read_backward()725 bool read_backward() { return read_backward_; }
726 void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
727 virtual int GreedyLoopTextLength();
728 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
729 RegExpCompiler* compiler);
730 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
731 BoyerMooreLookahead* bm, bool not_at_start);
732 void CalculateOffsets();
733 virtual RegExpNode* FilterOneByte(int depth);
734
735 private:
736 enum TextEmitPassType {
737 NON_LATIN1_MATCH, // Check for characters that can't match.
738 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
739 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
740 CASE_CHARACTER_MATCH, // Case-independent single character check.
741 CHARACTER_CLASS_MATCH // Character class.
742 };
743 static bool SkipPass(TextEmitPassType pass, bool ignore_case);
744 static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
745 static const int kLastPass = CHARACTER_CLASS_MATCH;
746 void TextEmitPass(RegExpCompiler* compiler,
747 TextEmitPassType pass,
748 bool preloaded,
749 Trace* trace,
750 bool first_element_checked,
751 int* checked_up_to);
752 int Length();
753 ZoneList<TextElement>* elms_;
754 bool read_backward_;
755 };
756
757
758 class AssertionNode: public SeqRegExpNode {
759 public:
760 enum AssertionType {
761 AT_END,
762 AT_START,
763 AT_BOUNDARY,
764 AT_NON_BOUNDARY,
765 AFTER_NEWLINE
766 };
AtEnd(RegExpNode * on_success)767 static AssertionNode* AtEnd(RegExpNode* on_success) {
768 return new(on_success->zone()) AssertionNode(AT_END, on_success);
769 }
AtStart(RegExpNode * on_success)770 static AssertionNode* AtStart(RegExpNode* on_success) {
771 return new(on_success->zone()) AssertionNode(AT_START, on_success);
772 }
AtBoundary(RegExpNode * on_success)773 static AssertionNode* AtBoundary(RegExpNode* on_success) {
774 return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
775 }
AtNonBoundary(RegExpNode * on_success)776 static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
777 return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
778 }
AfterNewline(RegExpNode * on_success)779 static AssertionNode* AfterNewline(RegExpNode* on_success) {
780 return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
781 }
782 virtual void Accept(NodeVisitor* visitor);
783 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
784 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
785 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
786 RegExpCompiler* compiler,
787 int filled_in,
788 bool not_at_start);
789 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
790 BoyerMooreLookahead* bm, bool not_at_start);
assertion_type()791 AssertionType assertion_type() { return assertion_type_; }
792
793 private:
794 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
795 enum IfPrevious { kIsNonWord, kIsWord };
796 void BacktrackIfPrevious(RegExpCompiler* compiler,
797 Trace* trace,
798 IfPrevious backtrack_if_previous);
AssertionNode(AssertionType t,RegExpNode * on_success)799 AssertionNode(AssertionType t, RegExpNode* on_success)
800 : SeqRegExpNode(on_success), assertion_type_(t) { }
801 AssertionType assertion_type_;
802 };
803
804
805 class BackReferenceNode: public SeqRegExpNode {
806 public:
BackReferenceNode(int start_reg,int end_reg,JSRegExp::Flags flags,bool read_backward,RegExpNode * on_success)807 BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
808 bool read_backward, RegExpNode* on_success)
809 : SeqRegExpNode(on_success),
810 start_reg_(start_reg),
811 end_reg_(end_reg),
812 flags_(flags),
813 read_backward_(read_backward) {}
814 virtual void Accept(NodeVisitor* visitor);
start_register()815 int start_register() { return start_reg_; }
end_register()816 int end_register() { return end_reg_; }
read_backward()817 bool read_backward() { return read_backward_; }
818 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
819 virtual int EatsAtLeast(int still_to_find,
820 int recursion_depth,
821 bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)822 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
823 RegExpCompiler* compiler,
824 int characters_filled_in,
825 bool not_at_start) {
826 return;
827 }
828 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
829 BoyerMooreLookahead* bm, bool not_at_start);
830
831 private:
832 int start_reg_;
833 int end_reg_;
834 JSRegExp::Flags flags_;
835 bool read_backward_;
836 };
837
838
839 class EndNode: public RegExpNode {
840 public:
841 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
EndNode(Action action,Zone * zone)842 EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
843 virtual void Accept(NodeVisitor* visitor);
844 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
EatsAtLeast(int still_to_find,int recursion_depth,bool not_at_start)845 virtual int EatsAtLeast(int still_to_find,
846 int recursion_depth,
847 bool not_at_start) { return 0; }
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)848 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
849 RegExpCompiler* compiler,
850 int characters_filled_in,
851 bool not_at_start) {
852 // Returning 0 from EatsAtLeast should ensure we never get here.
853 UNREACHABLE();
854 }
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)855 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
856 BoyerMooreLookahead* bm, bool not_at_start) {
857 // Returning 0 from EatsAtLeast should ensure we never get here.
858 UNREACHABLE();
859 }
860
861 private:
862 Action action_;
863 };
864
865
866 class NegativeSubmatchSuccess: public EndNode {
867 public:
NegativeSubmatchSuccess(int stack_pointer_reg,int position_reg,int clear_capture_count,int clear_capture_start,Zone * zone)868 NegativeSubmatchSuccess(int stack_pointer_reg,
869 int position_reg,
870 int clear_capture_count,
871 int clear_capture_start,
872 Zone* zone)
873 : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
874 stack_pointer_register_(stack_pointer_reg),
875 current_position_register_(position_reg),
876 clear_capture_count_(clear_capture_count),
877 clear_capture_start_(clear_capture_start) { }
878 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
879
880 private:
881 int stack_pointer_register_;
882 int current_position_register_;
883 int clear_capture_count_;
884 int clear_capture_start_;
885 };
886
887
888 class Guard: public ZoneObject {
889 public:
890 enum Relation { LT, GEQ };
Guard(int reg,Relation op,int value)891 Guard(int reg, Relation op, int value)
892 : reg_(reg),
893 op_(op),
894 value_(value) { }
reg()895 int reg() { return reg_; }
op()896 Relation op() { return op_; }
value()897 int value() { return value_; }
898
899 private:
900 int reg_;
901 Relation op_;
902 int value_;
903 };
904
905
906 class GuardedAlternative {
907 public:
GuardedAlternative(RegExpNode * node)908 explicit GuardedAlternative(RegExpNode* node)
909 : node_(node), guards_(nullptr) {}
910 void AddGuard(Guard* guard, Zone* zone);
node()911 RegExpNode* node() { return node_; }
set_node(RegExpNode * node)912 void set_node(RegExpNode* node) { node_ = node; }
guards()913 ZoneList<Guard*>* guards() { return guards_; }
914
915 private:
916 RegExpNode* node_;
917 ZoneList<Guard*>* guards_;
918 };
919
920
921 class AlternativeGeneration;
922
923
924 class ChoiceNode: public RegExpNode {
925 public:
ChoiceNode(int expected_size,Zone * zone)926 explicit ChoiceNode(int expected_size, Zone* zone)
927 : RegExpNode(zone),
928 alternatives_(new (zone)
929 ZoneList<GuardedAlternative>(expected_size, zone)),
930 table_(nullptr),
931 not_at_start_(false),
932 being_calculated_(false) {}
933 virtual void Accept(NodeVisitor* visitor);
AddAlternative(GuardedAlternative node)934 void AddAlternative(GuardedAlternative node) {
935 alternatives()->Add(node, zone());
936 }
alternatives()937 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
938 DispatchTable* GetTable(bool ignore_case);
939 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
940 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
941 int EatsAtLeastHelper(int still_to_find,
942 int budget,
943 RegExpNode* ignore_this_node,
944 bool not_at_start);
945 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
946 RegExpCompiler* compiler,
947 int characters_filled_in,
948 bool not_at_start);
949 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
950 BoyerMooreLookahead* bm, bool not_at_start);
951
being_calculated()952 bool being_calculated() { return being_calculated_; }
not_at_start()953 bool not_at_start() { return not_at_start_; }
set_not_at_start()954 void set_not_at_start() { not_at_start_ = true; }
set_being_calculated(bool b)955 void set_being_calculated(bool b) { being_calculated_ = b; }
try_to_emit_quick_check_for_alternative(bool is_first)956 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
957 return true;
958 }
959 virtual RegExpNode* FilterOneByte(int depth);
read_backward()960 virtual bool read_backward() { return false; }
961
962 protected:
963 int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
964 ZoneList<GuardedAlternative>* alternatives_;
965
966 private:
967 friend class DispatchTableConstructor;
968 friend class Analysis;
969 void GenerateGuard(RegExpMacroAssembler* macro_assembler,
970 Guard* guard,
971 Trace* trace);
972 int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
973 void EmitOutOfLineContinuation(RegExpCompiler* compiler,
974 Trace* trace,
975 GuardedAlternative alternative,
976 AlternativeGeneration* alt_gen,
977 int preload_characters,
978 bool next_expects_preload);
979 void SetUpPreLoad(RegExpCompiler* compiler,
980 Trace* current_trace,
981 PreloadState* preloads);
982 void AssertGuardsMentionRegisters(Trace* trace);
983 int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
984 Trace* EmitGreedyLoop(RegExpCompiler* compiler,
985 Trace* trace,
986 AlternativeGenerationList* alt_gens,
987 PreloadState* preloads,
988 GreedyLoopState* greedy_loop_state,
989 int text_length);
990 void EmitChoices(RegExpCompiler* compiler,
991 AlternativeGenerationList* alt_gens,
992 int first_choice,
993 Trace* trace,
994 PreloadState* preloads);
995 DispatchTable* table_;
996 // If true, this node is never checked at the start of the input.
997 // Allows a new trace to start with at_start() set to false.
998 bool not_at_start_;
999 bool being_calculated_;
1000 };
1001
1002
1003 class NegativeLookaroundChoiceNode : public ChoiceNode {
1004 public:
NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,GuardedAlternative then_do_this,Zone * zone)1005 explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
1006 GuardedAlternative then_do_this,
1007 Zone* zone)
1008 : ChoiceNode(2, zone) {
1009 AddAlternative(this_must_fail);
1010 AddAlternative(then_do_this);
1011 }
1012 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1013 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1014 RegExpCompiler* compiler,
1015 int characters_filled_in,
1016 bool not_at_start);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1017 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1018 BoyerMooreLookahead* bm, bool not_at_start) {
1019 alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
1020 not_at_start);
1021 if (offset == 0) set_bm_info(not_at_start, bm);
1022 }
1023 // For a negative lookahead we don't emit the quick check for the
1024 // alternative that is expected to fail. This is because quick check code
1025 // starts by loading enough characters for the alternative that takes fewest
1026 // characters, but on a negative lookahead the negative branch did not take
1027 // part in that calculation (EatsAtLeast) so the assumptions don't hold.
try_to_emit_quick_check_for_alternative(bool is_first)1028 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
1029 return !is_first;
1030 }
1031 virtual RegExpNode* FilterOneByte(int depth);
1032 };
1033
1034
1035 class LoopChoiceNode: public ChoiceNode {
1036 public:
LoopChoiceNode(bool body_can_be_zero_length,bool read_backward,Zone * zone)1037 LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1038 : ChoiceNode(2, zone),
1039 loop_node_(nullptr),
1040 continue_node_(nullptr),
1041 body_can_be_zero_length_(body_can_be_zero_length),
1042 read_backward_(read_backward) {}
1043 void AddLoopAlternative(GuardedAlternative alt);
1044 void AddContinueAlternative(GuardedAlternative alt);
1045 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1046 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1047 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1048 RegExpCompiler* compiler,
1049 int characters_filled_in,
1050 bool not_at_start);
1051 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1052 BoyerMooreLookahead* bm, bool not_at_start);
loop_node()1053 RegExpNode* loop_node() { return loop_node_; }
continue_node()1054 RegExpNode* continue_node() { return continue_node_; }
body_can_be_zero_length()1055 bool body_can_be_zero_length() { return body_can_be_zero_length_; }
read_backward()1056 virtual bool read_backward() { return read_backward_; }
1057 virtual void Accept(NodeVisitor* visitor);
1058 virtual RegExpNode* FilterOneByte(int depth);
1059
1060 private:
1061 // AddAlternative is made private for loop nodes because alternatives
1062 // should not be added freely, we need to keep track of which node
1063 // goes back to the node itself.
AddAlternative(GuardedAlternative node)1064 void AddAlternative(GuardedAlternative node) {
1065 ChoiceNode::AddAlternative(node);
1066 }
1067
1068 RegExpNode* loop_node_;
1069 RegExpNode* continue_node_;
1070 bool body_can_be_zero_length_;
1071 bool read_backward_;
1072 };
1073
1074
1075 // Improve the speed that we scan for an initial point where a non-anchored
1076 // regexp can match by using a Boyer-Moore-like table. This is done by
1077 // identifying non-greedy non-capturing loops in the nodes that eat any
1078 // character one at a time. For example in the middle of the regexp
1079 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1080 // inserted at the start of any non-anchored regexp.
1081 //
1082 // When we have found such a loop we look ahead in the nodes to find the set of
1083 // characters that can come at given distances. For example for the regexp
1084 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1085 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1086 // the lookahead info where the set of characters is reasonably constrained. In
1087 // our example this is from index 1 to 2 (0 is not constrained). We can now
1088 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1089 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1090 //
1091 // For Unicode input strings we do the same, but modulo 128.
1092 //
1093 // We also look at the first string fed to the regexp and use that to get a hint
1094 // of the character frequencies in the inputs. This affects the assessment of
1095 // whether the set of characters is 'reasonably constrained'.
1096 //
1097 // We also have another lookahead mechanism (called quick check in the code),
1098 // which uses a wide load of multiple characters followed by a mask and compare
1099 // to determine whether a match is possible at this point.
1100 enum ContainedInLattice {
1101 kNotYet = 0,
1102 kLatticeIn = 1,
1103 kLatticeOut = 2,
1104 kLatticeUnknown = 3 // Can also mean both in and out.
1105 };
1106
1107
Combine(ContainedInLattice a,ContainedInLattice b)1108 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1109 return static_cast<ContainedInLattice>(a | b);
1110 }
1111
1112
1113 ContainedInLattice AddRange(ContainedInLattice a,
1114 const int* ranges,
1115 int ranges_size,
1116 Interval new_range);
1117
1118
1119 class BoyerMoorePositionInfo : public ZoneObject {
1120 public:
BoyerMoorePositionInfo(Zone * zone)1121 explicit BoyerMoorePositionInfo(Zone* zone)
1122 : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1123 map_count_(0),
1124 w_(kNotYet),
1125 s_(kNotYet),
1126 d_(kNotYet),
1127 surrogate_(kNotYet) {
1128 for (int i = 0; i < kMapSize; i++) {
1129 map_->Add(false, zone);
1130 }
1131 }
1132
at(int i)1133 bool& at(int i) { return map_->at(i); }
1134
1135 static const int kMapSize = 128;
1136 static const int kMask = kMapSize - 1;
1137
map_count()1138 int map_count() const { return map_count_; }
1139
1140 void Set(int character);
1141 void SetInterval(const Interval& interval);
1142 void SetAll();
is_non_word()1143 bool is_non_word() { return w_ == kLatticeOut; }
is_word()1144 bool is_word() { return w_ == kLatticeIn; }
1145
1146 private:
1147 ZoneList<bool>* map_;
1148 int map_count_; // Number of set bits in the map.
1149 ContainedInLattice w_; // The \w character class.
1150 ContainedInLattice s_; // The \s character class.
1151 ContainedInLattice d_; // The \d character class.
1152 ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1153 };
1154
1155
1156 class BoyerMooreLookahead : public ZoneObject {
1157 public:
1158 BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1159
length()1160 int length() { return length_; }
max_char()1161 int max_char() { return max_char_; }
compiler()1162 RegExpCompiler* compiler() { return compiler_; }
1163
Count(int map_number)1164 int Count(int map_number) {
1165 return bitmaps_->at(map_number)->map_count();
1166 }
1167
at(int i)1168 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1169
Set(int map_number,int character)1170 void Set(int map_number, int character) {
1171 if (character > max_char_) return;
1172 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1173 info->Set(character);
1174 }
1175
SetInterval(int map_number,const Interval & interval)1176 void SetInterval(int map_number, const Interval& interval) {
1177 if (interval.from() > max_char_) return;
1178 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1179 if (interval.to() > max_char_) {
1180 info->SetInterval(Interval(interval.from(), max_char_));
1181 } else {
1182 info->SetInterval(interval);
1183 }
1184 }
1185
SetAll(int map_number)1186 void SetAll(int map_number) {
1187 bitmaps_->at(map_number)->SetAll();
1188 }
1189
SetRest(int from_map)1190 void SetRest(int from_map) {
1191 for (int i = from_map; i < length_; i++) SetAll(i);
1192 }
1193 void EmitSkipInstructions(RegExpMacroAssembler* masm);
1194
1195 private:
1196 // This is the value obtained by EatsAtLeast. If we do not have at least this
1197 // many characters left in the sample string then the match is bound to fail.
1198 // Therefore it is OK to read a character this far ahead of the current match
1199 // point.
1200 int length_;
1201 RegExpCompiler* compiler_;
1202 // 0xff for Latin1, 0xffff for UTF-16.
1203 int max_char_;
1204 ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1205
1206 int GetSkipTable(int min_lookahead,
1207 int max_lookahead,
1208 Handle<ByteArray> boolean_skip_table);
1209 bool FindWorthwhileInterval(int* from, int* to);
1210 int FindBestInterval(
1211 int max_number_of_chars, int old_biggest_points, int* from, int* to);
1212 };
1213
1214
1215 // There are many ways to generate code for a node. This class encapsulates
1216 // the current way we should be generating. In other words it encapsulates
1217 // the current state of the code generator. The effect of this is that we
1218 // generate code for paths that the matcher can take through the regular
1219 // expression. A given node in the regexp can be code-generated several times
1220 // as it can be part of several traces. For example for the regexp:
1221 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1222 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1223 // to match foo is generated only once (the traces have a common prefix). The
1224 // code to store the capture is deferred and generated (twice) after the places
1225 // where baz has been matched.
1226 class Trace {
1227 public:
1228 // A value for a property that is either known to be true, know to be false,
1229 // or not known.
1230 enum TriBool {
1231 UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1232 };
1233
1234 class DeferredAction {
1235 public:
DeferredAction(ActionNode::ActionType action_type,int reg)1236 DeferredAction(ActionNode::ActionType action_type, int reg)
1237 : action_type_(action_type), reg_(reg), next_(nullptr) {}
next()1238 DeferredAction* next() { return next_; }
1239 bool Mentions(int reg);
reg()1240 int reg() { return reg_; }
action_type()1241 ActionNode::ActionType action_type() { return action_type_; }
1242 private:
1243 ActionNode::ActionType action_type_;
1244 int reg_;
1245 DeferredAction* next_;
1246 friend class Trace;
1247 };
1248
1249 class DeferredCapture : public DeferredAction {
1250 public:
DeferredCapture(int reg,bool is_capture,Trace * trace)1251 DeferredCapture(int reg, bool is_capture, Trace* trace)
1252 : DeferredAction(ActionNode::STORE_POSITION, reg),
1253 cp_offset_(trace->cp_offset()),
1254 is_capture_(is_capture) { }
cp_offset()1255 int cp_offset() { return cp_offset_; }
is_capture()1256 bool is_capture() { return is_capture_; }
1257 private:
1258 int cp_offset_;
1259 bool is_capture_;
set_cp_offset(int cp_offset)1260 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1261 };
1262
1263 class DeferredSetRegister : public DeferredAction {
1264 public:
DeferredSetRegister(int reg,int value)1265 DeferredSetRegister(int reg, int value)
1266 : DeferredAction(ActionNode::SET_REGISTER, reg),
1267 value_(value) { }
value()1268 int value() { return value_; }
1269 private:
1270 int value_;
1271 };
1272
1273 class DeferredClearCaptures : public DeferredAction {
1274 public:
DeferredClearCaptures(Interval range)1275 explicit DeferredClearCaptures(Interval range)
1276 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1277 range_(range) { }
range()1278 Interval range() { return range_; }
1279 private:
1280 Interval range_;
1281 };
1282
1283 class DeferredIncrementRegister : public DeferredAction {
1284 public:
DeferredIncrementRegister(int reg)1285 explicit DeferredIncrementRegister(int reg)
1286 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1287 };
1288
Trace()1289 Trace()
1290 : cp_offset_(0),
1291 actions_(nullptr),
1292 backtrack_(nullptr),
1293 stop_node_(nullptr),
1294 loop_label_(nullptr),
1295 characters_preloaded_(0),
1296 bound_checked_up_to_(0),
1297 flush_budget_(100),
1298 at_start_(UNKNOWN) {}
1299
1300 // End the trace. This involves flushing the deferred actions in the trace
1301 // and pushing a backtrack location onto the backtrack stack. Once this is
1302 // done we can start a new trace or go to one that has already been
1303 // generated.
1304 void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()1305 int cp_offset() { return cp_offset_; }
actions()1306 DeferredAction* actions() { return actions_; }
1307 // A trivial trace is one that has no deferred actions or other state that
1308 // affects the assumptions used when generating code. There is no recorded
1309 // backtrack location in a trivial trace, so with a trivial trace we will
1310 // generate code that, on a failure to match, gets the backtrack location
1311 // from the backtrack stack rather than using a direct jump instruction. We
1312 // always start code generation with a trivial trace and non-trivial traces
1313 // are created as we emit code for nodes or add to the list of deferred
1314 // actions in the trace. The location of the code generated for a node using
1315 // a trivial trace is recorded in a label in the node so that gotos can be
1316 // generated to that code.
is_trivial()1317 bool is_trivial() {
1318 return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1319 characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1320 quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1321 }
at_start()1322 TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)1323 void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()1324 Label* backtrack() { return backtrack_; }
loop_label()1325 Label* loop_label() { return loop_label_; }
stop_node()1326 RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()1327 int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()1328 int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()1329 int flush_budget() { return flush_budget_; }
quick_check_performed()1330 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1331 bool mentions_reg(int reg);
1332 // Returns true if a deferred position store exists to the specified
1333 // register and stores the offset in the out-parameter. Otherwise
1334 // returns false.
1335 bool GetStoredPosition(int reg, int* cp_offset);
1336 // These set methods and AdvanceCurrentPositionInTrace should be used only on
1337 // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction * new_action)1338 void add_action(DeferredAction* new_action) {
1339 DCHECK(new_action->next_ == nullptr);
1340 new_action->next_ = actions_;
1341 actions_ = new_action;
1342 }
set_backtrack(Label * backtrack)1343 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode * node)1344 void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label * label)1345 void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)1346 void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)1347 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)1348 void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails * d)1349 void set_quick_check_performed(QuickCheckDetails* d) {
1350 quick_check_performed_ = *d;
1351 }
1352 void InvalidateCurrentCharacter();
1353 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1354
1355 private:
1356 int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1357 void PerformDeferredActions(RegExpMacroAssembler* macro,
1358 int max_register,
1359 const OutSet& affected_registers,
1360 OutSet* registers_to_pop,
1361 OutSet* registers_to_clear,
1362 Zone* zone);
1363 void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1364 int max_register,
1365 const OutSet& registers_to_pop,
1366 const OutSet& registers_to_clear);
1367 int cp_offset_;
1368 DeferredAction* actions_;
1369 Label* backtrack_;
1370 RegExpNode* stop_node_;
1371 Label* loop_label_;
1372 int characters_preloaded_;
1373 int bound_checked_up_to_;
1374 QuickCheckDetails quick_check_performed_;
1375 int flush_budget_;
1376 TriBool at_start_;
1377 };
1378
1379
1380 class GreedyLoopState {
1381 public:
1382 explicit GreedyLoopState(bool not_at_start);
1383
label()1384 Label* label() { return &label_; }
counter_backtrack_trace()1385 Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1386
1387 private:
1388 Label label_;
1389 Trace counter_backtrack_trace_;
1390 };
1391
1392
1393 struct PreloadState {
1394 static const int kEatsAtLeastNotYetInitialized = -1;
1395 bool preload_is_current_;
1396 bool preload_has_checked_bounds_;
1397 int preload_characters_;
1398 int eats_at_least_;
initPreloadState1399 void init() {
1400 eats_at_least_ = kEatsAtLeastNotYetInitialized;
1401 }
1402 };
1403
1404
1405 class NodeVisitor {
1406 public:
~NodeVisitor()1407 virtual ~NodeVisitor() { }
1408 #define DECLARE_VISIT(Type) \
1409 virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1410 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1411 #undef DECLARE_VISIT
1412 virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1413 };
1414
1415
1416 // Node visitor used to add the start set of the alternatives to the
1417 // dispatch table of a choice node.
1418 class DispatchTableConstructor: public NodeVisitor {
1419 public:
DispatchTableConstructor(DispatchTable * table,bool ignore_case,Zone * zone)1420 DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1421 Zone* zone)
1422 : table_(table),
1423 choice_index_(-1),
1424 ignore_case_(ignore_case),
1425 zone_(zone) { }
1426
1427 void BuildTable(ChoiceNode* node);
1428
AddRange(CharacterRange range)1429 void AddRange(CharacterRange range) {
1430 table()->AddRange(range, choice_index_, zone_);
1431 }
1432
1433 void AddInverse(ZoneList<CharacterRange>* ranges);
1434
1435 #define DECLARE_VISIT(Type) \
1436 virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1437 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1438 #undef DECLARE_VISIT
1439
1440 DispatchTable* table() { return table_; }
set_choice_index(int value)1441 void set_choice_index(int value) { choice_index_ = value; }
1442
1443 protected:
1444 DispatchTable* table_;
1445 int choice_index_;
1446 bool ignore_case_;
1447 Zone* zone_;
1448 };
1449
1450
1451 // Assertion propagation moves information about assertions such as
1452 // \b to the affected nodes. For instance, in /.\b./ information must
1453 // be propagated to the first '.' that whatever follows needs to know
1454 // if it matched a word or a non-word, and to the second '.' that it
1455 // has to check if it succeeds a word or non-word. In this case the
1456 // result will be something like:
1457 //
1458 // +-------+ +------------+
1459 // | . | | . |
1460 // +-------+ ---> +------------+
1461 // | word? | | check word |
1462 // +-------+ +------------+
1463 class Analysis: public NodeVisitor {
1464 public:
Analysis(Isolate * isolate,bool is_one_byte)1465 Analysis(Isolate* isolate, bool is_one_byte)
1466 : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
1467 void EnsureAnalyzed(RegExpNode* node);
1468
1469 #define DECLARE_VISIT(Type) \
1470 virtual void Visit##Type(Type##Node* that);
1471 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1472 #undef DECLARE_VISIT
1473 virtual void VisitLoopChoice(LoopChoiceNode* that);
1474
has_failed()1475 bool has_failed() { return error_message_ != nullptr; }
error_message()1476 const char* error_message() {
1477 DCHECK(error_message_ != nullptr);
1478 return error_message_;
1479 }
fail(const char * error_message)1480 void fail(const char* error_message) {
1481 error_message_ = error_message;
1482 }
1483
isolate()1484 Isolate* isolate() const { return isolate_; }
1485
1486 private:
1487 Isolate* isolate_;
1488 bool is_one_byte_;
1489 const char* error_message_;
1490
1491 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1492 };
1493
1494
1495 struct RegExpCompileData {
RegExpCompileDataRegExpCompileData1496 RegExpCompileData()
1497 : tree(nullptr),
1498 node(nullptr),
1499 simple(true),
1500 contains_anchor(false),
1501 capture_count(0) {}
1502 RegExpTree* tree;
1503 RegExpNode* node;
1504 bool simple;
1505 bool contains_anchor;
1506 Handle<FixedArray> capture_name_map;
1507 Handle<String> error;
1508 int capture_count;
1509 };
1510
1511
1512 class RegExpEngine: public AllStatic {
1513 public:
1514 struct CompilationResult {
CompilationResultCompilationResult1515 CompilationResult(Isolate* isolate, const char* error_message)
1516 : error_message(error_message),
1517 code(ReadOnlyRoots(isolate).the_hole_value()),
1518 num_registers(0) {}
CompilationResultCompilationResult1519 CompilationResult(Object* code, int registers)
1520 : error_message(nullptr), code(code), num_registers(registers) {}
1521 const char* error_message;
1522 Object* code;
1523 int num_registers;
1524 };
1525
1526 static CompilationResult Compile(Isolate* isolate, Zone* zone,
1527 RegExpCompileData* input,
1528 JSRegExp::Flags flags,
1529 Handle<String> pattern,
1530 Handle<String> sample_subject,
1531 bool is_one_byte);
1532
1533 static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
1534
1535 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1536 };
1537
1538
1539 class RegExpResultsCache : public AllStatic {
1540 public:
1541 enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1542
1543 // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1544 // On success, the returned result is guaranteed to be a COW-array.
1545 static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1546 FixedArray** last_match_out, ResultsCacheType type);
1547 // Attempt to add value_array to the cache specified by type. On success,
1548 // value_array is turned into a COW-array.
1549 static void Enter(Isolate* isolate, Handle<String> key_string,
1550 Handle<Object> key_pattern, Handle<FixedArray> value_array,
1551 Handle<FixedArray> last_match_cache, ResultsCacheType type);
1552 static void Clear(FixedArray* cache);
1553 static const int kRegExpResultsCacheSize = 0x100;
1554
1555 private:
1556 static const int kArrayEntriesPerCacheEntry = 4;
1557 static const int kStringOffset = 0;
1558 static const int kPatternOffset = 1;
1559 static const int kArrayOffset = 2;
1560 static const int kLastMatchOffset = 3;
1561 };
1562
1563 } // namespace internal
1564 } // namespace v8
1565
1566 #endif // V8_REGEXP_JSREGEXP_H_
1567