// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/base/strings.h"
#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu;
#ifdef DEBUG
constexpr base::uc32 kMaxCodePoint = 0x10ffff;
#endif  // DEBUG

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(RegExpFlags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr RegExpFlags kAllowedFlags =
        RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline |
        RegExpFlag::kDotAll | RegExpFlag::kLinear;
    // We support Unicode iff kUnicode is among the supported flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  IsUnicode(kAllowedFlags));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt with
    // by replicating the bytecode of the body of the quantifier.  The number
    // of replications grows exponentially in how deeply quantifiers are
    // nested.  `replication_factor_` keeps track of how often the current
    // node will have to be replicated in the generated bytecode, and we don't
    // allow this to exceed some small value.
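    // For example, /(a{4}){4}/ needs 4 * 4 = 16 copies of the innermost
    // body and is accepted, while /((a{4}){4}){4}/ would need 64 and is
    // rejected.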
    static constexpr int kMaxReplicationFactor = 16;

    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_.  This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

    int local_replication;
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode. Re2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible I
    // think.  See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              RegExpFlags flags,
                                              int capture_count) {
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace {

// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
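// For example: if FORK instructions at indices 2 and 5 both target the same
// unbound label, the label's unbound_patch_list_begin_ is 5,
// code_[5].payload.pc is 2, and code_[2].payload.pc is -1 (the list end).
// Binding the label at index 7 walks this list and patches both payloads
// to 7.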
struct Label {
 public:
  Label() = default;
  ~Label() {
    DCHECK_EQ(state_, BOUND);
    DCHECK_GE(bound_index_, 0);
  }

  // Don't copy, don't move.  Moving could be implemented, but it's not
  // needed anywhere.
  Label(const Label&) = delete;
  Label& operator=(const Label&) = delete;

 private:
  friend class BytecodeAssembler;

  // UNBOUND implies unbound_patch_list_begin_.
  // BOUND implies bound_index_.
  enum { UNBOUND, BOUND } state_ = UNBOUND;
  union {
    int unbound_patch_list_begin_ = -1;
    int bound_index_;
  };
};

class BytecodeAssembler {
 public:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
  // the `tree` size we're going to compile?
  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}

  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }

  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }

  void Assertion(RegExpAssertion::Type t) {
    code_.Add(RegExpInstruction::Assertion(t), zone_);
  }

  void ClearRegister(int32_t register_index) {
    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
  }

  void ConsumeRange(base::uc16 from, base::uc16 to) {
    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
  }

  void ConsumeAnyChar() {
    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
  }

  void Fork(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
  }

  void Jmp(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
  }

  void SetRegisterToCp(int32_t register_index) {
    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
  }

  void Bind(Label& target) {
    DCHECK_EQ(target.state_, Label::UNBOUND);

    int index = code_.length();

    while (target.unbound_patch_list_begin_ != -1) {
      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);

      target.unbound_patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    target.state_ = Label::BOUND;
    target.bound_index_ = index;
  }

  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }

 private:
  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
    RegExpInstruction result;
    result.opcode = op;

    if (target.state_ == Label::BOUND) {
      result.payload.pc = target.bound_index_;
    } else {
      DCHECK_EQ(target.state_, Label::UNBOUND);
      int new_list_begin = code_.length();
      DCHECK_GE(new_list_begin, 0);

      result.payload.pc = target.unbound_patch_list_begin_;

      target.unbound_patch_list_begin_ = new_list_begin;
    }

    code_.Add(result, zone_);
  }

  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             RegExpFlags flags, Zone* zone) {
    CompileVisitor compiler(zone);

    if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so we
      // emit a preamble corresponding to /.*?/.  This skips an arbitrary
      // prefix in the input non-greedily.
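      // The preamble generated by CompileNonGreedyStar is roughly:
      //
      //     FORK body
      //     JMP end
      //   body:
      //     CONSUME_ANY_CHAR
      //     FORK body
      //   end: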
      compiler.CompileNonGreedyStar(
          [&]() { compiler.assembler_.ConsumeAnyChar(); });
    }

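    // Registers 0 and 1 record the start and end of the overall match,
    // i.e. the implicit capture group 0.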
    compiler.assembler_.SetRegisterToCp(0);
    tree->Accept(&compiler, nullptr);
    compiler.assembler_.SetRegisterToCp(1);
    compiler.assembler_.Accept();

    return std::move(compiler.assembler_).IntoCode();
  }

 private:
  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}

  // Generate a disjunction of code fragments compiled by a function `gen_alt`.
  // `gen_alt` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
  // 1` and should build code corresponding to the ith alternative.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // A disjunction a1 | ... | an is compiled into
    //
    //     FORK tail1
    //     <a1>
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     <a2>
    //     JMP end
    //   tail2:
    //     ...
    //     ...
    //   tail{n - 1}:
    //     <an>
    //   end:
    //
    // By the semantics of the FORK instruction (see above at definition and
    // semantics), a forked thread has lower priority than the thread that
    // spawned it.  This means that with the code we're generating here, the
    // thread matching the alternative a1 has indeed highest priority, followed
    // by the thread for a2 and so on.
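    //
    // For instance, /a|b/ compiles to roughly:
    //
    //     FORK tail1
    //     CONSUME_RANGE 'a' 'a'
    //     JMP end
    //   tail1:
    //     CONSUME_RANGE 'b' 'b'
    //   end: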

    if (alt_num == 0) {
      // The empty disjunction.  This can never match.
      assembler_.Fail();
      return;
    }

    Label end;

    for (int i = 0; i != alt_num - 1; ++i) {
      Label tail;
      assembler_.Fork(tail);
      gen_alt(i);
      assembler_.Jmp(end);
      assembler_.Bind(tail);
    }

    gen_alt(alt_num - 1);

    assembler_.Bind(end);
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }
382 
VisitAssertion(RegExpAssertion * node,void *)383   void* VisitAssertion(RegExpAssertion* node, void*) override {
384     assembler_.Assertion(node->assertion_type());
385     return nullptr;
386   }
387 
VisitCharacterClass(RegExpCharacterClass * node,void *)388   void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
389     // A character class is compiled as Disjunction over its `CharacterRange`s.
390     ZoneList<CharacterRange>* ranges = node->ranges(zone_);
391     CharacterRange::Canonicalize(ranges);
392     if (node->is_negated()) {
393       // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
394       // union of k intervals is a union of at most k + 1 intervals.
395       ZoneList<CharacterRange>* negated =
396           zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
397       CharacterRange::Negate(ranges, negated, zone_);
398       DCHECK_LE(negated->length(), ranges->length() + 1);
399       ranges = negated;
400     }
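
    // E.g. /[a-cx]/ compiles to a disjunction of CONSUME_RANGE('a', 'c')
    // and CONSUME_RANGE('x', 'x'); negation turns /[^a]/ into
    // CONSUME_RANGE(0, 'a' - 1) | CONSUME_RANGE('a' + 1, 0xFFFF).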

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support UTF-16 for now, so we can only handle ranges that
      // can be specified by (complements of) ranges with base::uc16 bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <=
                    std::numeric_limits<base::uc16>::max());

      base::uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      base::uc16 from_uc16 = static_cast<base::uc16>(from);

      base::uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint);
      base::uc16 to_uc16 =
          static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint));

      assembler_.ConsumeRange(from_uc16, to_uc16);
    });
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
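    // E.g. the atom /abc/ becomes three consecutive CONSUME_RANGE
    // instructions, one per code unit.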
    for (base::uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
    }
    return nullptr;
  }

  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
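    // Capture i uses the register pair 2 * i (begin) and 2 * i + 1 (end),
    // so e.g. captures 1 and 2 occupy registers 2..5, and clearing
    // registers 2 and 4 marks both captures as undefined.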
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a capture
      // because this indicates that the capture is undefined, regardless of
      // the value in the `end` register.
      assembler_.ClearRegister(i);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   begin:
    //     FORK end
    //     <body>
    //     JMP begin
    //   end:
    //     ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
    Label begin;
    Label end;

    assembler_.Bind(begin);
    assembler_.Fork(end);
    emit_body();
    assembler_.Jmp(begin);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //     FORK body
    //     JMP end
    //   body:
    //     <body>
    //     FORK body
    //   end:
    //     ...
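    //
    // This is non-greedy because the JMP to `end` is executed by the
    // spawning thread, which has higher priority than the forked thread
    // that enters the body.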

    Label body;
    Label end;

    assembler_.Fork(body);
    assembler_.Jmp(end);

    assembler_.Bind(body);
    emit_body();
    assembler_.Fork(body);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK end
    //     <body>
    //     FORK end
    //     <body>
    //     ...
    //     ...
    //     FORK end
    //     <body>
    //   end:
    //     ...
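    //
    // This is greedy because at each FORK the spawning thread continues
    // into <body> with higher priority than the forked thread that jumps
    // to `end`.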

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      assembler_.Fork(end);
      emit_body();
    }
    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK body0
    //     JMP end
    //   body0:
    //     <body>
    //     FORK body1
    //     JMP end
    //   body1:
    //     <body>
    //     ...
    //     ...
    //   body{max_repetition_num - 1}:
    //     <body>
    //   end:
    //     ...

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body;
      assembler_.Fork(body);
      assembler_.Jmp(end);

      assembler_.Bind(body);
      emit_body();
    }
    assembler_.Bind(end);
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them. For example, we don't have to capture anything for
    // the first 4 repetitions if node->min() >= 5, and then we don't have to
    // clear registers in the first node->min() repetitions.
    // Further, if node->min() == 0, we don't have to clear registers before
    // the first optional repetition.
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();
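
    // E.g. for /a{2,4}/ the loop above emits the body twice; the greedy
    // case below then adds max() - min() == 2 optional copies.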

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);
    assembler_.SetRegisterToCp(start_register);
    node->body()->Accept(this, nullptr);
    assembler_.SetRegisterToCp(end_register);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  BytecodeAssembler assembler_;
};

}  // namespace

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, RegExpFlags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8