// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
                    int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(JSRegExp::Flags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr JSRegExp::Flags kAllowedFlags =
        JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
        JSRegExp::kDotAll | JSRegExp::kLinear;
    // We support Unicode iff kUnicode is among the supported flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  ((kAllowedFlags & JSRegExp::kUnicode) != 0));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt with
    // by replicating the bytecode of the body of the quantifier.  The number
    // of replications grows exponentially in how deeply quantifiers are
    // nested.  `replication_factor_` keeps track of how often the current
    // node will have to be replicated in the generated bytecode, and we don't
    // allow this to exceed some small value.
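    //
    // For example, expanding the nested quantifiers in /((a{4}){4}){4}/ would
    // replicate the bytecode of the innermost atom 4 * 4 * 4 = 64 times, so
    // `Check` reports that such a pattern can't be handled.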
    static constexpr int kMaxReplicationFactor = 16;

    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_.  This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

    int local_replication;
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode. Re2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible I
    // think.  See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              JSRegExp::Flags flags,
                                              int capture_count) {
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace {

// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
struct Label {
 public:
  Label() = default;
  ~Label() {
    DCHECK_EQ(state_, BOUND);
    DCHECK_GE(bound_index_, 0);
  }

  // Don't copy, don't move.  Moving could be implemented, but it's not
  // needed anywhere.
  Label(const Label&) = delete;
  Label& operator=(const Label&) = delete;

 private:
  friend class BytecodeAssembler;

  // UNBOUND implies unbound_patch_list_begin_.
  // BOUND implies bound_index_.
  enum { UNBOUND, BOUND } state_ = UNBOUND;
  union {
    int unbound_patch_list_begin_ = -1;
    int bound_index_;
  };
};

class BytecodeAssembler {
 public:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
  // the `tree` size we're going to compile?
  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}

  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }

  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }

  void Assertion(RegExpAssertion::AssertionType t) {
    code_.Add(RegExpInstruction::Assertion(t), zone_);
  }

  void ClearRegister(int32_t register_index) {
    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
  }

  void ConsumeRange(uc16 from, uc16 to) {
    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
  }

  void ConsumeAnyChar() {
    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
  }

  void Fork(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
  }

  void Jmp(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
  }

  void SetRegisterToCp(int32_t register_index) {
    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
  }

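  // Bind `target` to the current position in the bytecode: every FORK/JMP
  // instruction on the label's patch list is patched to jump here, and later
  // jumps to this label use the bound index directly.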
  void Bind(Label& target) {
    DCHECK_EQ(target.state_, Label::UNBOUND);

    int index = code_.length();

    while (target.unbound_patch_list_begin_ != -1) {
      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);

      target.unbound_patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    target.state_ = Label::BOUND;
    target.bound_index_ = index;
  }

  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }

 private:
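  // Emit a FORK or JMP instruction targeting `target`.  If the label is
  // already bound, its address is written directly; otherwise the new
  // instruction is prepended to the label's patch list (threaded through
  // `payload.pc`) and patched later in `Bind`.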
  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
    RegExpInstruction result;
    result.opcode = op;

    if (target.state_ == Label::BOUND) {
      result.payload.pc = target.bound_index_;
    } else {
      DCHECK_EQ(target.state_, Label::UNBOUND);
      int new_list_begin = code_.length();
      DCHECK_GE(new_list_begin, 0);

      result.payload.pc = target.unbound_patch_list_begin_;

      target.unbound_patch_list_begin_ = new_list_begin;
    }

    code_.Add(result, zone_);
  }

  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             JSRegExp::Flags flags,
                                             Zone* zone) {
    CompileVisitor compiler(zone);

    if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so we
      // emit a preamble corresponding to /.*?/.  This skips an arbitrary
      // prefix in the input non-greedily.
      compiler.CompileNonGreedyStar(
          [&]() { compiler.assembler_.ConsumeAnyChar(); });
    }

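    // Registers 0 and 1 hold the start and end position of the overall match
    // (i.e. capture group 0).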
    compiler.assembler_.SetRegisterToCp(0);
    tree->Accept(&compiler, nullptr);
    compiler.assembler_.SetRegisterToCp(1);
    compiler.assembler_.Accept();

    return std::move(compiler.assembler_).IntoCode();
  }

 private:
  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}

  // Generate a disjunction of code fragments compiled by a function `gen_alt`.
  // `gen_alt` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
  // 1` and should build code corresponding to the ith alternative.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // A disjunction a1 | ... | an is compiled into
    //
    //     FORK tail1
    //     <a1>
    //     JMP end
    //   tail1:
    //     FORK tail2
    //     <a2>
    //     JMP end
    //   tail2:
    //     ...
    //     ...
    //   tail{n - 1}:
    //     <an>
    //   end:
    //
    // By the semantics of the FORK instruction (see above at definition and
    // semantics), a forked thread has lower priority than the thread that
    // spawned it.  This means that with the code we're generating here, the
    // thread matching the alternative a1 has indeed highest priority, followed
    // by the thread for a2 and so on.
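    //
    // For example, /a|b/ is compiled into
    //
    //     FORK tail1
    //     <a>
    //     JMP end
    //   tail1:
    //     <b>
    //   end: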

    if (alt_num == 0) {
      // The empty disjunction.  This can never match.
      assembler_.Fail();
      return;
    }

    Label end;

    for (int i = 0; i != alt_num - 1; ++i) {
      Label tail;
      assembler_.Fork(tail);
      gen_alt(i);
      assembler_.Jmp(end);
      assembler_.Bind(tail);
    }

    gen_alt(alt_num - 1);

    assembler_.Bind(end);
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    assembler_.Assertion(node->assertion_type());
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    // A character class is compiled as Disjunction over its `CharacterRange`s.
    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    CharacterRange::Canonicalize(ranges);
    if (node->is_negated()) {
      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
      // union of k intervals is a union of at most k + 1 intervals.
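      // For example, the complement of the two ranges [a-c] and [x-z] consists
      // of the three ranges [\0, 'a'-1], ['c'+1, 'x'-1] and ['z'+1, max].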
      ZoneList<CharacterRange>* negated =
          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
      CharacterRange::Negate(ranges, negated, zone_);
      DCHECK_LE(negated->length(), ranges->length() + 1);
      ranges = negated;
    }

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support utf16 for now, so we only support ranges that can be
      // specified by (complements of) ranges with uc16 bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <= std::numeric_limits<uc16>::max());

      uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      uc16 from_uc16 = static_cast<uc16>(from);

      uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint);
      uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));

      assembler_.ConsumeRange(from_uc16, to_uc16);
    });
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    for (uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
    }
    return nullptr;
  }

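  // Clear the `begin` register of every capture group whose registers lie in
  // `indices`; this marks those captures as undefined.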
  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a capture
      // because this indicates that the capture is undefined, regardless of
      // the value in the `end` register.
      assembler_.ClearRegister(i);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   begin:
    //     FORK end
    //     <body>
    //     JMP begin
    //   end:
    //     ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
    Label begin;
    Label end;

    assembler_.Bind(begin);
    assembler_.Fork(end);
    emit_body();
    assembler_.Jmp(begin);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //     FORK body
    //     JMP end
    //   body:
    //     <body>
    //     FORK body
    //   end:
    //     ...
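    //
    // This is non-greedy because the thread that skips the body (via JMP end)
    // is the spawning thread and thus has higher priority than the forked
    // thread that enters the body.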

    Label body;
    Label end;

    assembler_.Fork(body);
    assembler_.Jmp(end);

    assembler_.Bind(body);
    emit_body();
    assembler_.Fork(body);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK end
    //     <body>
    //     FORK end
    //     <body>
    //     ...
    //     ...
    //     FORK end
    //     <body>
    //   end:
    //     ...
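    //
    // Each FORK to `end` offers the option of stopping after the repetitions
    // emitted so far; since forked threads have lower priority, continuing
    // with another repetition is preferred, which makes this greedy.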

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      assembler_.Fork(end);
      emit_body();
    }
    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //     FORK body0
    //     JMP end
    //   body0:
    //     <body>
    //     FORK body1
    //     JMP end
    //   body1:
    //     <body>
    //     ...
    //     ...
    //   body{max_repetition_num - 1}:
    //     <body>
    //   end:
    //     ...
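    //
    // Here each FORK spawns the (lower-priority) thread that runs another
    // repetition of the body, while the spawning thread jumps straight to
    // `end`, so stopping early has priority and the repetition is non-greedy.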

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body;
      assembler_.Fork(body);
      assembler_.Jmp(end);

      assembler_.Bind(body);
      emit_body();
    }
    assembler_.Bind(end);
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them. For example, we don't have to capture anything for
    // the first 4 repetitions if node->min() >= 5, and then we don't have to
    // clear registers in the first node->min() repetitions.
    // Later, and if node->min() == 0, we don't have to clear registers before
    // the first optional repetition.
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);
    assembler_.SetRegisterToCp(start_register);
    node->body()->Accept(this, nullptr);
    assembler_.SetRegisterToCp(end_register);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  BytecodeAssembler assembler_;
};

}  // namespace

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8