// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/regexp/experimental/experimental-compiler.h"

#include "src/regexp/experimental/experimental.h"
#include "src/zone/zone-list-inl.h"

namespace v8 {
namespace internal {

namespace {

// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
// UTF-16, but this shouldn't be too hard to implement.
constexpr uc32 kMaxSupportedCodepoint = 0xFFFFu;

class CanBeHandledVisitor final : private RegExpVisitor {
  // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
 public:
  static bool Check(RegExpTree* tree, JSRegExp::Flags flags,
                    int capture_count) {
    if (!AreSuitableFlags(flags)) return false;
    CanBeHandledVisitor visitor;
    tree->Accept(&visitor, nullptr);
    return visitor.result_;
  }

 private:
  CanBeHandledVisitor() = default;

  static bool AreSuitableFlags(JSRegExp::Flags flags) {
    // TODO(mbid, v8:10765): We should be able to support all flags in the
    // future.
    static constexpr JSRegExp::Flags kAllowedFlags =
        JSRegExp::kGlobal | JSRegExp::kSticky | JSRegExp::kMultiline |
        JSRegExp::kDotAll | JSRegExp::kLinear;
    // We support Unicode iff kUnicode is among the supported flags.
    STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
                  ((kAllowedFlags & JSRegExp::kUnicode) != 0));
    return (flags & ~kAllowedFlags) == 0;
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    for (RegExpTree* alt : *node->alternatives()) {
      alt->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    result_ = result_ && AreSuitableFlags(node->flags());
    return nullptr;
  }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& el : *node->elements()) {
      el.tree()->Accept(this, nullptr);
      if (!result_) {
        return nullptr;
      }
    }
    return nullptr;
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Finite but large values of `min()` and `max()` are bad for the
    // breadth-first engine because finite (optional) repetition is dealt with
    // by replicating the bytecode of the body of the quantifier. The number
    // of replications grows exponentially in how deeply quantifiers are
    // nested. `replication_factor_` keeps track of how often the current node
    // will have to be replicated in the generated bytecode, and we don't
    // allow this to exceed some small value.
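    // For example, for /(a{2}){3}/ the bytecode of `a` ends up replicated
    // 2 * 3 = 6 times; each additional nesting level multiplies the factor.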
    static constexpr int kMaxReplicationFactor = 16;

    // First we rule out values for min and max that are too big even before
    // taking into account the ambient replication_factor_. This also guards
    // against overflows in `local_replication` or `replication_factor_`.
    if (node->min() > kMaxReplicationFactor ||
        (node->max() != RegExpTree::kInfinity &&
         node->max() > kMaxReplicationFactor)) {
      result_ = false;
      return nullptr;
    }

    // Save the current replication factor so that it can be restored if we
    // return with `result_ == true`.
    int before_replication_factor = replication_factor_;

    int local_replication;
    if (node->max() == RegExpTree::kInfinity) {
      local_replication = node->min() + 1;
    } else {
      local_replication = node->max();
    }
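    // The `min() + 1` in the infinite case accounts for the extra body copy
    // inside the trailing star loop: /a{2,}/ compiles into two mandatory
    // copies of the body followed by a star loop containing a third copy.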

    replication_factor_ *= local_replication;
    if (replication_factor_ > kMaxReplicationFactor) {
      result_ = false;
      return nullptr;
    }

    switch (node->quantifier_type()) {
      case RegExpQuantifier::GREEDY:
      case RegExpQuantifier::NON_GREEDY:
        break;
      case RegExpQuantifier::POSSESSIVE:
        // TODO(mbid, v8:10765): It's not clear to me whether this can be
        // supported in breadth-first mode. RE2 doesn't support it.
        result_ = false;
        return nullptr;
    }

    node->body()->Accept(this, nullptr);
    replication_factor_ = before_replication_factor;
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid, v8:10765): This will be hard to support, but not impossible
    // I think. See product automata.
    result_ = false;
    return nullptr;
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // This can't be implemented without backtracking.
    result_ = false;
    return nullptr;
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

 private:
  // See comment in `VisitQuantifier`:
  int replication_factor_ = 1;

  bool result_ = true;
};

}  // namespace

bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
                                              JSRegExp::Flags flags,
                                              int capture_count) {
  return CanBeHandledVisitor::Check(tree, flags, capture_count);
}

namespace {

// A label in bytecode which starts with no known address. The address *must*
// be bound with `Bind` before the label goes out of scope.
// Implemented as a linked list through the `payload.pc` of FORK and JMP
// instructions.
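// For example, after emitting two FORKs targeting the same unbound label,
// the label holds the index of the second FORK, whose payload.pc holds the
// index of the first FORK, whose payload.pc is -1 (end of list). `Bind` then
// walks this chain and patches each payload.pc to the label's final address.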
struct Label {
 public:
  Label() = default;
  ~Label() {
    DCHECK_EQ(state_, BOUND);
    DCHECK_GE(bound_index_, 0);
  }

  // Don't copy, don't move. Moving could be implemented, but it's not
  // needed anywhere.
  Label(const Label&) = delete;
  Label& operator=(const Label&) = delete;

 private:
  friend class BytecodeAssembler;

  // UNBOUND implies unbound_patch_list_begin_.
  // BOUND implies bound_index_.
  enum { UNBOUND, BOUND } state_ = UNBOUND;
  union {
    int unbound_patch_list_begin_ = -1;
    int bound_index_;
  };
};

class BytecodeAssembler {
 public:
  // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed
  // from the `tree` size we're going to compile?
  explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}

  ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }

  void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }

  void Assertion(RegExpAssertion::AssertionType t) {
    code_.Add(RegExpInstruction::Assertion(t), zone_);
  }

  void ClearRegister(int32_t register_index) {
    code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
  }

  void ConsumeRange(uc16 from, uc16 to) {
    code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
  }

  void ConsumeAnyChar() {
    code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
  }

  void Fork(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
  }

  void Jmp(Label& target) {
    LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
  }

  void SetRegisterToCp(int32_t register_index) {
    code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
  }

  void Bind(Label& target) {
    DCHECK_EQ(target.state_, Label::UNBOUND);

    int index = code_.length();

    while (target.unbound_patch_list_begin_ != -1) {
      RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
      DCHECK(inst.opcode == RegExpInstruction::FORK ||
             inst.opcode == RegExpInstruction::JMP);

      target.unbound_patch_list_begin_ = inst.payload.pc;
      inst.payload.pc = index;
    }

    target.state_ = Label::BOUND;
    target.bound_index_ = index;
  }

  void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }

 private:
  void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
    RegExpInstruction result;
    result.opcode = op;

    if (target.state_ == Label::BOUND) {
      result.payload.pc = target.bound_index_;
    } else {
      DCHECK_EQ(target.state_, Label::UNBOUND);
      int new_list_begin = code_.length();
      DCHECK_GE(new_list_begin, 0);

      result.payload.pc = target.unbound_patch_list_begin_;

      target.unbound_patch_list_begin_ = new_list_begin;
    }

    code_.Add(result, zone_);
  }

  Zone* zone_;
  ZoneList<RegExpInstruction> code_;
};

class CompileVisitor : private RegExpVisitor {
 public:
  static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
                                             JSRegExp::Flags flags,
                                             Zone* zone) {
    CompileVisitor compiler(zone);

    if ((flags & JSRegExp::kSticky) == 0 && !tree->IsAnchoredAtStart()) {
      // The match is not anchored, i.e. may start at any input position, so
      // we emit a preamble corresponding to /.*?/. This skips an arbitrary
      // prefix in the input non-greedily.
      compiler.CompileNonGreedyStar(
          [&]() { compiler.assembler_.ConsumeAnyChar(); });
    }

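    // Registers 0 and 1 receive the start and end position of the overall
    // match, i.e. they hold the implicit capture group 0.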
    compiler.assembler_.SetRegisterToCp(0);
    tree->Accept(&compiler, nullptr);
    compiler.assembler_.SetRegisterToCp(1);
    compiler.assembler_.Accept();

    return std::move(compiler.assembler_).IntoCode();
  }

 private:
  explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}

  // Generate a disjunction of code fragments compiled by a function
  // `gen_alt`.  `gen_alt` is called repeatedly with argument `int i = 0, 1,
  // ..., alt_num - 1` and should build code corresponding to the ith
  // alternative.
  template <class F>
  void CompileDisjunction(int alt_num, F&& gen_alt) {
    // A disjunction of alternatives a1 | ... | an is compiled into
    //
    //   FORK tail1
    //   <a1>
    //   JMP end
    // tail1:
    //   FORK tail2
    //   <a2>
    //   JMP end
    // tail2:
    //   ...
    //   ...
    // tail{n - 1}:
    //   <an>
    // end:
    //
    // By the semantics of the FORK instruction (see above at definition and
    // semantics), a forked thread has lower priority than the thread that
    // spawned it. This means that with the code we're generating here, the
    // thread matching the alternative a1 has indeed highest priority,
    // followed by the thread for a2 and so on.
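    // For example, /foo|foobar/ matches "foobar" with result "foo": the
    // thread for the first alternative has the highest priority, so it
    // reaches ACCEPT first.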

    if (alt_num == 0) {
      // The empty disjunction. This can never match.
      assembler_.Fail();
      return;
    }

    Label end;

    for (int i = 0; i != alt_num - 1; ++i) {
      Label tail;
      assembler_.Fork(tail);
      gen_alt(i);
      assembler_.Jmp(end);
      assembler_.Bind(tail);
    }

    gen_alt(alt_num - 1);

    assembler_.Bind(end);
  }

  void* VisitDisjunction(RegExpDisjunction* node, void*) override {
    ZoneList<RegExpTree*>& alts = *node->alternatives();
    CompileDisjunction(alts.length(),
                       [&](int i) { alts[i]->Accept(this, nullptr); });
    return nullptr;
  }

  void* VisitAlternative(RegExpAlternative* node, void*) override {
    for (RegExpTree* child : *node->nodes()) {
      child->Accept(this, nullptr);
    }
    return nullptr;
  }

  void* VisitAssertion(RegExpAssertion* node, void*) override {
    assembler_.Assertion(node->assertion_type());
    return nullptr;
  }

  void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
    // A character class is compiled as a disjunction over its
    // `CharacterRange`s.
    ZoneList<CharacterRange>* ranges = node->ranges(zone_);
    CharacterRange::Canonicalize(ranges);
    if (node->is_negated()) {
      // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
      // union of k intervals is a union of at most k + 1 intervals.
      ZoneList<CharacterRange>* negated =
          zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
      CharacterRange::Negate(ranges, negated, zone_);
      DCHECK_LE(negated->length(), ranges->length() + 1);
      ranges = negated;
    }
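    // For example, /[^b-d]/ yields the canonical range ['b', 'd'], which
    // Negate turns into the two ranges [0x0000, 'a'] and
    // ['e', String::kMaxCodePoint]; each range then becomes one alternative
    // of the disjunction below.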

    CompileDisjunction(ranges->length(), [&](int i) {
      // We don't support UTF-16 for now, so we can only consume ranges that
      // are specifiable by (complements of) ranges with uc16 bounds.
      STATIC_ASSERT(kMaxSupportedCodepoint <=
                    std::numeric_limits<uc16>::max());

      uc32 from = (*ranges)[i].from();
      DCHECK_LE(from, kMaxSupportedCodepoint);
      uc16 from_uc16 = static_cast<uc16>(from);

      uc32 to = (*ranges)[i].to();
      DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == String::kMaxCodePoint);
      uc16 to_uc16 = static_cast<uc16>(std::min(to, kMaxSupportedCodepoint));

      assembler_.ConsumeRange(from_uc16, to_uc16);
    });
    return nullptr;
  }

  void* VisitAtom(RegExpAtom* node, void*) override {
    for (uc16 c : node->data()) {
      assembler_.ConsumeRange(c, c);
    }
    return nullptr;
  }

  void ClearRegisters(Interval indices) {
    if (indices.is_empty()) return;
    // Capture registers come in (begin, end) pairs, so the interval spans an
    // even `begin` index up to an odd `end` index.
    DCHECK_EQ(indices.from() % 2, 0);
    DCHECK_EQ(indices.to() % 2, 1);
    for (int i = indices.from(); i <= indices.to(); i += 2) {
      // It suffices to clear the register containing the `begin` of a
      // capture because this indicates that the capture is undefined,
      // regardless of the value in the `end` register.
      assembler_.ClearRegister(i);
    }
  }

  // Emit bytecode corresponding to /<emit_body>*/.
  template <class F>
  void CompileGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    // begin:
    //   FORK end
    //   <body>
    //   JMP begin
    // end:
    //   ...
    //
    // This is greedy because a forked thread has lower priority than the
    // thread that spawned it.
    Label begin;
    Label end;

    assembler_.Bind(begin);
    assembler_.Fork(end);
    emit_body();
    assembler_.Jmp(begin);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>*?/.
  template <class F>
  void CompileNonGreedyStar(F&& emit_body) {
    // This is compiled into
    //
    //   FORK body
    //   JMP end
    // body:
    //   <body>
    //   FORK body
    // end:
    //   ...

    Label body;
    Label end;

    assembler_.Fork(body);
    assembler_.Jmp(end);

    assembler_.Bind(body);
    emit_body();
    assembler_.Fork(body);

    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
  template <class F>
  void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //   FORK end
    //   <body>
    //   FORK end
    //   <body>
    //   ...
    //   ...
    //   FORK end
    //   <body>
    // end:
    //   ...

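    // For example, /a{0,2}/ reaches this point with max_repetition_num == 2
    // (via VisitQuantifier below) and emits: FORK end, <a>, FORK end, <a>,
    // end.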
    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      assembler_.Fork(end);
      emit_body();
    }
    assembler_.Bind(end);
  }

  // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
  template <class F>
  void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
    // This is compiled into
    //
    //   FORK body0
    //   JMP end
    // body0:
    //   <body>
    //   FORK body1
    //   JMP end
    // body1:
    //   <body>
    //   ...
    //   ...
    // body{max_repetition_num - 1}:
    //   <body>
    // end:
    //   ...

    Label end;
    for (int i = 0; i != max_repetition_num; ++i) {
      Label body;
      assembler_.Fork(body);
      assembler_.Jmp(end);

      assembler_.Bind(body);
      emit_body();
    }
    assembler_.Bind(end);
  }

  void* VisitQuantifier(RegExpQuantifier* node, void*) override {
    // Emit the body, but clear registers occurring in the body first.
    //
    // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
    // and b) clear them. For example, we don't have to capture anything for
    // the first 4 repetitions if node->min() >= 5, and then we don't have to
    // clear registers in the first node->min() repetitions.
    // Later, and if node->min() == 0, we don't have to clear registers before
    // the first optional repetition.
    Interval body_registers = node->body()->CaptureRegisters();
    auto emit_body = [&]() {
      ClearRegisters(body_registers);
      node->body()->Accept(this, nullptr);
    };

    // First repeat the body `min()` times.
    for (int i = 0; i != node->min(); ++i) emit_body();
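
    // For example, for /a{2,4}/ the loop above emits the body twice; the
    // remaining max() - min() = 2 optional repetitions are handled by
    // CompileGreedyRepetition below.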

    switch (node->quantifier_type()) {
      case RegExpQuantifier::POSSESSIVE:
        UNREACHABLE();
      case RegExpQuantifier::GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileGreedyRepetition(emit_body, node->max() - node->min());
        }
        break;
      }
      case RegExpQuantifier::NON_GREEDY: {
        if (node->max() == RegExpTree::kInfinity) {
          CompileNonGreedyStar(emit_body);
        } else {
          DCHECK_NE(node->max(), RegExpTree::kInfinity);
          CompileNonGreedyRepetition(emit_body, node->max() - node->min());
        }
      }
    }
    return nullptr;
  }

  void* VisitCapture(RegExpCapture* node, void*) override {
    // The start and end positions of capture group `index` are written to a
    // dedicated pair of registers.
    int index = node->index();
    int start_register = RegExpCapture::StartRegister(index);
    int end_register = RegExpCapture::EndRegister(index);
    assembler_.SetRegisterToCp(start_register);
    node->body()->Accept(this, nullptr);
    assembler_.SetRegisterToCp(end_register);
    return nullptr;
  }

  void* VisitGroup(RegExpGroup* node, void*) override {
    node->body()->Accept(this, nullptr);
    return nullptr;
  }

  void* VisitLookaround(RegExpLookaround* node, void*) override {
    // TODO(mbid,v8:10765): Support this case.
    UNREACHABLE();
  }

  void* VisitBackReference(RegExpBackReference* node, void*) override {
    UNREACHABLE();
  }

  void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }

  void* VisitText(RegExpText* node, void*) override {
    for (TextElement& text_el : *node->elements()) {
      text_el.tree()->Accept(this, nullptr);
    }
    return nullptr;
  }

 private:
  Zone* zone_;
  BytecodeAssembler assembler_;
};

}  // namespace

ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
    RegExpTree* tree, JSRegExp::Flags flags, Zone* zone) {
  return CompileVisitor::Compile(tree, flags, zone);
}

}  // namespace internal
}  // namespace v8