1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/experimental/experimental-compiler.h"
6
7 #include "src/base/strings.h"
8 #include "src/regexp/experimental/experimental.h"
9 #include "src/zone/zone-list-inl.h"
10
11 namespace v8 {
12 namespace internal {
13
14 namespace {
15
16 // TODO(mbid, v8:10765): Currently the experimental engine doesn't support
17 // UTF-16, but this shouldn't be too hard to implement.
18 constexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu;
19 #ifdef DEBUG
20 constexpr base::uc32 kMaxCodePoint = 0x10ffff;
21 #endif // DEBUG
22
23 class CanBeHandledVisitor final : private RegExpVisitor {
24 // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
25 public:
Check(RegExpTree * tree,RegExpFlags flags,int capture_count)26 static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) {
27 if (!AreSuitableFlags(flags)) return false;
28 CanBeHandledVisitor visitor;
29 tree->Accept(&visitor, nullptr);
30 return visitor.result_;
31 }
32
33 private:
34 CanBeHandledVisitor() = default;
35
AreSuitableFlags(RegExpFlags flags)36 static bool AreSuitableFlags(RegExpFlags flags) {
37 // TODO(mbid, v8:10765): We should be able to support all flags in the
38 // future.
39 static constexpr RegExpFlags kAllowedFlags =
40 RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline |
41 RegExpFlag::kDotAll | RegExpFlag::kLinear;
42 // We support Unicode iff kUnicode is among the supported flags.
43 STATIC_ASSERT(ExperimentalRegExp::kSupportsUnicode ==
44 IsUnicode(kAllowedFlags));
45 return (flags & ~kAllowedFlags) == 0;
46 }
47
VisitDisjunction(RegExpDisjunction * node,void *)48 void* VisitDisjunction(RegExpDisjunction* node, void*) override {
49 for (RegExpTree* alt : *node->alternatives()) {
50 alt->Accept(this, nullptr);
51 if (!result_) {
52 return nullptr;
53 }
54 }
55 return nullptr;
56 }
57
VisitAlternative(RegExpAlternative * node,void *)58 void* VisitAlternative(RegExpAlternative* node, void*) override {
59 for (RegExpTree* child : *node->nodes()) {
60 child->Accept(this, nullptr);
61 if (!result_) {
62 return nullptr;
63 }
64 }
65 return nullptr;
66 }
67
VisitCharacterClass(RegExpCharacterClass * node,void *)68 void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
69 return nullptr;
70 }
71
VisitAssertion(RegExpAssertion * node,void *)72 void* VisitAssertion(RegExpAssertion* node, void*) override {
73 return nullptr;
74 }
75
VisitAtom(RegExpAtom * node,void *)76 void* VisitAtom(RegExpAtom* node, void*) override {
77 return nullptr;
78 }
79
VisitText(RegExpText * node,void *)80 void* VisitText(RegExpText* node, void*) override {
81 for (TextElement& el : *node->elements()) {
82 el.tree()->Accept(this, nullptr);
83 if (!result_) {
84 return nullptr;
85 }
86 }
87 return nullptr;
88 }
89
VisitQuantifier(RegExpQuantifier * node,void *)90 void* VisitQuantifier(RegExpQuantifier* node, void*) override {
91 // Finite but large values of `min()` and `max()` are bad for the
92 // breadth-first engine because finite (optional) repetition is dealt with
93 // by replicating the bytecode of the body of the quantifier. The number
94 // of replicatons grows exponentially in how deeply quantifiers are nested.
95 // `replication_factor_` keeps track of how often the current node will
96 // have to be replicated in the generated bytecode, and we don't allow this
97 // to exceed some small value.
98 static constexpr int kMaxReplicationFactor = 16;
99
100 // First we rule out values for min and max that are too big even before
101 // taking into account the ambient replication_factor_. This also guards
102 // against overflows in `local_replication` or `replication_factor_`.
103 if (node->min() > kMaxReplicationFactor ||
104 (node->max() != RegExpTree::kInfinity &&
105 node->max() > kMaxReplicationFactor)) {
106 result_ = false;
107 return nullptr;
108 }
109
110 // Save the current replication factor so that it can be restored if we
111 // return with `result_ == true`.
112 int before_replication_factor = replication_factor_;
113
114 int local_replication;
115 if (node->max() == RegExpTree::kInfinity) {
116 local_replication = node->min() + 1;
117 } else {
118 local_replication = node->max();
119 }
120
121 replication_factor_ *= local_replication;
122 if (replication_factor_ > kMaxReplicationFactor) {
123 result_ = false;
124 return nullptr;
125 }
126
127 switch (node->quantifier_type()) {
128 case RegExpQuantifier::GREEDY:
129 case RegExpQuantifier::NON_GREEDY:
130 break;
131 case RegExpQuantifier::POSSESSIVE:
132 // TODO(mbid, v8:10765): It's not clear to me whether this can be
133 // supported in breadth-first mode. Re2 doesn't support it.
134 result_ = false;
135 return nullptr;
136 }
137
138 node->body()->Accept(this, nullptr);
139 replication_factor_ = before_replication_factor;
140 return nullptr;
141 }
142
VisitCapture(RegExpCapture * node,void *)143 void* VisitCapture(RegExpCapture* node, void*) override {
144 node->body()->Accept(this, nullptr);
145 return nullptr;
146 }
147
VisitGroup(RegExpGroup * node,void *)148 void* VisitGroup(RegExpGroup* node, void*) override {
149 node->body()->Accept(this, nullptr);
150 return nullptr;
151 }
152
VisitLookaround(RegExpLookaround * node,void *)153 void* VisitLookaround(RegExpLookaround* node, void*) override {
154 // TODO(mbid, v8:10765): This will be hard to support, but not impossible I
155 // think. See product automata.
156 result_ = false;
157 return nullptr;
158 }
159
VisitBackReference(RegExpBackReference * node,void *)160 void* VisitBackReference(RegExpBackReference* node, void*) override {
161 // This can't be implemented without backtracking.
162 result_ = false;
163 return nullptr;
164 }
165
VisitEmpty(RegExpEmpty * node,void *)166 void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
167
168 private:
169 // See comment in `VisitQuantifier`:
170 int replication_factor_ = 1;
171
172 bool result_ = true;
173 };
174
175 } // namespace
176
CanBeHandled(RegExpTree * tree,RegExpFlags flags,int capture_count)177 bool ExperimentalRegExpCompiler::CanBeHandled(RegExpTree* tree,
178 RegExpFlags flags,
179 int capture_count) {
180 return CanBeHandledVisitor::Check(tree, flags, capture_count);
181 }
182
183 namespace {
184
185 // A label in bytecode which starts with no known address. The address *must*
186 // be bound with `Bind` before the label goes out of scope.
187 // Implemented as a linked list through the `payload.pc` of FORK and JMP
188 // instructions.
189 struct Label {
190 public:
191 Label() = default;
~Labelv8::internal::__anon6d2affe80211::Label192 ~Label() {
193 DCHECK_EQ(state_, BOUND);
194 DCHECK_GE(bound_index_, 0);
195 }
196
197 // Don't copy, don't move. Moving could be implemented, but it's not
198 // needed anywhere.
199 Label(const Label&) = delete;
200 Label& operator=(const Label&) = delete;
201
202 private:
203 friend class BytecodeAssembler;
204
205 // UNBOUND implies unbound_patch_list_begin_.
206 // BOUND implies bound_index_.
207 enum { UNBOUND, BOUND } state_ = UNBOUND;
208 union {
209 int unbound_patch_list_begin_ = -1;
210 int bound_index_;
211 };
212 };
213
214 class BytecodeAssembler {
215 public:
216 // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
217 // the `tree` size we're going to compile?
BytecodeAssembler(Zone * zone)218 explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}
219
IntoCode()220 ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }
221
Accept()222 void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }
223
Assertion(RegExpAssertion::Type t)224 void Assertion(RegExpAssertion::Type t) {
225 code_.Add(RegExpInstruction::Assertion(t), zone_);
226 }
227
ClearRegister(int32_t register_index)228 void ClearRegister(int32_t register_index) {
229 code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
230 }
231
ConsumeRange(base::uc16 from,base::uc16 to)232 void ConsumeRange(base::uc16 from, base::uc16 to) {
233 code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
234 }
235
ConsumeAnyChar()236 void ConsumeAnyChar() {
237 code_.Add(RegExpInstruction::ConsumeAnyChar(), zone_);
238 }
239
Fork(Label & target)240 void Fork(Label& target) {
241 LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
242 }
243
Jmp(Label & target)244 void Jmp(Label& target) {
245 LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
246 }
247
SetRegisterToCp(int32_t register_index)248 void SetRegisterToCp(int32_t register_index) {
249 code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
250 }
251
Bind(Label & target)252 void Bind(Label& target) {
253 DCHECK_EQ(target.state_, Label::UNBOUND);
254
255 int index = code_.length();
256
257 while (target.unbound_patch_list_begin_ != -1) {
258 RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
259 DCHECK(inst.opcode == RegExpInstruction::FORK ||
260 inst.opcode == RegExpInstruction::JMP);
261
262 target.unbound_patch_list_begin_ = inst.payload.pc;
263 inst.payload.pc = index;
264 }
265
266 target.state_ = Label::BOUND;
267 target.bound_index_ = index;
268 }
269
Fail()270 void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }
271
272 private:
LabelledInstrImpl(RegExpInstruction::Opcode op,Label & target)273 void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
274 RegExpInstruction result;
275 result.opcode = op;
276
277 if (target.state_ == Label::BOUND) {
278 result.payload.pc = target.bound_index_;
279 } else {
280 DCHECK_EQ(target.state_, Label::UNBOUND);
281 int new_list_begin = code_.length();
282 DCHECK_GE(new_list_begin, 0);
283
284 result.payload.pc = target.unbound_patch_list_begin_;
285
286 target.unbound_patch_list_begin_ = new_list_begin;
287 }
288
289 code_.Add(result, zone_);
290 }
291
292 Zone* zone_;
293 ZoneList<RegExpInstruction> code_;
294 };
295
296 class CompileVisitor : private RegExpVisitor {
297 public:
Compile(RegExpTree * tree,RegExpFlags flags,Zone * zone)298 static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
299 RegExpFlags flags, Zone* zone) {
300 CompileVisitor compiler(zone);
301
302 if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) {
303 // The match is not anchored, i.e. may start at any input position, so we
304 // emit a preamble corresponding to /.*?/. This skips an arbitrary
305 // prefix in the input non-greedily.
306 compiler.CompileNonGreedyStar(
307 [&]() { compiler.assembler_.ConsumeAnyChar(); });
308 }
309
310 compiler.assembler_.SetRegisterToCp(0);
311 tree->Accept(&compiler, nullptr);
312 compiler.assembler_.SetRegisterToCp(1);
313 compiler.assembler_.Accept();
314
315 return std::move(compiler.assembler_).IntoCode();
316 }
317
318 private:
CompileVisitor(Zone * zone)319 explicit CompileVisitor(Zone* zone) : zone_(zone), assembler_(zone) {}
320
321 // Generate a disjunction of code fragments compiled by a function `alt_gen`.
322 // `alt_gen` is called repeatedly with argument `int i = 0, 1, ..., alt_num -
323 // 1` and should build code corresponding to the ith alternative.
324 template <class F>
CompileDisjunction(int alt_num,F && gen_alt)325 void CompileDisjunction(int alt_num, F&& gen_alt) {
326 // An alternative a1 | ... | an is compiled into
327 //
328 // FORK tail1
329 // <a1>
330 // JMP end
331 // tail1:
332 // FORK tail2
333 // <a2>
334 // JMP end
335 // tail2:
336 // ...
337 // ...
338 // tail{n -1}:
339 // <an>
340 // end:
341 //
342 // By the semantics of the FORK instruction (see above at definition and
343 // semantics), a forked thread has lower priority than the thread that
344 // spawned it. This means that with the code we're generating here, the
345 // thread matching the alternative a1 has indeed highest priority, followed
346 // by the thread for a2 and so on.
347
348 if (alt_num == 0) {
349 // The empty disjunction. This can never match.
350 assembler_.Fail();
351 return;
352 }
353
354 Label end;
355
356 for (int i = 0; i != alt_num - 1; ++i) {
357 Label tail;
358 assembler_.Fork(tail);
359 gen_alt(i);
360 assembler_.Jmp(end);
361 assembler_.Bind(tail);
362 }
363
364 gen_alt(alt_num - 1);
365
366 assembler_.Bind(end);
367 }
368
VisitDisjunction(RegExpDisjunction * node,void *)369 void* VisitDisjunction(RegExpDisjunction* node, void*) override {
370 ZoneList<RegExpTree*>& alts = *node->alternatives();
371 CompileDisjunction(alts.length(),
372 [&](int i) { alts[i]->Accept(this, nullptr); });
373 return nullptr;
374 }
375
VisitAlternative(RegExpAlternative * node,void *)376 void* VisitAlternative(RegExpAlternative* node, void*) override {
377 for (RegExpTree* child : *node->nodes()) {
378 child->Accept(this, nullptr);
379 }
380 return nullptr;
381 }
382
VisitAssertion(RegExpAssertion * node,void *)383 void* VisitAssertion(RegExpAssertion* node, void*) override {
384 assembler_.Assertion(node->assertion_type());
385 return nullptr;
386 }
387
VisitCharacterClass(RegExpCharacterClass * node,void *)388 void* VisitCharacterClass(RegExpCharacterClass* node, void*) override {
389 // A character class is compiled as Disjunction over its `CharacterRange`s.
390 ZoneList<CharacterRange>* ranges = node->ranges(zone_);
391 CharacterRange::Canonicalize(ranges);
392 if (node->is_negated()) {
393 // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
394 // union of k intervals is a union of at most k + 1 intervals.
395 ZoneList<CharacterRange>* negated =
396 zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
397 CharacterRange::Negate(ranges, negated, zone_);
398 DCHECK_LE(negated->length(), ranges->length() + 1);
399 ranges = negated;
400 }
401
402 CompileDisjunction(ranges->length(), [&](int i) {
403 // We don't support utf16 for now, so only ranges that can be specified
404 // by (complements of) ranges with base::uc16 bounds.
405 STATIC_ASSERT(kMaxSupportedCodepoint <=
406 std::numeric_limits<base::uc16>::max());
407
408 base::uc32 from = (*ranges)[i].from();
409 DCHECK_LE(from, kMaxSupportedCodepoint);
410 base::uc16 from_uc16 = static_cast<base::uc16>(from);
411
412 base::uc32 to = (*ranges)[i].to();
413 DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint);
414 base::uc16 to_uc16 =
415 static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint));
416
417 assembler_.ConsumeRange(from_uc16, to_uc16);
418 });
419 return nullptr;
420 }
421
VisitAtom(RegExpAtom * node,void *)422 void* VisitAtom(RegExpAtom* node, void*) override {
423 for (base::uc16 c : node->data()) {
424 assembler_.ConsumeRange(c, c);
425 }
426 return nullptr;
427 }
428
ClearRegisters(Interval indices)429 void ClearRegisters(Interval indices) {
430 if (indices.is_empty()) return;
431 DCHECK_EQ(indices.from() % 2, 0);
432 DCHECK_EQ(indices.to() % 2, 1);
433 for (int i = indices.from(); i <= indices.to(); i += 2) {
434 // It suffices to clear the register containing the `begin` of a capture
435 // because this indicates that the capture is undefined, regardless of
436 // the value in the `end` register.
437 assembler_.ClearRegister(i);
438 }
439 }
440
441 // Emit bytecode corresponding to /<emit_body>*/.
442 template <class F>
CompileGreedyStar(F && emit_body)443 void CompileGreedyStar(F&& emit_body) {
444 // This is compiled into
445 //
446 // begin:
447 // FORK end
448 // <body>
449 // JMP begin
450 // end:
451 // ...
452 //
453 // This is greedy because a forked thread has lower priority than the
454 // thread that spawned it.
455 Label begin;
456 Label end;
457
458 assembler_.Bind(begin);
459 assembler_.Fork(end);
460 emit_body();
461 assembler_.Jmp(begin);
462
463 assembler_.Bind(end);
464 }
465
466 // Emit bytecode corresponding to /<emit_body>*?/.
467 template <class F>
CompileNonGreedyStar(F && emit_body)468 void CompileNonGreedyStar(F&& emit_body) {
469 // This is compiled into
470 //
471 // FORK body
472 // JMP end
473 // body:
474 // <body>
475 // FORK body
476 // end:
477 // ...
478
479 Label body;
480 Label end;
481
482 assembler_.Fork(body);
483 assembler_.Jmp(end);
484
485 assembler_.Bind(body);
486 emit_body();
487 assembler_.Fork(body);
488
489 assembler_.Bind(end);
490 }
491
492 // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
493 template <class F>
CompileGreedyRepetition(F && emit_body,int max_repetition_num)494 void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
495 // This is compiled into
496 //
497 // FORK end
498 // <body>
499 // FORK end
500 // <body>
501 // ...
502 // ...
503 // FORK end
504 // <body>
505 // end:
506 // ...
507
508 Label end;
509 for (int i = 0; i != max_repetition_num; ++i) {
510 assembler_.Fork(end);
511 emit_body();
512 }
513 assembler_.Bind(end);
514 }
515
516 // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
517 template <class F>
CompileNonGreedyRepetition(F && emit_body,int max_repetition_num)518 void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
519 // This is compiled into
520 //
521 // FORK body0
522 // JMP end
523 // body0:
524 // <body>
525 // FORK body1
526 // JMP end
527 // body1:
528 // <body>
529 // ...
530 // ...
531 // body{max_repetition_num - 1}:
532 // <body>
533 // end:
534 // ...
535
536 Label end;
537 for (int i = 0; i != max_repetition_num; ++i) {
538 Label body;
539 assembler_.Fork(body);
540 assembler_.Jmp(end);
541
542 assembler_.Bind(body);
543 emit_body();
544 }
545 assembler_.Bind(end);
546 }
547
VisitQuantifier(RegExpQuantifier * node,void *)548 void* VisitQuantifier(RegExpQuantifier* node, void*) override {
549 // Emit the body, but clear registers occuring in body first.
550 //
551 // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
552 // and b) clear them. For example, we don't have to capture anything for
553 // the first 4 repetitions if node->min() >= 5, and then we don't have to
554 // clear registers in the first node->min() repetitions.
555 // Later, and if node->min() == 0, we don't have to clear registers before
556 // the first optional repetition.
557 Interval body_registers = node->body()->CaptureRegisters();
558 auto emit_body = [&]() {
559 ClearRegisters(body_registers);
560 node->body()->Accept(this, nullptr);
561 };
562
563 // First repeat the body `min()` times.
564 for (int i = 0; i != node->min(); ++i) emit_body();
565
566 switch (node->quantifier_type()) {
567 case RegExpQuantifier::POSSESSIVE:
568 UNREACHABLE();
569 case RegExpQuantifier::GREEDY: {
570 if (node->max() == RegExpTree::kInfinity) {
571 CompileGreedyStar(emit_body);
572 } else {
573 DCHECK_NE(node->max(), RegExpTree::kInfinity);
574 CompileGreedyRepetition(emit_body, node->max() - node->min());
575 }
576 break;
577 }
578 case RegExpQuantifier::NON_GREEDY: {
579 if (node->max() == RegExpTree::kInfinity) {
580 CompileNonGreedyStar(emit_body);
581 } else {
582 DCHECK_NE(node->max(), RegExpTree::kInfinity);
583 CompileNonGreedyRepetition(emit_body, node->max() - node->min());
584 }
585 }
586 }
587 return nullptr;
588 }
589
VisitCapture(RegExpCapture * node,void *)590 void* VisitCapture(RegExpCapture* node, void*) override {
591 int index = node->index();
592 int start_register = RegExpCapture::StartRegister(index);
593 int end_register = RegExpCapture::EndRegister(index);
594 assembler_.SetRegisterToCp(start_register);
595 node->body()->Accept(this, nullptr);
596 assembler_.SetRegisterToCp(end_register);
597 return nullptr;
598 }
599
VisitGroup(RegExpGroup * node,void *)600 void* VisitGroup(RegExpGroup* node, void*) override {
601 node->body()->Accept(this, nullptr);
602 return nullptr;
603 }
604
VisitLookaround(RegExpLookaround * node,void *)605 void* VisitLookaround(RegExpLookaround* node, void*) override {
606 // TODO(mbid,v8:10765): Support this case.
607 UNREACHABLE();
608 }
609
VisitBackReference(RegExpBackReference * node,void *)610 void* VisitBackReference(RegExpBackReference* node, void*) override {
611 UNREACHABLE();
612 }
613
VisitEmpty(RegExpEmpty * node,void *)614 void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
615
VisitText(RegExpText * node,void *)616 void* VisitText(RegExpText* node, void*) override {
617 for (TextElement& text_el : *node->elements()) {
618 text_el.tree()->Accept(this, nullptr);
619 }
620 return nullptr;
621 }
622
623 private:
624 Zone* zone_;
625 BytecodeAssembler assembler_;
626 };
627
628 } // namespace
629
Compile(RegExpTree * tree,RegExpFlags flags,Zone * zone)630 ZoneList<RegExpInstruction> ExperimentalRegExpCompiler::Compile(
631 RegExpTree* tree, RegExpFlags flags, Zone* zone) {
632 return CompileVisitor::Compile(tree, flags, zone);
633 }
634
635 } // namespace internal
636 } // namespace v8
637