1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26
27 #include <stdio.h>
28 #include <string.h>
29
30 #include <algorithm>
31 #include <deque>
32 #include <string>
33 #include <utility>
34
35 #include "absl/log/absl_check.h"
36 #include "absl/log/absl_log.h"
37 #include "absl/strings/str_format.h"
38 #include "absl/strings/string_view.h"
39 #include "re2/pod_array.h"
40 #include "re2/prog.h"
41 #include "re2/regexp.h"
42 #include "re2/sparse_array.h"
43 #include "re2/sparse_set.h"
44
45 namespace re2 {
46
47 static const bool ExtraDebug = false;
48
49 class NFA {
50 public:
51 NFA(Prog* prog);
52 ~NFA();
53
54 // Searches for a matching string.
55 // * If anchored is true, only considers matches starting at offset.
56 // Otherwise finds lefmost match at or after offset.
57 // * If longest is true, returns the longest match starting
58 // at the chosen start point. Otherwise returns the so-called
59 // left-biased match, the one traditional backtracking engines
60 // (like Perl and PCRE) find.
61 // Records submatch boundaries in submatch[1..nsubmatch-1].
62 // Submatch[0] is the entire match. When there is a choice in
63 // which text matches each subexpression, the submatch boundaries
64 // are chosen to match what a backtracking implementation would choose.
65 bool Search(absl::string_view text, absl::string_view context, bool anchored,
66 bool longest, absl::string_view* submatch, int nsubmatch);
67
68 private:
69 struct Thread {
70 union {
71 int ref;
72 Thread* next; // when on free list
73 };
74 const char** capture;
75 };
76
77 // State for explicit stack in AddToThreadq.
78 struct AddState {
79 int id; // Inst to process
80 Thread* t; // if not null, set t0 = t before processing id
81 };
82
83 // Threadq is a list of threads. The list is sorted by the order
84 // in which Perl would explore that particular state -- the earlier
85 // choices appear earlier in the list.
86 typedef SparseArray<Thread*> Threadq;
87
88 inline Thread* AllocThread();
89 inline Thread* Incref(Thread* t);
90 inline void Decref(Thread* t);
91
92 // Follows all empty arrows from id0 and enqueues all the states reached.
93 // Enqueues only the ByteRange instructions that match byte c.
94 // context is used (with p) for evaluating empty-width specials.
95 // p is the current input position, and t0 is the current thread.
96 void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
97 const char* p, Thread* t0);
98
99 // Run runq on byte c, appending new states to nextq.
100 // Updates matched_ and match_ as new, better matches are found.
101 // context is used (with p) for evaluating empty-width specials.
102 // p is the position of byte c in the input string for AddToThreadq;
103 // p-1 will be used when processing Match instructions.
104 // Frees all the threads on runq.
105 // If there is a shortcut to the end, returns that shortcut.
106 int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
107 const char* p);
108
109 // Returns text version of capture information, for debugging.
110 std::string FormatCapture(const char** capture);
111
CopyCapture(const char ** dst,const char ** src)112 void CopyCapture(const char** dst, const char** src) {
113 memmove(dst, src, ncapture_*sizeof src[0]);
114 }
115
116 Prog* prog_; // underlying program
117 int start_; // start instruction in program
118 int ncapture_; // number of submatches to track
119 bool longest_; // whether searching for longest match
120 bool endmatch_; // whether match must end at text.end()
121 const char* btext_; // beginning of text (for FormatSubmatch)
122 const char* etext_; // end of text (for endmatch_)
123 Threadq q0_, q1_; // pre-allocated for Search.
124 PODArray<AddState> stack_; // pre-allocated for AddToThreadq
125 std::deque<Thread> arena_; // thread arena
126 Thread* freelist_; // thread freelist
127 const char** match_; // best match so far
128 bool matched_; // any match so far?
129
130 NFA(const NFA&) = delete;
131 NFA& operator=(const NFA&) = delete;
132 };
133
NFA(Prog * prog)134 NFA::NFA(Prog* prog) {
135 prog_ = prog;
136 start_ = prog_->start();
137 ncapture_ = 0;
138 longest_ = false;
139 endmatch_ = false;
140 btext_ = NULL;
141 etext_ = NULL;
142 q0_.resize(prog_->size());
143 q1_.resize(prog_->size());
144 // See NFA::AddToThreadq() for why this is so.
145 int nstack = 2*prog_->inst_count(kInstCapture) +
146 prog_->inst_count(kInstEmptyWidth) +
147 prog_->inst_count(kInstNop) + 1; // + 1 for start inst
148 stack_ = PODArray<AddState>(nstack);
149 freelist_ = NULL;
150 match_ = NULL;
151 matched_ = false;
152 }
153
~NFA()154 NFA::~NFA() {
155 delete[] match_;
156 for (const Thread& t : arena_)
157 delete[] t.capture;
158 }
159
AllocThread()160 NFA::Thread* NFA::AllocThread() {
161 Thread* t = freelist_;
162 if (t != NULL) {
163 freelist_ = t->next;
164 t->ref = 1;
165 // We don't need to touch t->capture because
166 // the caller will immediately overwrite it.
167 return t;
168 }
169 arena_.emplace_back();
170 t = &arena_.back();
171 t->ref = 1;
172 t->capture = new const char*[ncapture_];
173 return t;
174 }
175
Incref(Thread * t)176 NFA::Thread* NFA::Incref(Thread* t) {
177 ABSL_DCHECK(t != NULL);
178 t->ref++;
179 return t;
180 }
181
Decref(Thread * t)182 void NFA::Decref(Thread* t) {
183 ABSL_DCHECK(t != NULL);
184 t->ref--;
185 if (t->ref > 0)
186 return;
187 ABSL_DCHECK_EQ(t->ref, 0);
188 t->next = freelist_;
189 freelist_ = t;
190 }
191
192 // Follows all empty arrows from id0 and enqueues all the states reached.
193 // Enqueues only the ByteRange instructions that match byte c.
194 // context is used (with p) for evaluating empty-width specials.
195 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,absl::string_view context,const char * p,Thread * t0)196 void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
197 const char* p, Thread* t0) {
198 if (id0 == 0)
199 return;
200
201 // Use stack_ to hold our stack of instructions yet to process.
202 // It was preallocated as follows:
203 // two entries per Capture;
204 // one entry per EmptyWidth; and
205 // one entry per Nop.
206 // This reflects the maximum number of stack pushes that each can
207 // perform. (Each instruction can be processed at most once.)
208 AddState* stk = stack_.data();
209 int nstk = 0;
210
211 stk[nstk++] = {id0, NULL};
212 while (nstk > 0) {
213 ABSL_DCHECK_LE(nstk, stack_.size());
214 AddState a = stk[--nstk];
215
216 Loop:
217 if (a.t != NULL) {
218 // t0 was a thread that we allocated and copied in order to
219 // record the capture, so we must now decref it.
220 Decref(t0);
221 t0 = a.t;
222 }
223
224 int id = a.id;
225 if (id == 0)
226 continue;
227 if (q->has_index(id)) {
228 if (ExtraDebug)
229 absl::FPrintF(stderr, " [%d%s]\n", id, FormatCapture(t0->capture));
230 continue;
231 }
232
233 // Create entry in q no matter what. We might fill it in below,
234 // or we might not. Even if not, it is necessary to have it,
235 // so that we don't revisit id0 during the recursion.
236 q->set_new(id, NULL);
237 Thread** tp = &q->get_existing(id);
238 int j;
239 Thread* t;
240 Prog::Inst* ip = prog_->inst(id);
241 switch (ip->opcode()) {
242 default:
243 ABSL_LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
244 break;
245
246 case kInstFail:
247 break;
248
249 case kInstAltMatch:
250 // Save state; will pick up at next byte.
251 t = Incref(t0);
252 *tp = t;
253
254 ABSL_DCHECK(!ip->last());
255 a = {id+1, NULL};
256 goto Loop;
257
258 case kInstNop:
259 if (!ip->last())
260 stk[nstk++] = {id+1, NULL};
261
262 // Continue on.
263 a = {ip->out(), NULL};
264 goto Loop;
265
266 case kInstCapture:
267 if (!ip->last())
268 stk[nstk++] = {id+1, NULL};
269
270 if ((j=ip->cap()) < ncapture_) {
271 // Push a dummy whose only job is to restore t0
272 // once we finish exploring this possibility.
273 stk[nstk++] = {0, t0};
274
275 // Record capture.
276 t = AllocThread();
277 CopyCapture(t->capture, t0->capture);
278 t->capture[j] = p;
279 t0 = t;
280 }
281 a = {ip->out(), NULL};
282 goto Loop;
283
284 case kInstByteRange:
285 if (!ip->Matches(c))
286 goto Next;
287
288 // Save state; will pick up at next byte.
289 t = Incref(t0);
290 *tp = t;
291 if (ExtraDebug)
292 absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture));
293
294 if (ip->hint() == 0)
295 break;
296 a = {id+ip->hint(), NULL};
297 goto Loop;
298
299 case kInstMatch:
300 // Save state; will pick up at next byte.
301 t = Incref(t0);
302 *tp = t;
303 if (ExtraDebug)
304 absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture));
305
306 Next:
307 if (ip->last())
308 break;
309 a = {id+1, NULL};
310 goto Loop;
311
312 case kInstEmptyWidth:
313 if (!ip->last())
314 stk[nstk++] = {id+1, NULL};
315
316 // Continue on if we have all the right flag bits.
317 if (ip->empty() & ~Prog::EmptyFlags(context, p))
318 break;
319 a = {ip->out(), NULL};
320 goto Loop;
321 }
322 }
323 }
324
325 // Run runq on byte c, appending new states to nextq.
326 // Updates matched_ and match_ as new, better matches are found.
327 // context is used (with p) for evaluating empty-width specials.
328 // p is the position of byte c in the input string for AddToThreadq;
329 // p-1 will be used when processing Match instructions.
330 // Frees all the threads on runq.
331 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,absl::string_view context,const char * p)332 int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
333 const char* p) {
334 nextq->clear();
335
336 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
337 Thread* t = i->value();
338 if (t == NULL)
339 continue;
340
341 if (longest_) {
342 // Can skip any threads started after our current best match.
343 if (matched_ && match_[0] < t->capture[0]) {
344 Decref(t);
345 continue;
346 }
347 }
348
349 int id = i->index();
350 Prog::Inst* ip = prog_->inst(id);
351
352 switch (ip->opcode()) {
353 default:
354 // Should only see the values handled below.
355 ABSL_LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
356 break;
357
358 case kInstByteRange:
359 AddToThreadq(nextq, ip->out(), c, context, p, t);
360 break;
361
362 case kInstAltMatch:
363 if (i != runq->begin())
364 break;
365 // The match is ours if we want it.
366 if (ip->greedy(prog_) || longest_) {
367 CopyCapture(match_, t->capture);
368 matched_ = true;
369
370 Decref(t);
371 for (++i; i != runq->end(); ++i) {
372 if (i->value() != NULL)
373 Decref(i->value());
374 }
375 runq->clear();
376 if (ip->greedy(prog_))
377 return ip->out1();
378 return ip->out();
379 }
380 break;
381
382 case kInstMatch: {
383 // Avoid invoking undefined behavior (arithmetic on a null pointer)
384 // by storing p instead of p-1. (What would the latter even mean?!)
385 // This complements the special case in NFA::Search().
386 if (p == NULL) {
387 CopyCapture(match_, t->capture);
388 match_[1] = p;
389 matched_ = true;
390 break;
391 }
392
393 if (endmatch_ && p-1 != etext_)
394 break;
395
396 if (longest_) {
397 // Leftmost-longest mode: save this match only if
398 // it is either farther to the left or at the same
399 // point but longer than an existing match.
400 if (!matched_ || t->capture[0] < match_[0] ||
401 (t->capture[0] == match_[0] && p-1 > match_[1])) {
402 CopyCapture(match_, t->capture);
403 match_[1] = p-1;
404 matched_ = true;
405 }
406 } else {
407 // Leftmost-biased mode: this match is by definition
408 // better than what we've already found (see next line).
409 CopyCapture(match_, t->capture);
410 match_[1] = p-1;
411 matched_ = true;
412
413 // Cut off the threads that can only find matches
414 // worse than the one we just found: don't run the
415 // rest of the current Threadq.
416 Decref(t);
417 for (++i; i != runq->end(); ++i) {
418 if (i->value() != NULL)
419 Decref(i->value());
420 }
421 runq->clear();
422 return 0;
423 }
424 break;
425 }
426 }
427 Decref(t);
428 }
429 runq->clear();
430 return 0;
431 }
432
FormatCapture(const char ** capture)433 std::string NFA::FormatCapture(const char** capture) {
434 std::string s;
435 for (int i = 0; i < ncapture_; i+=2) {
436 if (capture[i] == NULL)
437 s += "(?,?)";
438 else if (capture[i+1] == NULL)
439 s += absl::StrFormat("(%d,?)",
440 capture[i] - btext_);
441 else
442 s += absl::StrFormat("(%d,%d)",
443 capture[i] - btext_,
444 capture[i+1] - btext_);
445 }
446 return s;
447 }
448
Search(absl::string_view text,absl::string_view context,bool anchored,bool longest,absl::string_view * submatch,int nsubmatch)449 bool NFA::Search(absl::string_view text, absl::string_view context,
450 bool anchored, bool longest, absl::string_view* submatch,
451 int nsubmatch) {
452 if (start_ == 0)
453 return false;
454
455 if (context.data() == NULL)
456 context = text;
457
458 // Sanity check: make sure that text lies within context.
459 if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
460 ABSL_LOG(DFATAL) << "context does not contain text";
461 return false;
462 }
463
464 if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
465 return false;
466 if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
467 return false;
468 anchored |= prog_->anchor_start();
469 if (prog_->anchor_end()) {
470 longest = true;
471 endmatch_ = true;
472 }
473
474 if (nsubmatch < 0) {
475 ABSL_LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
476 return false;
477 }
478
479 // Save search parameters.
480 ncapture_ = 2*nsubmatch;
481 longest_ = longest;
482
483 if (nsubmatch == 0) {
484 // We need to maintain match[0], both to distinguish the
485 // longest match (if longest is true) and also to tell
486 // whether we've seen any matches at all.
487 ncapture_ = 2;
488 }
489
490 match_ = new const char*[ncapture_];
491 memset(match_, 0, ncapture_*sizeof match_[0]);
492 matched_ = false;
493
494 // For debugging prints.
495 btext_ = context.data();
496 // For convenience.
497 etext_ = text.data() + text.size();
498
499 if (ExtraDebug)
500 absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
501 text, context, anchored, longest);
502
503 // Set up search.
504 Threadq* runq = &q0_;
505 Threadq* nextq = &q1_;
506 runq->clear();
507 nextq->clear();
508
509 // Loop over the text, stepping the machine.
510 for (const char* p = text.data();; p++) {
511 if (ExtraDebug) {
512 int c = 0;
513 if (p == btext_)
514 c = '^';
515 else if (p > etext_)
516 c = '$';
517 else if (p < etext_)
518 c = p[0] & 0xFF;
519
520 absl::FPrintF(stderr, "%c:", c);
521 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
522 Thread* t = i->value();
523 if (t == NULL)
524 continue;
525 absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture));
526 }
527 absl::FPrintF(stderr, "\n");
528 }
529
530 // This is a no-op the first time around the loop because runq is empty.
531 int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
532 ABSL_DCHECK_EQ(runq->size(), 0);
533 using std::swap;
534 swap(nextq, runq);
535 nextq->clear();
536 if (id != 0) {
537 // We're done: full match ahead.
538 p = etext_;
539 for (;;) {
540 Prog::Inst* ip = prog_->inst(id);
541 switch (ip->opcode()) {
542 default:
543 ABSL_LOG(DFATAL) << "Unexpected opcode in short circuit: "
544 << ip->opcode();
545 break;
546
547 case kInstCapture:
548 if (ip->cap() < ncapture_)
549 match_[ip->cap()] = p;
550 id = ip->out();
551 continue;
552
553 case kInstNop:
554 id = ip->out();
555 continue;
556
557 case kInstMatch:
558 match_[1] = p;
559 matched_ = true;
560 break;
561 }
562 break;
563 }
564 break;
565 }
566
567 if (p > etext_)
568 break;
569
570 // Start a new thread if there have not been any matches.
571 // (No point in starting a new thread if there have been
572 // matches, since it would be to the right of the match
573 // we already found.)
574 if (!matched_ && (!anchored || p == text.data())) {
575 // Try to use prefix accel (e.g. memchr) to skip ahead.
576 // The search must be unanchored and there must be zero
577 // possible matches already.
578 if (!anchored && runq->size() == 0 &&
579 p < etext_ && prog_->can_prefix_accel()) {
580 p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
581 if (p == NULL)
582 p = etext_;
583 }
584
585 Thread* t = AllocThread();
586 CopyCapture(t->capture, match_);
587 t->capture[0] = p;
588 AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
589 t);
590 Decref(t);
591 }
592
593 // If all the threads have died, stop early.
594 if (runq->size() == 0) {
595 if (ExtraDebug)
596 absl::FPrintF(stderr, "dead\n");
597 break;
598 }
599
600 // Avoid invoking undefined behavior (arithmetic on a null pointer)
601 // by simply not continuing the loop.
602 // This complements the special case in NFA::Step().
603 if (p == NULL) {
604 (void) Step(runq, nextq, -1, context, p);
605 ABSL_DCHECK_EQ(runq->size(), 0);
606 using std::swap;
607 swap(nextq, runq);
608 nextq->clear();
609 break;
610 }
611 }
612
613 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
614 if (i->value() != NULL)
615 Decref(i->value());
616 }
617
618 if (matched_) {
619 for (int i = 0; i < nsubmatch; i++)
620 submatch[i] = absl::string_view(
621 match_[2 * i],
622 static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
623 if (ExtraDebug)
624 absl::FPrintF(stderr, "match (%d,%d)\n",
625 match_[0] - btext_,
626 match_[1] - btext_);
627 return true;
628 }
629 return false;
630 }
631
SearchNFA(absl::string_view text,absl::string_view context,Anchor anchor,MatchKind kind,absl::string_view * match,int nmatch)632 bool Prog::SearchNFA(absl::string_view text, absl::string_view context,
633 Anchor anchor, MatchKind kind, absl::string_view* match,
634 int nmatch) {
635 if (ExtraDebug)
636 Dump();
637
638 NFA nfa(this);
639 absl::string_view sp;
640 if (kind == kFullMatch) {
641 anchor = kAnchored;
642 if (nmatch == 0) {
643 match = &sp;
644 nmatch = 1;
645 }
646 }
647 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
648 return false;
649 if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
650 return false;
651 return true;
652 }
653
654 // For each instruction i in the program reachable from the start, compute the
655 // number of instructions reachable from i by following only empty transitions
656 // and record that count as fanout[i].
657 //
658 // fanout holds the results and is also the work queue for the outer iteration.
659 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)660 void Prog::Fanout(SparseArray<int>* fanout) {
661 ABSL_DCHECK_EQ(fanout->max_size(), size());
662 SparseSet reachable(size());
663 fanout->clear();
664 fanout->set_new(start(), 0);
665 for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
666 int* count = &i->value();
667 reachable.clear();
668 reachable.insert(i->index());
669 for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
670 int id = *j;
671 Prog::Inst* ip = inst(id);
672 switch (ip->opcode()) {
673 default:
674 ABSL_LOG(DFATAL) << "unhandled " << ip->opcode()
675 << " in Prog::Fanout()";
676 break;
677
678 case kInstByteRange:
679 if (!ip->last())
680 reachable.insert(id+1);
681
682 (*count)++;
683 if (!fanout->has_index(ip->out())) {
684 fanout->set_new(ip->out(), 0);
685 }
686 break;
687
688 case kInstAltMatch:
689 ABSL_DCHECK(!ip->last());
690 reachable.insert(id+1);
691 break;
692
693 case kInstCapture:
694 case kInstEmptyWidth:
695 case kInstNop:
696 if (!ip->last())
697 reachable.insert(id+1);
698
699 reachable.insert(ip->out());
700 break;
701
702 case kInstMatch:
703 if (!ip->last())
704 reachable.insert(id+1);
705 break;
706
707 case kInstFail:
708 break;
709 }
710 }
711 }
712 }
713
714 } // namespace re2
715