1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <algorithm>
30 #include <deque>
31 #include <string>
32 #include <utility>
33 #include <vector>
34
35 #include "util/logging.h"
36 #include "util/strutil.h"
37 #include "re2/pod_array.h"
38 #include "re2/prog.h"
39 #include "re2/regexp.h"
40 #include "re2/sparse_array.h"
41 #include "re2/sparse_set.h"
42
43 namespace re2 {
44
45 static const bool ExtraDebug = false;
46
47 class NFA {
48 public:
49 NFA(Prog* prog);
50 ~NFA();
51
52 // Searches for a matching string.
53 // * If anchored is true, only considers matches starting at offset.
54 // Otherwise finds lefmost match at or after offset.
55 // * If longest is true, returns the longest match starting
56 // at the chosen start point. Otherwise returns the so-called
57 // left-biased match, the one traditional backtracking engines
58 // (like Perl and PCRE) find.
59 // Records submatch boundaries in submatch[1..nsubmatch-1].
60 // Submatch[0] is the entire match. When there is a choice in
61 // which text matches each subexpression, the submatch boundaries
62 // are chosen to match what a backtracking implementation would choose.
63 bool Search(const StringPiece& text, const StringPiece& context,
64 bool anchored, bool longest,
65 StringPiece* submatch, int nsubmatch);
66
67 private:
68 struct Thread {
69 union {
70 int ref;
71 Thread* next; // when on free list
72 };
73 const char** capture;
74 };
75
76 // State for explicit stack in AddToThreadq.
77 struct AddState {
78 int id; // Inst to process
79 Thread* t; // if not null, set t0 = t before processing id
80 };
81
82 // Threadq is a list of threads. The list is sorted by the order
83 // in which Perl would explore that particular state -- the earlier
84 // choices appear earlier in the list.
85 typedef SparseArray<Thread*> Threadq;
86
87 inline Thread* AllocThread();
88 inline Thread* Incref(Thread* t);
89 inline void Decref(Thread* t);
90
91 // Follows all empty arrows from id0 and enqueues all the states reached.
92 // Enqueues only the ByteRange instructions that match byte c.
93 // context is used (with p) for evaluating empty-width specials.
94 // p is the current input position, and t0 is the current thread.
95 void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
96 const char* p, Thread* t0);
97
98 // Run runq on byte c, appending new states to nextq.
99 // Updates matched_ and match_ as new, better matches are found.
100 // context is used (with p) for evaluating empty-width specials.
101 // p is the position of byte c in the input string for AddToThreadq;
102 // p-1 will be used when processing Match instructions.
103 // Frees all the threads on runq.
104 // If there is a shortcut to the end, returns that shortcut.
105 int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
106 const char* p);
107
108 // Returns text version of capture information, for debugging.
109 std::string FormatCapture(const char** capture);
110
CopyCapture(const char ** dst,const char ** src)111 void CopyCapture(const char** dst, const char** src) {
112 memmove(dst, src, ncapture_*sizeof src[0]);
113 }
114
115 Prog* prog_; // underlying program
116 int start_; // start instruction in program
117 int ncapture_; // number of submatches to track
118 bool longest_; // whether searching for longest match
119 bool endmatch_; // whether match must end at text.end()
120 const char* btext_; // beginning of text (for FormatSubmatch)
121 const char* etext_; // end of text (for endmatch_)
122 Threadq q0_, q1_; // pre-allocated for Search.
123 PODArray<AddState> stack_; // pre-allocated for AddToThreadq
124 std::deque<Thread> arena_; // thread arena
125 Thread* freelist_; // thread freelist
126 const char** match_; // best match so far
127 bool matched_; // any match so far?
128
129 NFA(const NFA&) = delete;
130 NFA& operator=(const NFA&) = delete;
131 };
132
NFA(Prog * prog)133 NFA::NFA(Prog* prog) {
134 prog_ = prog;
135 start_ = prog_->start();
136 ncapture_ = 0;
137 longest_ = false;
138 endmatch_ = false;
139 btext_ = NULL;
140 etext_ = NULL;
141 q0_.resize(prog_->size());
142 q1_.resize(prog_->size());
143 // See NFA::AddToThreadq() for why this is so.
144 int nstack = 2*prog_->inst_count(kInstCapture) +
145 prog_->inst_count(kInstEmptyWidth) +
146 prog_->inst_count(kInstNop) + 1; // + 1 for start inst
147 stack_ = PODArray<AddState>(nstack);
148 freelist_ = NULL;
149 match_ = NULL;
150 matched_ = false;
151 }
152
~NFA()153 NFA::~NFA() {
154 delete[] match_;
155 for (const Thread& t : arena_)
156 delete[] t.capture;
157 }
158
AllocThread()159 NFA::Thread* NFA::AllocThread() {
160 Thread* t = freelist_;
161 if (t != NULL) {
162 freelist_ = t->next;
163 t->ref = 1;
164 // We don't need to touch t->capture because
165 // the caller will immediately overwrite it.
166 return t;
167 }
168 arena_.emplace_back();
169 t = &arena_.back();
170 t->ref = 1;
171 t->capture = new const char*[ncapture_];
172 return t;
173 }
174
Incref(Thread * t)175 NFA::Thread* NFA::Incref(Thread* t) {
176 DCHECK(t != NULL);
177 t->ref++;
178 return t;
179 }
180
Decref(Thread * t)181 void NFA::Decref(Thread* t) {
182 DCHECK(t != NULL);
183 t->ref--;
184 if (t->ref > 0)
185 return;
186 DCHECK_EQ(t->ref, 0);
187 t->next = freelist_;
188 freelist_ = t;
189 }
190
191 // Follows all empty arrows from id0 and enqueues all the states reached.
192 // Enqueues only the ByteRange instructions that match byte c.
193 // context is used (with p) for evaluating empty-width specials.
194 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,const StringPiece & context,const char * p,Thread * t0)195 void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
196 const char* p, Thread* t0) {
197 if (id0 == 0)
198 return;
199
200 // Use stack_ to hold our stack of instructions yet to process.
201 // It was preallocated as follows:
202 // two entries per Capture;
203 // one entry per EmptyWidth; and
204 // one entry per Nop.
205 // This reflects the maximum number of stack pushes that each can
206 // perform. (Each instruction can be processed at most once.)
207 AddState* stk = stack_.data();
208 int nstk = 0;
209
210 stk[nstk++] = {id0, NULL};
211 while (nstk > 0) {
212 DCHECK_LE(nstk, stack_.size());
213 AddState a = stk[--nstk];
214
215 Loop:
216 if (a.t != NULL) {
217 // t0 was a thread that we allocated and copied in order to
218 // record the capture, so we must now decref it.
219 Decref(t0);
220 t0 = a.t;
221 }
222
223 int id = a.id;
224 if (id == 0)
225 continue;
226 if (q->has_index(id)) {
227 if (ExtraDebug)
228 fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
229 continue;
230 }
231
232 // Create entry in q no matter what. We might fill it in below,
233 // or we might not. Even if not, it is necessary to have it,
234 // so that we don't revisit id0 during the recursion.
235 q->set_new(id, NULL);
236 Thread** tp = &q->get_existing(id);
237 int j;
238 Thread* t;
239 Prog::Inst* ip = prog_->inst(id);
240 switch (ip->opcode()) {
241 default:
242 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
243 break;
244
245 case kInstFail:
246 break;
247
248 case kInstAltMatch:
249 // Save state; will pick up at next byte.
250 t = Incref(t0);
251 *tp = t;
252
253 DCHECK(!ip->last());
254 a = {id+1, NULL};
255 goto Loop;
256
257 case kInstNop:
258 if (!ip->last())
259 stk[nstk++] = {id+1, NULL};
260
261 // Continue on.
262 a = {ip->out(), NULL};
263 goto Loop;
264
265 case kInstCapture:
266 if (!ip->last())
267 stk[nstk++] = {id+1, NULL};
268
269 if ((j=ip->cap()) < ncapture_) {
270 // Push a dummy whose only job is to restore t0
271 // once we finish exploring this possibility.
272 stk[nstk++] = {0, t0};
273
274 // Record capture.
275 t = AllocThread();
276 CopyCapture(t->capture, t0->capture);
277 t->capture[j] = p;
278 t0 = t;
279 }
280 a = {ip->out(), NULL};
281 goto Loop;
282
283 case kInstByteRange:
284 if (!ip->Matches(c))
285 goto Next;
286
287 // Save state; will pick up at next byte.
288 t = Incref(t0);
289 *tp = t;
290 if (ExtraDebug)
291 fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
292
293 if (ip->hint() == 0)
294 break;
295 a = {id+ip->hint(), NULL};
296 goto Loop;
297
298 case kInstMatch:
299 // Save state; will pick up at next byte.
300 t = Incref(t0);
301 *tp = t;
302 if (ExtraDebug)
303 fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str());
304
305 Next:
306 if (ip->last())
307 break;
308 a = {id+1, NULL};
309 goto Loop;
310
311 case kInstEmptyWidth:
312 if (!ip->last())
313 stk[nstk++] = {id+1, NULL};
314
315 // Continue on if we have all the right flag bits.
316 if (ip->empty() & ~Prog::EmptyFlags(context, p))
317 break;
318 a = {ip->out(), NULL};
319 goto Loop;
320 }
321 }
322 }
323
324 // Run runq on byte c, appending new states to nextq.
325 // Updates matched_ and match_ as new, better matches are found.
326 // context is used (with p) for evaluating empty-width specials.
327 // p is the position of byte c in the input string for AddToThreadq;
328 // p-1 will be used when processing Match instructions.
329 // Frees all the threads on runq.
330 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,const StringPiece & context,const char * p)331 int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
332 const char* p) {
333 nextq->clear();
334
335 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
336 Thread* t = i->value();
337 if (t == NULL)
338 continue;
339
340 if (longest_) {
341 // Can skip any threads started after our current best match.
342 if (matched_ && match_[0] < t->capture[0]) {
343 Decref(t);
344 continue;
345 }
346 }
347
348 int id = i->index();
349 Prog::Inst* ip = prog_->inst(id);
350
351 switch (ip->opcode()) {
352 default:
353 // Should only see the values handled below.
354 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
355 break;
356
357 case kInstByteRange:
358 AddToThreadq(nextq, ip->out(), c, context, p, t);
359 break;
360
361 case kInstAltMatch:
362 if (i != runq->begin())
363 break;
364 // The match is ours if we want it.
365 if (ip->greedy(prog_) || longest_) {
366 CopyCapture(match_, t->capture);
367 matched_ = true;
368
369 Decref(t);
370 for (++i; i != runq->end(); ++i) {
371 if (i->value() != NULL)
372 Decref(i->value());
373 }
374 runq->clear();
375 if (ip->greedy(prog_))
376 return ip->out1();
377 return ip->out();
378 }
379 break;
380
381 case kInstMatch: {
382 // Avoid invoking undefined behavior (arithmetic on a null pointer)
383 // by storing p instead of p-1. (What would the latter even mean?!)
384 // This complements the special case in NFA::Search().
385 if (p == NULL) {
386 CopyCapture(match_, t->capture);
387 match_[1] = p;
388 matched_ = true;
389 break;
390 }
391
392 if (endmatch_ && p-1 != etext_)
393 break;
394
395 if (longest_) {
396 // Leftmost-longest mode: save this match only if
397 // it is either farther to the left or at the same
398 // point but longer than an existing match.
399 if (!matched_ || t->capture[0] < match_[0] ||
400 (t->capture[0] == match_[0] && p-1 > match_[1])) {
401 CopyCapture(match_, t->capture);
402 match_[1] = p-1;
403 matched_ = true;
404 }
405 } else {
406 // Leftmost-biased mode: this match is by definition
407 // better than what we've already found (see next line).
408 CopyCapture(match_, t->capture);
409 match_[1] = p-1;
410 matched_ = true;
411
412 // Cut off the threads that can only find matches
413 // worse than the one we just found: don't run the
414 // rest of the current Threadq.
415 Decref(t);
416 for (++i; i != runq->end(); ++i) {
417 if (i->value() != NULL)
418 Decref(i->value());
419 }
420 runq->clear();
421 return 0;
422 }
423 break;
424 }
425 }
426 Decref(t);
427 }
428 runq->clear();
429 return 0;
430 }
431
FormatCapture(const char ** capture)432 std::string NFA::FormatCapture(const char** capture) {
433 std::string s;
434 for (int i = 0; i < ncapture_; i+=2) {
435 if (capture[i] == NULL)
436 s += "(?,?)";
437 else if (capture[i+1] == NULL)
438 s += StringPrintf("(%td,?)",
439 capture[i] - btext_);
440 else
441 s += StringPrintf("(%td,%td)",
442 capture[i] - btext_,
443 capture[i+1] - btext_);
444 }
445 return s;
446 }
447
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)448 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
449 bool anchored, bool longest,
450 StringPiece* submatch, int nsubmatch) {
451 if (start_ == 0)
452 return false;
453
454 StringPiece context = const_context;
455 if (context.data() == NULL)
456 context = text;
457
458 // Sanity check: make sure that text lies within context.
459 if (text.begin() < context.begin() || text.end() > context.end()) {
460 LOG(DFATAL) << "context does not contain text";
461 return false;
462 }
463
464 if (prog_->anchor_start() && context.begin() != text.begin())
465 return false;
466 if (prog_->anchor_end() && context.end() != text.end())
467 return false;
468 anchored |= prog_->anchor_start();
469 if (prog_->anchor_end()) {
470 longest = true;
471 endmatch_ = true;
472 }
473
474 if (nsubmatch < 0) {
475 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
476 return false;
477 }
478
479 // Save search parameters.
480 ncapture_ = 2*nsubmatch;
481 longest_ = longest;
482
483 if (nsubmatch == 0) {
484 // We need to maintain match[0], both to distinguish the
485 // longest match (if longest is true) and also to tell
486 // whether we've seen any matches at all.
487 ncapture_ = 2;
488 }
489
490 match_ = new const char*[ncapture_];
491 memset(match_, 0, ncapture_*sizeof match_[0]);
492 matched_ = false;
493
494 // For debugging prints.
495 btext_ = context.data();
496 // For convenience.
497 etext_ = text.data() + text.size();
498
499 if (ExtraDebug)
500 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
501 std::string(text).c_str(), std::string(context).c_str(), anchored, longest);
502
503 // Set up search.
504 Threadq* runq = &q0_;
505 Threadq* nextq = &q1_;
506 runq->clear();
507 nextq->clear();
508
509 // Loop over the text, stepping the machine.
510 for (const char* p = text.data();; p++) {
511 if (ExtraDebug) {
512 int c = 0;
513 if (p == btext_)
514 c = '^';
515 else if (p > etext_)
516 c = '$';
517 else if (p < etext_)
518 c = p[0] & 0xFF;
519
520 fprintf(stderr, "%c:", c);
521 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
522 Thread* t = i->value();
523 if (t == NULL)
524 continue;
525 fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
526 }
527 fprintf(stderr, "\n");
528 }
529
530 // This is a no-op the first time around the loop because runq is empty.
531 int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
532 DCHECK_EQ(runq->size(), 0);
533 using std::swap;
534 swap(nextq, runq);
535 nextq->clear();
536 if (id != 0) {
537 // We're done: full match ahead.
538 p = etext_;
539 for (;;) {
540 Prog::Inst* ip = prog_->inst(id);
541 switch (ip->opcode()) {
542 default:
543 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
544 break;
545
546 case kInstCapture:
547 if (ip->cap() < ncapture_)
548 match_[ip->cap()] = p;
549 id = ip->out();
550 continue;
551
552 case kInstNop:
553 id = ip->out();
554 continue;
555
556 case kInstMatch:
557 match_[1] = p;
558 matched_ = true;
559 break;
560 }
561 break;
562 }
563 break;
564 }
565
566 if (p > etext_)
567 break;
568
569 // Start a new thread if there have not been any matches.
570 // (No point in starting a new thread if there have been
571 // matches, since it would be to the right of the match
572 // we already found.)
573 if (!matched_ && (!anchored || p == text.data())) {
574 // Try to use prefix accel (e.g. memchr) to skip ahead.
575 // The search must be unanchored and there must be zero
576 // possible matches already.
577 if (!anchored && runq->size() == 0 &&
578 p < etext_ && prog_->can_prefix_accel()) {
579 p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
580 if (p == NULL)
581 p = etext_;
582 }
583
584 Thread* t = AllocThread();
585 CopyCapture(t->capture, match_);
586 t->capture[0] = p;
587 AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
588 t);
589 Decref(t);
590 }
591
592 // If all the threads have died, stop early.
593 if (runq->size() == 0) {
594 if (ExtraDebug)
595 fprintf(stderr, "dead\n");
596 break;
597 }
598
599 // Avoid invoking undefined behavior (arithmetic on a null pointer)
600 // by simply not continuing the loop.
601 // This complements the special case in NFA::Step().
602 if (p == NULL) {
603 (void)Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
604 DCHECK_EQ(runq->size(), 0);
605 using std::swap;
606 swap(nextq, runq);
607 nextq->clear();
608 break;
609 }
610 }
611
612 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
613 if (i->value() != NULL)
614 Decref(i->value());
615 }
616
617 if (matched_) {
618 for (int i = 0; i < nsubmatch; i++)
619 submatch[i] =
620 StringPiece(match_[2 * i],
621 static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
622 if (ExtraDebug)
623 fprintf(stderr, "match (%td,%td)\n",
624 match_[0] - btext_,
625 match_[1] - btext_);
626 return true;
627 }
628 return false;
629 }
630
631 bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)632 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
633 Anchor anchor, MatchKind kind,
634 StringPiece* match, int nmatch) {
635 if (ExtraDebug)
636 Dump();
637
638 NFA nfa(this);
639 StringPiece sp;
640 if (kind == kFullMatch) {
641 anchor = kAnchored;
642 if (nmatch == 0) {
643 match = &sp;
644 nmatch = 1;
645 }
646 }
647 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
648 return false;
649 if (kind == kFullMatch && match[0].end() != text.end())
650 return false;
651 return true;
652 }
653
654 // For each instruction i in the program reachable from the start, compute the
655 // number of instructions reachable from i by following only empty transitions
656 // and record that count as fanout[i].
657 //
658 // fanout holds the results and is also the work queue for the outer iteration.
659 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)660 void Prog::Fanout(SparseArray<int>* fanout) {
661 DCHECK_EQ(fanout->max_size(), size());
662 SparseSet reachable(size());
663 fanout->clear();
664 fanout->set_new(start(), 0);
665 for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
666 int* count = &i->value();
667 reachable.clear();
668 reachable.insert(i->index());
669 for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
670 int id = *j;
671 Prog::Inst* ip = inst(id);
672 switch (ip->opcode()) {
673 default:
674 LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
675 break;
676
677 case kInstByteRange:
678 if (!ip->last())
679 reachable.insert(id+1);
680
681 (*count)++;
682 if (!fanout->has_index(ip->out())) {
683 fanout->set_new(ip->out(), 0);
684 }
685 break;
686
687 case kInstAltMatch:
688 DCHECK(!ip->last());
689 reachable.insert(id+1);
690 break;
691
692 case kInstCapture:
693 case kInstEmptyWidth:
694 case kInstNop:
695 if (!ip->last())
696 reachable.insert(id+1);
697
698 reachable.insert(ip->out());
699 break;
700
701 case kInstMatch:
702 if (!ip->last())
703 reachable.insert(id+1);
704 break;
705
706 case kInstFail:
707 break;
708 }
709 }
710 }
711 }
712
713 } // namespace re2
714