• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <algorithm>
30 #include <deque>
31 #include <string>
32 #include <utility>
33 #include <vector>
34 
35 #include "util/logging.h"
36 #include "util/strutil.h"
37 #include "re2/pod_array.h"
38 #include "re2/prog.h"
39 #include "re2/regexp.h"
40 #include "re2/sparse_array.h"
41 #include "re2/sparse_set.h"
42 
43 namespace re2 {
44 
45 static const bool ExtraDebug = false;
46 
47 class NFA {
48  public:
49   NFA(Prog* prog);
50   ~NFA();
51 
52   // Searches for a matching string.
53   //   * If anchored is true, only considers matches starting at offset.
54   //     Otherwise finds lefmost match at or after offset.
55   //   * If longest is true, returns the longest match starting
56   //     at the chosen start point.  Otherwise returns the so-called
57   //     left-biased match, the one traditional backtracking engines
58   //     (like Perl and PCRE) find.
59   // Records submatch boundaries in submatch[1..nsubmatch-1].
60   // Submatch[0] is the entire match.  When there is a choice in
61   // which text matches each subexpression, the submatch boundaries
62   // are chosen to match what a backtracking implementation would choose.
63   bool Search(const StringPiece& text, const StringPiece& context,
64               bool anchored, bool longest,
65               StringPiece* submatch, int nsubmatch);
66 
67  private:
68   struct Thread {
69     union {
70       int ref;
71       Thread* next;  // when on free list
72     };
73     const char** capture;
74   };
75 
76   // State for explicit stack in AddToThreadq.
77   struct AddState {
78     int id;     // Inst to process
79     Thread* t;  // if not null, set t0 = t before processing id
80   };
81 
82   // Threadq is a list of threads.  The list is sorted by the order
83   // in which Perl would explore that particular state -- the earlier
84   // choices appear earlier in the list.
85   typedef SparseArray<Thread*> Threadq;
86 
87   inline Thread* AllocThread();
88   inline Thread* Incref(Thread* t);
89   inline void Decref(Thread* t);
90 
91   // Follows all empty arrows from id0 and enqueues all the states reached.
92   // Enqueues only the ByteRange instructions that match byte c.
93   // context is used (with p) for evaluating empty-width specials.
94   // p is the current input position, and t0 is the current thread.
95   void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
96                     const char* p, Thread* t0);
97 
98   // Run runq on byte c, appending new states to nextq.
99   // Updates matched_ and match_ as new, better matches are found.
100   // context is used (with p) for evaluating empty-width specials.
101   // p is the position of byte c in the input string for AddToThreadq;
102   // p-1 will be used when processing Match instructions.
103   // Frees all the threads on runq.
104   // If there is a shortcut to the end, returns that shortcut.
105   int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
106            const char* p);
107 
108   // Returns text version of capture information, for debugging.
109   std::string FormatCapture(const char** capture);
110 
CopyCapture(const char ** dst,const char ** src)111   void CopyCapture(const char** dst, const char** src) {
112     memmove(dst, src, ncapture_*sizeof src[0]);
113   }
114 
115   Prog* prog_;                // underlying program
116   int start_;                 // start instruction in program
117   int ncapture_;              // number of submatches to track
118   bool longest_;              // whether searching for longest match
119   bool endmatch_;             // whether match must end at text.end()
120   const char* btext_;         // beginning of text (for FormatSubmatch)
121   const char* etext_;         // end of text (for endmatch_)
122   Threadq q0_, q1_;           // pre-allocated for Search.
123   PODArray<AddState> stack_;  // pre-allocated for AddToThreadq
124   std::deque<Thread> arena_;  // thread arena
125   Thread* freelist_;          // thread freelist
126   const char** match_;        // best match so far
127   bool matched_;              // any match so far?
128 
129   NFA(const NFA&) = delete;
130   NFA& operator=(const NFA&) = delete;
131 };
132 
NFA(Prog * prog)133 NFA::NFA(Prog* prog) {
134   prog_ = prog;
135   start_ = prog_->start();
136   ncapture_ = 0;
137   longest_ = false;
138   endmatch_ = false;
139   btext_ = NULL;
140   etext_ = NULL;
141   q0_.resize(prog_->size());
142   q1_.resize(prog_->size());
143   // See NFA::AddToThreadq() for why this is so.
144   int nstack = 2*prog_->inst_count(kInstCapture) +
145                prog_->inst_count(kInstEmptyWidth) +
146                prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
147   stack_ = PODArray<AddState>(nstack);
148   freelist_ = NULL;
149   match_ = NULL;
150   matched_ = false;
151 }
152 
~NFA()153 NFA::~NFA() {
154   delete[] match_;
155   for (const Thread& t : arena_)
156     delete[] t.capture;
157 }
158 
AllocThread()159 NFA::Thread* NFA::AllocThread() {
160   Thread* t = freelist_;
161   if (t != NULL) {
162     freelist_ = t->next;
163     t->ref = 1;
164     // We don't need to touch t->capture because
165     // the caller will immediately overwrite it.
166     return t;
167   }
168   arena_.emplace_back();
169   t = &arena_.back();
170   t->ref = 1;
171   t->capture = new const char*[ncapture_];
172   return t;
173 }
174 
Incref(Thread * t)175 NFA::Thread* NFA::Incref(Thread* t) {
176   DCHECK(t != NULL);
177   t->ref++;
178   return t;
179 }
180 
Decref(Thread * t)181 void NFA::Decref(Thread* t) {
182   DCHECK(t != NULL);
183   t->ref--;
184   if (t->ref > 0)
185     return;
186   DCHECK_EQ(t->ref, 0);
187   t->next = freelist_;
188   freelist_ = t;
189 }
190 
191 // Follows all empty arrows from id0 and enqueues all the states reached.
192 // Enqueues only the ByteRange instructions that match byte c.
193 // context is used (with p) for evaluating empty-width specials.
194 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,const StringPiece & context,const char * p,Thread * t0)195 void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context,
196                        const char* p, Thread* t0) {
197   if (id0 == 0)
198     return;
199 
200   // Use stack_ to hold our stack of instructions yet to process.
201   // It was preallocated as follows:
202   //   two entries per Capture;
203   //   one entry per EmptyWidth; and
204   //   one entry per Nop.
205   // This reflects the maximum number of stack pushes that each can
206   // perform. (Each instruction can be processed at most once.)
207   AddState* stk = stack_.data();
208   int nstk = 0;
209 
210   stk[nstk++] = {id0, NULL};
211   while (nstk > 0) {
212     DCHECK_LE(nstk, stack_.size());
213     AddState a = stk[--nstk];
214 
215   Loop:
216     if (a.t != NULL) {
217       // t0 was a thread that we allocated and copied in order to
218       // record the capture, so we must now decref it.
219       Decref(t0);
220       t0 = a.t;
221     }
222 
223     int id = a.id;
224     if (id == 0)
225       continue;
226     if (q->has_index(id)) {
227       if (ExtraDebug)
228         fprintf(stderr, "  [%d%s]\n", id, FormatCapture(t0->capture).c_str());
229       continue;
230     }
231 
232     // Create entry in q no matter what.  We might fill it in below,
233     // or we might not.  Even if not, it is necessary to have it,
234     // so that we don't revisit id0 during the recursion.
235     q->set_new(id, NULL);
236     Thread** tp = &q->get_existing(id);
237     int j;
238     Thread* t;
239     Prog::Inst* ip = prog_->inst(id);
240     switch (ip->opcode()) {
241     default:
242       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
243       break;
244 
245     case kInstFail:
246       break;
247 
248     case kInstAltMatch:
249       // Save state; will pick up at next byte.
250       t = Incref(t0);
251       *tp = t;
252 
253       DCHECK(!ip->last());
254       a = {id+1, NULL};
255       goto Loop;
256 
257     case kInstNop:
258       if (!ip->last())
259         stk[nstk++] = {id+1, NULL};
260 
261       // Continue on.
262       a = {ip->out(), NULL};
263       goto Loop;
264 
265     case kInstCapture:
266       if (!ip->last())
267         stk[nstk++] = {id+1, NULL};
268 
269       if ((j=ip->cap()) < ncapture_) {
270         // Push a dummy whose only job is to restore t0
271         // once we finish exploring this possibility.
272         stk[nstk++] = {0, t0};
273 
274         // Record capture.
275         t = AllocThread();
276         CopyCapture(t->capture, t0->capture);
277         t->capture[j] = p;
278         t0 = t;
279       }
280       a = {ip->out(), NULL};
281       goto Loop;
282 
283     case kInstByteRange:
284       if (!ip->Matches(c))
285         goto Next;
286 
287       // Save state; will pick up at next byte.
288       t = Incref(t0);
289       *tp = t;
290       if (ExtraDebug)
291         fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
292 
293       if (ip->hint() == 0)
294         break;
295       a = {id+ip->hint(), NULL};
296       goto Loop;
297 
298     case kInstMatch:
299       // Save state; will pick up at next byte.
300       t = Incref(t0);
301       *tp = t;
302       if (ExtraDebug)
303         fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str());
304 
305     Next:
306       if (ip->last())
307         break;
308       a = {id+1, NULL};
309       goto Loop;
310 
311     case kInstEmptyWidth:
312       if (!ip->last())
313         stk[nstk++] = {id+1, NULL};
314 
315       // Continue on if we have all the right flag bits.
316       if (ip->empty() & ~Prog::EmptyFlags(context, p))
317         break;
318       a = {ip->out(), NULL};
319       goto Loop;
320     }
321   }
322 }
323 
324 // Run runq on byte c, appending new states to nextq.
325 // Updates matched_ and match_ as new, better matches are found.
326 // context is used (with p) for evaluating empty-width specials.
327 // p is the position of byte c in the input string for AddToThreadq;
328 // p-1 will be used when processing Match instructions.
329 // Frees all the threads on runq.
330 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,const StringPiece & context,const char * p)331 int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context,
332               const char* p) {
333   nextq->clear();
334 
335   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
336     Thread* t = i->value();
337     if (t == NULL)
338       continue;
339 
340     if (longest_) {
341       // Can skip any threads started after our current best match.
342       if (matched_ && match_[0] < t->capture[0]) {
343         Decref(t);
344         continue;
345       }
346     }
347 
348     int id = i->index();
349     Prog::Inst* ip = prog_->inst(id);
350 
351     switch (ip->opcode()) {
352       default:
353         // Should only see the values handled below.
354         LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
355         break;
356 
357       case kInstByteRange:
358         AddToThreadq(nextq, ip->out(), c, context, p, t);
359         break;
360 
361       case kInstAltMatch:
362         if (i != runq->begin())
363           break;
364         // The match is ours if we want it.
365         if (ip->greedy(prog_) || longest_) {
366           CopyCapture(match_, t->capture);
367           matched_ = true;
368 
369           Decref(t);
370           for (++i; i != runq->end(); ++i) {
371             if (i->value() != NULL)
372               Decref(i->value());
373           }
374           runq->clear();
375           if (ip->greedy(prog_))
376             return ip->out1();
377           return ip->out();
378         }
379         break;
380 
381       case kInstMatch: {
382         // Avoid invoking undefined behavior (arithmetic on a null pointer)
383         // by storing p instead of p-1. (What would the latter even mean?!)
384         // This complements the special case in NFA::Search().
385         if (p == NULL) {
386           CopyCapture(match_, t->capture);
387           match_[1] = p;
388           matched_ = true;
389           break;
390         }
391 
392         if (endmatch_ && p-1 != etext_)
393           break;
394 
395         if (longest_) {
396           // Leftmost-longest mode: save this match only if
397           // it is either farther to the left or at the same
398           // point but longer than an existing match.
399           if (!matched_ || t->capture[0] < match_[0] ||
400               (t->capture[0] == match_[0] && p-1 > match_[1])) {
401             CopyCapture(match_, t->capture);
402             match_[1] = p-1;
403             matched_ = true;
404           }
405         } else {
406           // Leftmost-biased mode: this match is by definition
407           // better than what we've already found (see next line).
408           CopyCapture(match_, t->capture);
409           match_[1] = p-1;
410           matched_ = true;
411 
412           // Cut off the threads that can only find matches
413           // worse than the one we just found: don't run the
414           // rest of the current Threadq.
415           Decref(t);
416           for (++i; i != runq->end(); ++i) {
417             if (i->value() != NULL)
418               Decref(i->value());
419           }
420           runq->clear();
421           return 0;
422         }
423         break;
424       }
425     }
426     Decref(t);
427   }
428   runq->clear();
429   return 0;
430 }
431 
FormatCapture(const char ** capture)432 std::string NFA::FormatCapture(const char** capture) {
433   std::string s;
434   for (int i = 0; i < ncapture_; i+=2) {
435     if (capture[i] == NULL)
436       s += "(?,?)";
437     else if (capture[i+1] == NULL)
438       s += StringPrintf("(%td,?)",
439                         capture[i] - btext_);
440     else
441       s += StringPrintf("(%td,%td)",
442                         capture[i] - btext_,
443                         capture[i+1] - btext_);
444   }
445   return s;
446 }
447 
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)448 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
449             bool anchored, bool longest,
450             StringPiece* submatch, int nsubmatch) {
451   if (start_ == 0)
452     return false;
453 
454   StringPiece context = const_context;
455   if (context.data() == NULL)
456     context = text;
457 
458   // Sanity check: make sure that text lies within context.
459   if (text.begin() < context.begin() || text.end() > context.end()) {
460     LOG(DFATAL) << "context does not contain text";
461     return false;
462   }
463 
464   if (prog_->anchor_start() && context.begin() != text.begin())
465     return false;
466   if (prog_->anchor_end() && context.end() != text.end())
467     return false;
468   anchored |= prog_->anchor_start();
469   if (prog_->anchor_end()) {
470     longest = true;
471     endmatch_ = true;
472   }
473 
474   if (nsubmatch < 0) {
475     LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
476     return false;
477   }
478 
479   // Save search parameters.
480   ncapture_ = 2*nsubmatch;
481   longest_ = longest;
482 
483   if (nsubmatch == 0) {
484     // We need to maintain match[0], both to distinguish the
485     // longest match (if longest is true) and also to tell
486     // whether we've seen any matches at all.
487     ncapture_ = 2;
488   }
489 
490   match_ = new const char*[ncapture_];
491   memset(match_, 0, ncapture_*sizeof match_[0]);
492   matched_ = false;
493 
494   // For debugging prints.
495   btext_ = context.data();
496   // For convenience.
497   etext_ = text.data() + text.size();
498 
499   if (ExtraDebug)
500     fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
501             std::string(text).c_str(), std::string(context).c_str(), anchored, longest);
502 
503   // Set up search.
504   Threadq* runq = &q0_;
505   Threadq* nextq = &q1_;
506   runq->clear();
507   nextq->clear();
508 
509   // Loop over the text, stepping the machine.
510   for (const char* p = text.data();; p++) {
511     if (ExtraDebug) {
512       int c = 0;
513       if (p == btext_)
514         c = '^';
515       else if (p > etext_)
516         c = '$';
517       else if (p < etext_)
518         c = p[0] & 0xFF;
519 
520       fprintf(stderr, "%c:", c);
521       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
522         Thread* t = i->value();
523         if (t == NULL)
524           continue;
525         fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
526       }
527       fprintf(stderr, "\n");
528     }
529 
530     // This is a no-op the first time around the loop because runq is empty.
531     int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
532     DCHECK_EQ(runq->size(), 0);
533     using std::swap;
534     swap(nextq, runq);
535     nextq->clear();
536     if (id != 0) {
537       // We're done: full match ahead.
538       p = etext_;
539       for (;;) {
540         Prog::Inst* ip = prog_->inst(id);
541         switch (ip->opcode()) {
542           default:
543             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
544             break;
545 
546           case kInstCapture:
547             if (ip->cap() < ncapture_)
548               match_[ip->cap()] = p;
549             id = ip->out();
550             continue;
551 
552           case kInstNop:
553             id = ip->out();
554             continue;
555 
556           case kInstMatch:
557             match_[1] = p;
558             matched_ = true;
559             break;
560         }
561         break;
562       }
563       break;
564     }
565 
566     if (p > etext_)
567       break;
568 
569     // Start a new thread if there have not been any matches.
570     // (No point in starting a new thread if there have been
571     // matches, since it would be to the right of the match
572     // we already found.)
573     if (!matched_ && (!anchored || p == text.data())) {
574       // Try to use prefix accel (e.g. memchr) to skip ahead.
575       // The search must be unanchored and there must be zero
576       // possible matches already.
577       if (!anchored && runq->size() == 0 &&
578           p < etext_ && prog_->can_prefix_accel()) {
579         p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
580         if (p == NULL)
581           p = etext_;
582       }
583 
584       Thread* t = AllocThread();
585       CopyCapture(t->capture, match_);
586       t->capture[0] = p;
587       AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
588                    t);
589       Decref(t);
590     }
591 
592     // If all the threads have died, stop early.
593     if (runq->size() == 0) {
594       if (ExtraDebug)
595         fprintf(stderr, "dead\n");
596       break;
597     }
598 
599     // Avoid invoking undefined behavior (arithmetic on a null pointer)
600     // by simply not continuing the loop.
601     // This complements the special case in NFA::Step().
602     if (p == NULL) {
603       (void)Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
604       DCHECK_EQ(runq->size(), 0);
605       using std::swap;
606       swap(nextq, runq);
607       nextq->clear();
608       break;
609     }
610   }
611 
612   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
613     if (i->value() != NULL)
614       Decref(i->value());
615   }
616 
617   if (matched_) {
618     for (int i = 0; i < nsubmatch; i++)
619       submatch[i] =
620           StringPiece(match_[2 * i],
621                       static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
622     if (ExtraDebug)
623       fprintf(stderr, "match (%td,%td)\n",
624               match_[0] - btext_,
625               match_[1] - btext_);
626     return true;
627   }
628   return false;
629 }
630 
631 bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)632 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
633                 Anchor anchor, MatchKind kind,
634                 StringPiece* match, int nmatch) {
635   if (ExtraDebug)
636     Dump();
637 
638   NFA nfa(this);
639   StringPiece sp;
640   if (kind == kFullMatch) {
641     anchor = kAnchored;
642     if (nmatch == 0) {
643       match = &sp;
644       nmatch = 1;
645     }
646   }
647   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
648     return false;
649   if (kind == kFullMatch && match[0].end() != text.end())
650     return false;
651   return true;
652 }
653 
654 // For each instruction i in the program reachable from the start, compute the
655 // number of instructions reachable from i by following only empty transitions
656 // and record that count as fanout[i].
657 //
658 // fanout holds the results and is also the work queue for the outer iteration.
659 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)660 void Prog::Fanout(SparseArray<int>* fanout) {
661   DCHECK_EQ(fanout->max_size(), size());
662   SparseSet reachable(size());
663   fanout->clear();
664   fanout->set_new(start(), 0);
665   for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
666     int* count = &i->value();
667     reachable.clear();
668     reachable.insert(i->index());
669     for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
670       int id = *j;
671       Prog::Inst* ip = inst(id);
672       switch (ip->opcode()) {
673         default:
674           LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
675           break;
676 
677         case kInstByteRange:
678           if (!ip->last())
679             reachable.insert(id+1);
680 
681           (*count)++;
682           if (!fanout->has_index(ip->out())) {
683             fanout->set_new(ip->out(), 0);
684           }
685           break;
686 
687         case kInstAltMatch:
688           DCHECK(!ip->last());
689           reachable.insert(id+1);
690           break;
691 
692         case kInstCapture:
693         case kInstEmptyWidth:
694         case kInstNop:
695           if (!ip->last())
696             reachable.insert(id+1);
697 
698           reachable.insert(ip->out());
699           break;
700 
701         case kInstMatch:
702           if (!ip->last())
703             reachable.insert(id+1);
704           break;
705 
706         case kInstFail:
707           break;
708       }
709     }
710   }
711 }
712 
713 }  // namespace re2
714