• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26 
27 #include <stdio.h>
28 #include <string.h>
29 
30 #include <algorithm>
31 #include <deque>
32 #include <string>
33 #include <utility>
34 
35 #include "absl/log/absl_check.h"
36 #include "absl/log/absl_log.h"
37 #include "absl/strings/str_format.h"
38 #include "absl/strings/string_view.h"
39 #include "re2/pod_array.h"
40 #include "re2/prog.h"
41 #include "re2/regexp.h"
42 #include "re2/sparse_array.h"
43 #include "re2/sparse_set.h"
44 
45 namespace re2 {
46 
47 static const bool ExtraDebug = false;
48 
49 class NFA {
50  public:
51   NFA(Prog* prog);
52   ~NFA();
53 
54   // Searches for a matching string.
55   //   * If anchored is true, only considers matches starting at offset.
56   //     Otherwise finds lefmost match at or after offset.
57   //   * If longest is true, returns the longest match starting
58   //     at the chosen start point.  Otherwise returns the so-called
59   //     left-biased match, the one traditional backtracking engines
60   //     (like Perl and PCRE) find.
61   // Records submatch boundaries in submatch[1..nsubmatch-1].
62   // Submatch[0] is the entire match.  When there is a choice in
63   // which text matches each subexpression, the submatch boundaries
64   // are chosen to match what a backtracking implementation would choose.
65   bool Search(absl::string_view text, absl::string_view context, bool anchored,
66               bool longest, absl::string_view* submatch, int nsubmatch);
67 
68  private:
69   struct Thread {
70     union {
71       int ref;
72       Thread* next;  // when on free list
73     };
74     const char** capture;
75   };
76 
77   // State for explicit stack in AddToThreadq.
78   struct AddState {
79     int id;     // Inst to process
80     Thread* t;  // if not null, set t0 = t before processing id
81   };
82 
83   // Threadq is a list of threads.  The list is sorted by the order
84   // in which Perl would explore that particular state -- the earlier
85   // choices appear earlier in the list.
86   typedef SparseArray<Thread*> Threadq;
87 
88   inline Thread* AllocThread();
89   inline Thread* Incref(Thread* t);
90   inline void Decref(Thread* t);
91 
92   // Follows all empty arrows from id0 and enqueues all the states reached.
93   // Enqueues only the ByteRange instructions that match byte c.
94   // context is used (with p) for evaluating empty-width specials.
95   // p is the current input position, and t0 is the current thread.
96   void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
97                     const char* p, Thread* t0);
98 
99   // Run runq on byte c, appending new states to nextq.
100   // Updates matched_ and match_ as new, better matches are found.
101   // context is used (with p) for evaluating empty-width specials.
102   // p is the position of byte c in the input string for AddToThreadq;
103   // p-1 will be used when processing Match instructions.
104   // Frees all the threads on runq.
105   // If there is a shortcut to the end, returns that shortcut.
106   int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
107            const char* p);
108 
109   // Returns text version of capture information, for debugging.
110   std::string FormatCapture(const char** capture);
111 
CopyCapture(const char ** dst,const char ** src)112   void CopyCapture(const char** dst, const char** src) {
113     memmove(dst, src, ncapture_*sizeof src[0]);
114   }
115 
116   Prog* prog_;                // underlying program
117   int start_;                 // start instruction in program
118   int ncapture_;              // number of submatches to track
119   bool longest_;              // whether searching for longest match
120   bool endmatch_;             // whether match must end at text.end()
121   const char* btext_;         // beginning of text (for FormatSubmatch)
122   const char* etext_;         // end of text (for endmatch_)
123   Threadq q0_, q1_;           // pre-allocated for Search.
124   PODArray<AddState> stack_;  // pre-allocated for AddToThreadq
125   std::deque<Thread> arena_;  // thread arena
126   Thread* freelist_;          // thread freelist
127   const char** match_;        // best match so far
128   bool matched_;              // any match so far?
129 
130   NFA(const NFA&) = delete;
131   NFA& operator=(const NFA&) = delete;
132 };
133 
NFA(Prog * prog)134 NFA::NFA(Prog* prog) {
135   prog_ = prog;
136   start_ = prog_->start();
137   ncapture_ = 0;
138   longest_ = false;
139   endmatch_ = false;
140   btext_ = NULL;
141   etext_ = NULL;
142   q0_.resize(prog_->size());
143   q1_.resize(prog_->size());
144   // See NFA::AddToThreadq() for why this is so.
145   int nstack = 2*prog_->inst_count(kInstCapture) +
146                prog_->inst_count(kInstEmptyWidth) +
147                prog_->inst_count(kInstNop) + 1;  // + 1 for start inst
148   stack_ = PODArray<AddState>(nstack);
149   freelist_ = NULL;
150   match_ = NULL;
151   matched_ = false;
152 }
153 
~NFA()154 NFA::~NFA() {
155   delete[] match_;
156   for (const Thread& t : arena_)
157     delete[] t.capture;
158 }
159 
AllocThread()160 NFA::Thread* NFA::AllocThread() {
161   Thread* t = freelist_;
162   if (t != NULL) {
163     freelist_ = t->next;
164     t->ref = 1;
165     // We don't need to touch t->capture because
166     // the caller will immediately overwrite it.
167     return t;
168   }
169   arena_.emplace_back();
170   t = &arena_.back();
171   t->ref = 1;
172   t->capture = new const char*[ncapture_];
173   return t;
174 }
175 
Incref(Thread * t)176 NFA::Thread* NFA::Incref(Thread* t) {
177   ABSL_DCHECK(t != NULL);
178   t->ref++;
179   return t;
180 }
181 
Decref(Thread * t)182 void NFA::Decref(Thread* t) {
183   ABSL_DCHECK(t != NULL);
184   t->ref--;
185   if (t->ref > 0)
186     return;
187   ABSL_DCHECK_EQ(t->ref, 0);
188   t->next = freelist_;
189   freelist_ = t;
190 }
191 
192 // Follows all empty arrows from id0 and enqueues all the states reached.
193 // Enqueues only the ByteRange instructions that match byte c.
194 // context is used (with p) for evaluating empty-width specials.
195 // p is the current input position, and t0 is the current thread.
AddToThreadq(Threadq * q,int id0,int c,absl::string_view context,const char * p,Thread * t0)196 void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context,
197                        const char* p, Thread* t0) {
198   if (id0 == 0)
199     return;
200 
201   // Use stack_ to hold our stack of instructions yet to process.
202   // It was preallocated as follows:
203   //   two entries per Capture;
204   //   one entry per EmptyWidth; and
205   //   one entry per Nop.
206   // This reflects the maximum number of stack pushes that each can
207   // perform. (Each instruction can be processed at most once.)
208   AddState* stk = stack_.data();
209   int nstk = 0;
210 
211   stk[nstk++] = {id0, NULL};
212   while (nstk > 0) {
213     ABSL_DCHECK_LE(nstk, stack_.size());
214     AddState a = stk[--nstk];
215 
216   Loop:
217     if (a.t != NULL) {
218       // t0 was a thread that we allocated and copied in order to
219       // record the capture, so we must now decref it.
220       Decref(t0);
221       t0 = a.t;
222     }
223 
224     int id = a.id;
225     if (id == 0)
226       continue;
227     if (q->has_index(id)) {
228       if (ExtraDebug)
229         absl::FPrintF(stderr, "  [%d%s]\n", id, FormatCapture(t0->capture));
230       continue;
231     }
232 
233     // Create entry in q no matter what.  We might fill it in below,
234     // or we might not.  Even if not, it is necessary to have it,
235     // so that we don't revisit id0 during the recursion.
236     q->set_new(id, NULL);
237     Thread** tp = &q->get_existing(id);
238     int j;
239     Thread* t;
240     Prog::Inst* ip = prog_->inst(id);
241     switch (ip->opcode()) {
242     default:
243       ABSL_LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
244       break;
245 
246     case kInstFail:
247       break;
248 
249     case kInstAltMatch:
250       // Save state; will pick up at next byte.
251       t = Incref(t0);
252       *tp = t;
253 
254       ABSL_DCHECK(!ip->last());
255       a = {id+1, NULL};
256       goto Loop;
257 
258     case kInstNop:
259       if (!ip->last())
260         stk[nstk++] = {id+1, NULL};
261 
262       // Continue on.
263       a = {ip->out(), NULL};
264       goto Loop;
265 
266     case kInstCapture:
267       if (!ip->last())
268         stk[nstk++] = {id+1, NULL};
269 
270       if ((j=ip->cap()) < ncapture_) {
271         // Push a dummy whose only job is to restore t0
272         // once we finish exploring this possibility.
273         stk[nstk++] = {0, t0};
274 
275         // Record capture.
276         t = AllocThread();
277         CopyCapture(t->capture, t0->capture);
278         t->capture[j] = p;
279         t0 = t;
280       }
281       a = {ip->out(), NULL};
282       goto Loop;
283 
284     case kInstByteRange:
285       if (!ip->Matches(c))
286         goto Next;
287 
288       // Save state; will pick up at next byte.
289       t = Incref(t0);
290       *tp = t;
291       if (ExtraDebug)
292         absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture));
293 
294       if (ip->hint() == 0)
295         break;
296       a = {id+ip->hint(), NULL};
297       goto Loop;
298 
299     case kInstMatch:
300       // Save state; will pick up at next byte.
301       t = Incref(t0);
302       *tp = t;
303       if (ExtraDebug)
304         absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture));
305 
306     Next:
307       if (ip->last())
308         break;
309       a = {id+1, NULL};
310       goto Loop;
311 
312     case kInstEmptyWidth:
313       if (!ip->last())
314         stk[nstk++] = {id+1, NULL};
315 
316       // Continue on if we have all the right flag bits.
317       if (ip->empty() & ~Prog::EmptyFlags(context, p))
318         break;
319       a = {ip->out(), NULL};
320       goto Loop;
321     }
322   }
323 }
324 
325 // Run runq on byte c, appending new states to nextq.
326 // Updates matched_ and match_ as new, better matches are found.
327 // context is used (with p) for evaluating empty-width specials.
328 // p is the position of byte c in the input string for AddToThreadq;
329 // p-1 will be used when processing Match instructions.
330 // Frees all the threads on runq.
331 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,absl::string_view context,const char * p)332 int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context,
333               const char* p) {
334   nextq->clear();
335 
336   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
337     Thread* t = i->value();
338     if (t == NULL)
339       continue;
340 
341     if (longest_) {
342       // Can skip any threads started after our current best match.
343       if (matched_ && match_[0] < t->capture[0]) {
344         Decref(t);
345         continue;
346       }
347     }
348 
349     int id = i->index();
350     Prog::Inst* ip = prog_->inst(id);
351 
352     switch (ip->opcode()) {
353       default:
354         // Should only see the values handled below.
355         ABSL_LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
356         break;
357 
358       case kInstByteRange:
359         AddToThreadq(nextq, ip->out(), c, context, p, t);
360         break;
361 
362       case kInstAltMatch:
363         if (i != runq->begin())
364           break;
365         // The match is ours if we want it.
366         if (ip->greedy(prog_) || longest_) {
367           CopyCapture(match_, t->capture);
368           matched_ = true;
369 
370           Decref(t);
371           for (++i; i != runq->end(); ++i) {
372             if (i->value() != NULL)
373               Decref(i->value());
374           }
375           runq->clear();
376           if (ip->greedy(prog_))
377             return ip->out1();
378           return ip->out();
379         }
380         break;
381 
382       case kInstMatch: {
383         // Avoid invoking undefined behavior (arithmetic on a null pointer)
384         // by storing p instead of p-1. (What would the latter even mean?!)
385         // This complements the special case in NFA::Search().
386         if (p == NULL) {
387           CopyCapture(match_, t->capture);
388           match_[1] = p;
389           matched_ = true;
390           break;
391         }
392 
393         if (endmatch_ && p-1 != etext_)
394           break;
395 
396         if (longest_) {
397           // Leftmost-longest mode: save this match only if
398           // it is either farther to the left or at the same
399           // point but longer than an existing match.
400           if (!matched_ || t->capture[0] < match_[0] ||
401               (t->capture[0] == match_[0] && p-1 > match_[1])) {
402             CopyCapture(match_, t->capture);
403             match_[1] = p-1;
404             matched_ = true;
405           }
406         } else {
407           // Leftmost-biased mode: this match is by definition
408           // better than what we've already found (see next line).
409           CopyCapture(match_, t->capture);
410           match_[1] = p-1;
411           matched_ = true;
412 
413           // Cut off the threads that can only find matches
414           // worse than the one we just found: don't run the
415           // rest of the current Threadq.
416           Decref(t);
417           for (++i; i != runq->end(); ++i) {
418             if (i->value() != NULL)
419               Decref(i->value());
420           }
421           runq->clear();
422           return 0;
423         }
424         break;
425       }
426     }
427     Decref(t);
428   }
429   runq->clear();
430   return 0;
431 }
432 
FormatCapture(const char ** capture)433 std::string NFA::FormatCapture(const char** capture) {
434   std::string s;
435   for (int i = 0; i < ncapture_; i+=2) {
436     if (capture[i] == NULL)
437       s += "(?,?)";
438     else if (capture[i+1] == NULL)
439       s += absl::StrFormat("(%d,?)",
440                            capture[i] - btext_);
441     else
442       s += absl::StrFormat("(%d,%d)",
443                            capture[i] - btext_,
444                            capture[i+1] - btext_);
445   }
446   return s;
447 }
448 
Search(absl::string_view text,absl::string_view context,bool anchored,bool longest,absl::string_view * submatch,int nsubmatch)449 bool NFA::Search(absl::string_view text, absl::string_view context,
450                  bool anchored, bool longest, absl::string_view* submatch,
451                  int nsubmatch) {
452   if (start_ == 0)
453     return false;
454 
455   if (context.data() == NULL)
456     context = text;
457 
458   // Sanity check: make sure that text lies within context.
459   if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
460     ABSL_LOG(DFATAL) << "context does not contain text";
461     return false;
462   }
463 
464   if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text))
465     return false;
466   if (prog_->anchor_end() && EndPtr(context) != EndPtr(text))
467     return false;
468   anchored |= prog_->anchor_start();
469   if (prog_->anchor_end()) {
470     longest = true;
471     endmatch_ = true;
472   }
473 
474   if (nsubmatch < 0) {
475     ABSL_LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
476     return false;
477   }
478 
479   // Save search parameters.
480   ncapture_ = 2*nsubmatch;
481   longest_ = longest;
482 
483   if (nsubmatch == 0) {
484     // We need to maintain match[0], both to distinguish the
485     // longest match (if longest is true) and also to tell
486     // whether we've seen any matches at all.
487     ncapture_ = 2;
488   }
489 
490   match_ = new const char*[ncapture_];
491   memset(match_, 0, ncapture_*sizeof match_[0]);
492   matched_ = false;
493 
494   // For debugging prints.
495   btext_ = context.data();
496   // For convenience.
497   etext_ = text.data() + text.size();
498 
499   if (ExtraDebug)
500     absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
501                   text, context, anchored, longest);
502 
503   // Set up search.
504   Threadq* runq = &q0_;
505   Threadq* nextq = &q1_;
506   runq->clear();
507   nextq->clear();
508 
509   // Loop over the text, stepping the machine.
510   for (const char* p = text.data();; p++) {
511     if (ExtraDebug) {
512       int c = 0;
513       if (p == btext_)
514         c = '^';
515       else if (p > etext_)
516         c = '$';
517       else if (p < etext_)
518         c = p[0] & 0xFF;
519 
520       absl::FPrintF(stderr, "%c:", c);
521       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
522         Thread* t = i->value();
523         if (t == NULL)
524           continue;
525         absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture));
526       }
527       absl::FPrintF(stderr, "\n");
528     }
529 
530     // This is a no-op the first time around the loop because runq is empty.
531     int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p);
532     ABSL_DCHECK_EQ(runq->size(), 0);
533     using std::swap;
534     swap(nextq, runq);
535     nextq->clear();
536     if (id != 0) {
537       // We're done: full match ahead.
538       p = etext_;
539       for (;;) {
540         Prog::Inst* ip = prog_->inst(id);
541         switch (ip->opcode()) {
542           default:
543             ABSL_LOG(DFATAL) << "Unexpected opcode in short circuit: "
544                              << ip->opcode();
545             break;
546 
547           case kInstCapture:
548             if (ip->cap() < ncapture_)
549               match_[ip->cap()] = p;
550             id = ip->out();
551             continue;
552 
553           case kInstNop:
554             id = ip->out();
555             continue;
556 
557           case kInstMatch:
558             match_[1] = p;
559             matched_ = true;
560             break;
561         }
562         break;
563       }
564       break;
565     }
566 
567     if (p > etext_)
568       break;
569 
570     // Start a new thread if there have not been any matches.
571     // (No point in starting a new thread if there have been
572     // matches, since it would be to the right of the match
573     // we already found.)
574     if (!matched_ && (!anchored || p == text.data())) {
575       // Try to use prefix accel (e.g. memchr) to skip ahead.
576       // The search must be unanchored and there must be zero
577       // possible matches already.
578       if (!anchored && runq->size() == 0 &&
579           p < etext_ && prog_->can_prefix_accel()) {
580         p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p));
581         if (p == NULL)
582           p = etext_;
583       }
584 
585       Thread* t = AllocThread();
586       CopyCapture(t->capture, match_);
587       t->capture[0] = p;
588       AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p,
589                    t);
590       Decref(t);
591     }
592 
593     // If all the threads have died, stop early.
594     if (runq->size() == 0) {
595       if (ExtraDebug)
596         absl::FPrintF(stderr, "dead\n");
597       break;
598     }
599 
600     // Avoid invoking undefined behavior (arithmetic on a null pointer)
601     // by simply not continuing the loop.
602     // This complements the special case in NFA::Step().
603     if (p == NULL) {
604       (void) Step(runq, nextq, -1, context, p);
605       ABSL_DCHECK_EQ(runq->size(), 0);
606       using std::swap;
607       swap(nextq, runq);
608       nextq->clear();
609       break;
610     }
611   }
612 
613   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
614     if (i->value() != NULL)
615       Decref(i->value());
616   }
617 
618   if (matched_) {
619     for (int i = 0; i < nsubmatch; i++)
620       submatch[i] = absl::string_view(
621           match_[2 * i],
622           static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
623     if (ExtraDebug)
624       absl::FPrintF(stderr, "match (%d,%d)\n",
625                     match_[0] - btext_,
626                     match_[1] - btext_);
627     return true;
628   }
629   return false;
630 }
631 
SearchNFA(absl::string_view text,absl::string_view context,Anchor anchor,MatchKind kind,absl::string_view * match,int nmatch)632 bool Prog::SearchNFA(absl::string_view text, absl::string_view context,
633                      Anchor anchor, MatchKind kind, absl::string_view* match,
634                      int nmatch) {
635   if (ExtraDebug)
636     Dump();
637 
638   NFA nfa(this);
639   absl::string_view sp;
640   if (kind == kFullMatch) {
641     anchor = kAnchored;
642     if (nmatch == 0) {
643       match = &sp;
644       nmatch = 1;
645     }
646   }
647   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
648     return false;
649   if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
650     return false;
651   return true;
652 }
653 
654 // For each instruction i in the program reachable from the start, compute the
655 // number of instructions reachable from i by following only empty transitions
656 // and record that count as fanout[i].
657 //
658 // fanout holds the results and is also the work queue for the outer iteration.
659 // reachable holds the reached nodes for the inner iteration.
Fanout(SparseArray<int> * fanout)660 void Prog::Fanout(SparseArray<int>* fanout) {
661   ABSL_DCHECK_EQ(fanout->max_size(), size());
662   SparseSet reachable(size());
663   fanout->clear();
664   fanout->set_new(start(), 0);
665   for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
666     int* count = &i->value();
667     reachable.clear();
668     reachable.insert(i->index());
669     for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
670       int id = *j;
671       Prog::Inst* ip = inst(id);
672       switch (ip->opcode()) {
673         default:
674           ABSL_LOG(DFATAL) << "unhandled " << ip->opcode()
675                            << " in Prog::Fanout()";
676           break;
677 
678         case kInstByteRange:
679           if (!ip->last())
680             reachable.insert(id+1);
681 
682           (*count)++;
683           if (!fanout->has_index(ip->out())) {
684             fanout->set_new(ip->out(), 0);
685           }
686           break;
687 
688         case kInstAltMatch:
689           ABSL_DCHECK(!ip->last());
690           reachable.insert(id+1);
691           break;
692 
693         case kInstCapture:
694         case kInstEmptyWidth:
695         case kInstNop:
696           if (!ip->last())
697             reachable.insert(id+1);
698 
699           reachable.insert(ip->out());
700           break;
701 
702         case kInstMatch:
703           if (!ip->last())
704             reachable.insert(id+1);
705           break;
706 
707         case kInstFail:
708           break;
709       }
710     }
711   }
712 }
713 
714 }  // namespace re2
715