• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts.  Similarly to
9 // testing/backtrack.cc, it allocates a bitmap with (count of
10 // lists) * (length of prog) bits to make sure it never explores the
11 // same (instruction list, character position) multiple times.  This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19 
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <limits>
24 #include <utility>
25 
26 #include "util/logging.h"
27 #include "re2/pod_array.h"
28 #include "re2/prog.h"
29 #include "re2/regexp.h"
30 
31 namespace re2 {
32 
33 struct Job {
34   int id;
35   int rle;  // run length encoding
36   const char* p;
37 };
38 
39 class BitState {
40  public:
41   explicit BitState(Prog* prog);
42 
43   // The usual Search prototype.
44   // Can only call Search once per BitState.
45   bool Search(const StringPiece& text, const StringPiece& context,
46               bool anchored, bool longest,
47               StringPiece* submatch, int nsubmatch);
48 
49  private:
50   inline bool ShouldVisit(int id, const char* p);
51   void Push(int id, const char* p);
52   void GrowStack();
53   bool TrySearch(int id, const char* p);
54 
55   // Search parameters
56   Prog* prog_;              // program being run
57   StringPiece text_;        // text being searched
58   StringPiece context_;     // greater context of text being searched
59   bool anchored_;           // whether search is anchored at text.begin()
60   bool longest_;            // whether search wants leftmost-longest match
61   bool endmatch_;           // whether match must end at text.end()
62   StringPiece* submatch_;   // submatches to fill in
63   int nsubmatch_;           //   # of submatches to fill in
64 
65   // Search state
66   static const int VisitedBits = 32;
67   PODArray<uint32_t> visited_;  // bitmap: (list ID, char*) pairs visited
68   PODArray<const char*> cap_;   // capture registers
69   PODArray<Job> job_;           // stack of text positions to explore
70   int njob_;                    // stack size
71 };
72 
BitState(Prog * prog)73 BitState::BitState(Prog* prog)
74   : prog_(prog),
75     anchored_(false),
76     longest_(false),
77     endmatch_(false),
78     submatch_(NULL),
79     nsubmatch_(0),
80     njob_(0) {
81 }
82 
83 // Given id, which *must* be a list head, we can look up its list ID.
84 // Then the question is: Should the search visit the (list ID, p) pair?
85 // If so, remember that it was visited so that the next time,
86 // we don't repeat the visit.
ShouldVisit(int id,const char * p)87 bool BitState::ShouldVisit(int id, const char* p) {
88   int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
89           static_cast<int>(p-text_.data());
90   if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
91     return false;
92   visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
93   return true;
94 }
95 
96 // Grow the stack.
GrowStack()97 void BitState::GrowStack() {
98   PODArray<Job> tmp(2*job_.size());
99   memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
100   job_ = std::move(tmp);
101 }
102 
103 // Push (id, p) onto the stack, growing it if necessary.
Push(int id,const char * p)104 void BitState::Push(int id, const char* p) {
105   if (njob_ >= job_.size()) {
106     GrowStack();
107     if (njob_ >= job_.size()) {
108       LOG(DFATAL) << "GrowStack() failed: "
109                   << "njob_ = " << njob_ << ", "
110                   << "job_.size() = " << job_.size();
111       return;
112     }
113   }
114 
115   // If id < 0, it's undoing a Capture,
116   // so we mustn't interfere with that.
117   if (id >= 0 && njob_ > 0) {
118     Job* top = &job_[njob_-1];
119     if (id == top->id &&
120         p == top->p + top->rle + 1 &&
121         top->rle < std::numeric_limits<int>::max()) {
122       ++top->rle;
123       return;
124     }
125   }
126 
127   Job* top = &job_[njob_++];
128   top->id = id;
129   top->rle = 0;
130   top->p = p;
131 }
132 
133 // Try a search from instruction id0 in state p0.
134 // Return whether it succeeded.
TrySearch(int id0,const char * p0)135 bool BitState::TrySearch(int id0, const char* p0) {
136   bool matched = false;
137   const char* end = text_.data() + text_.size();
138   njob_ = 0;
139   // Push() no longer checks ShouldVisit(),
140   // so we must perform the check ourselves.
141   if (ShouldVisit(id0, p0))
142     Push(id0, p0);
143   while (njob_ > 0) {
144     // Pop job off stack.
145     --njob_;
146     int id = job_[njob_].id;
147     int& rle = job_[njob_].rle;
148     const char* p = job_[njob_].p;
149 
150     if (id < 0) {
151       // Undo the Capture.
152       cap_[prog_->inst(-id)->cap()] = p;
153       continue;
154     }
155 
156     if (rle > 0) {
157       p += rle;
158       // Revivify job on stack.
159       --rle;
160       ++njob_;
161     }
162 
163   Loop:
164     // Visit id, p.
165     Prog::Inst* ip = prog_->inst(id);
166     switch (ip->opcode()) {
167       default:
168         LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
169         return false;
170 
171       case kInstFail:
172         break;
173 
174       case kInstAltMatch:
175         if (ip->greedy(prog_)) {
176           // out1 is the Match instruction.
177           id = ip->out1();
178           p = end;
179           goto Loop;
180         }
181         if (longest_) {
182           // ip must be non-greedy...
183           // out is the Match instruction.
184           id = ip->out();
185           p = end;
186           goto Loop;
187         }
188         goto Next;
189 
190       case kInstByteRange: {
191         int c = -1;
192         if (p < end)
193           c = *p & 0xFF;
194         if (!ip->Matches(c))
195           goto Next;
196 
197         if (ip->hint() != 0)
198           Push(id+ip->hint(), p);  // try the next when we're done
199         id = ip->out();
200         p++;
201         goto CheckAndLoop;
202       }
203 
204       case kInstCapture:
205         if (!ip->last())
206           Push(id+1, p);  // try the next when we're done
207 
208         if (0 <= ip->cap() && ip->cap() < cap_.size()) {
209           // Capture p to register, but save old value first.
210           Push(-id, cap_[ip->cap()]);  // undo when we're done
211           cap_[ip->cap()] = p;
212         }
213 
214         id = ip->out();
215         goto CheckAndLoop;
216 
217       case kInstEmptyWidth:
218         if (ip->empty() & ~Prog::EmptyFlags(context_, p))
219           goto Next;
220 
221         if (!ip->last())
222           Push(id+1, p);  // try the next when we're done
223         id = ip->out();
224         goto CheckAndLoop;
225 
226       case kInstNop:
227         if (!ip->last())
228           Push(id+1, p);  // try the next when we're done
229         id = ip->out();
230 
231       CheckAndLoop:
232         // Sanity check: id is the head of its list, which must
233         // be the case if id-1 is the last of *its* list. :)
234         DCHECK(id == 0 || prog_->inst(id-1)->last());
235         if (ShouldVisit(id, p))
236           goto Loop;
237         break;
238 
239       case kInstMatch: {
240         if (endmatch_ && p != end)
241           goto Next;
242 
243         // We found a match.  If the caller doesn't care
244         // where the match is, no point going further.
245         if (nsubmatch_ == 0)
246           return true;
247 
248         // Record best match so far.
249         // Only need to check end point, because this entire
250         // call is only considering one start position.
251         matched = true;
252         cap_[1] = p;
253         if (submatch_[0].data() == NULL ||
254             (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
255           for (int i = 0; i < nsubmatch_; i++)
256             submatch_[i] =
257                 StringPiece(cap_[2 * i],
258                             static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
259         }
260 
261         // If going for first match, we're done.
262         if (!longest_)
263           return true;
264 
265         // If we used the entire text, no longer match is possible.
266         if (p == end)
267           return true;
268 
269         // Otherwise, continue on in hope of a longer match.
270         // Note the absence of the ShouldVisit() check here
271         // due to execution remaining in the same list.
272       Next:
273         if (!ip->last()) {
274           id++;
275           goto Loop;
276         }
277         break;
278       }
279     }
280   }
281   return matched;
282 }
283 
284 // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)285 bool BitState::Search(const StringPiece& text, const StringPiece& context,
286                       bool anchored, bool longest,
287                       StringPiece* submatch, int nsubmatch) {
288   // Search parameters.
289   text_ = text;
290   context_ = context;
291   if (context_.data() == NULL)
292     context_ = text;
293   if (prog_->anchor_start() && context_.begin() != text.begin())
294     return false;
295   if (prog_->anchor_end() && context_.end() != text.end())
296     return false;
297   anchored_ = anchored || prog_->anchor_start();
298   longest_ = longest || prog_->anchor_end();
299   endmatch_ = prog_->anchor_end();
300   submatch_ = submatch;
301   nsubmatch_ = nsubmatch;
302   for (int i = 0; i < nsubmatch_; i++)
303     submatch_[i] = StringPiece();
304 
305   // Allocate scratch space.
306   int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
307   nvisited = (nvisited + VisitedBits-1) / VisitedBits;
308   visited_ = PODArray<uint32_t>(nvisited);
309   memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
310 
311   int ncap = 2*nsubmatch;
312   if (ncap < 2)
313     ncap = 2;
314   cap_ = PODArray<const char*>(ncap);
315   memset(cap_.data(), 0, ncap*sizeof cap_[0]);
316 
317   // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
318   job_ = PODArray<Job>(64);
319 
320   // Anchored search must start at text.begin().
321   if (anchored_) {
322     cap_[0] = text.data();
323     return TrySearch(prog_->start(), text.data());
324   }
325 
326   // Unanchored search, starting from each possible text position.
327   // Notice that we have to try the empty string at the end of
328   // the text, so the loop condition is p <= text.end(), not p < text.end().
329   // This looks like it's quadratic in the size of the text,
330   // but we are not clearing visited_ between calls to TrySearch,
331   // so no work is duplicated and it ends up still being linear.
332   for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
333     // Try to use memchr to find the first byte quickly.
334     int fb = prog_->first_byte();
335     if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) {
336       p = reinterpret_cast<const char*>(
337           memchr(p, fb, text.data() + text.size() - p));
338       if (p == NULL)
339         p = text.data() + text.size();
340     }
341 
342     cap_[0] = p;
343     if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
344       return true;
345     // Avoid invoking undefined behavior (arithmetic on a null pointer)
346     // by simply not continuing the loop.
347     if (p == NULL)
348       break;
349   }
350   return false;
351 }
352 
353 // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)354 bool Prog::SearchBitState(const StringPiece& text,
355                           const StringPiece& context,
356                           Anchor anchor,
357                           MatchKind kind,
358                           StringPiece* match,
359                           int nmatch) {
360   // If full match, we ask for an anchored longest match
361   // and then check that match[0] == text.
362   // So make sure match[0] exists.
363   StringPiece sp0;
364   if (kind == kFullMatch) {
365     anchor = kAnchored;
366     if (nmatch < 1) {
367       match = &sp0;
368       nmatch = 1;
369     }
370   }
371 
372   // Run the search.
373   BitState b(this);
374   bool anchored = anchor == kAnchored;
375   bool longest = kind != kFirstMatch;
376   if (!b.Search(text, context, anchored, longest, match, nmatch))
377     return false;
378   if (kind == kFullMatch && match[0].end() != text.end())
379     return false;
380   return true;
381 }
382 
383 }  // namespace re2
384