• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 //
7 // Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
8 // except that it remembers where it has been, trading a lot of
9 // memory for a lot of time. It exists only for testing purposes.
10 //
11 // Let me repeat that.
12 //
13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14 //   - It uses a ton of memory.
15 //   - It uses a ton of stack.
16 //   - It uses ABSL_CHECK() and ABSL_LOG(FATAL).
17 //   - It implements unanchored search by repeated anchored search.
18 //
19 // On the other hand, it is very simple and a good reference
20 // implementation for the more complicated regexp packages.
21 //
22 // In BUILD, this file is linked into the ":testing" library,
23 // not the main library, in order to make it harder to pick up
24 // accidentally.
25 
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <string.h>
29 
30 #include "absl/base/macros.h"
31 #include "absl/log/absl_check.h"
32 #include "absl/log/absl_log.h"
33 #include "absl/strings/string_view.h"
34 #include "re2/pod_array.h"
35 #include "re2/prog.h"
36 #include "re2/regexp.h"
37 
38 namespace re2 {
39 
40 // Backtracker holds the state for a backtracking search.
41 //
42 // Excluding the search parameters, the main search state
43 // is just the "capture registers", which record, for the
44 // current execution, the string position at which each
45 // parenthesis was passed.  cap_[0] and cap_[1] are the
46 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
47 //
48 // To avoid infinite loops during backtracking on expressions
49 // like (a*)*, the visited_[] bitmap marks the (state, string-position)
50 // pairs that have already been explored and are thus not worth
51 // re-exploring if we get there via another path.  Modern backtracking
52 // libraries engineer their program representation differently, to make
53 // such infinite loops possible to avoid without keeping a giant visited_
54 // bitmap, but visited_ works fine for a reference implementation
55 // and it has the nice benefit of making the search run in linear time.
56 class Backtracker {
57  public:
58   explicit Backtracker(Prog* prog);
59 
60   bool Search(absl::string_view text, absl::string_view context, bool anchored,
61               bool longest, absl::string_view* submatch, int nsubmatch);
62 
63  private:
64   // Explores from instruction id at string position p looking for a match.
65   // Returns true if found (so that caller can stop trying other possibilities).
66   bool Visit(int id, const char* p);
67 
68   // Tries instruction id at string position p.
69   // Returns true if a match is found.
70   bool Try(int id, const char* p);
71 
72   // Search parameters
73   Prog* prog_;                   // program being run
74   absl::string_view text_;       // text being searched
75   absl::string_view context_;    // greater context of text being searched
76   bool anchored_;                // whether search is anchored at text.begin()
77   bool longest_;                 // whether search wants leftmost-longest match
78   bool endmatch_;                // whether search must end at text.end()
79   absl::string_view* submatch_;  // submatches to fill in
80   int nsubmatch_;                // # of submatches to fill in
81 
82   // Search state
83   const char* cap_[64];         // capture registers
84   PODArray<uint32_t> visited_;  // bitmap: (Inst*, char*) pairs visited
85 
86   Backtracker(const Backtracker&) = delete;
87   Backtracker& operator=(const Backtracker&) = delete;
88 };
89 
Backtracker(Prog * prog)90 Backtracker::Backtracker(Prog* prog)
91   : prog_(prog),
92     anchored_(false),
93     longest_(false),
94     endmatch_(false),
95     submatch_(NULL),
96     nsubmatch_(0) {
97 }
98 
99 // Runs a backtracking search.
Search(absl::string_view text,absl::string_view context,bool anchored,bool longest,absl::string_view * submatch,int nsubmatch)100 bool Backtracker::Search(absl::string_view text, absl::string_view context,
101                          bool anchored, bool longest,
102                          absl::string_view* submatch, int nsubmatch) {
103   text_ = text;
104   context_ = context;
105   if (context_.data() == NULL)
106     context_ = text;
107   if (prog_->anchor_start() && BeginPtr(text) > BeginPtr(context_))
108     return false;
109   if (prog_->anchor_end() && EndPtr(text) < EndPtr(context_))
110     return false;
111   anchored_ = anchored | prog_->anchor_start();
112   longest_ = longest | prog_->anchor_end();
113   endmatch_ = prog_->anchor_end();
114   submatch_ = submatch;
115   nsubmatch_ = nsubmatch;
116   ABSL_CHECK_LT(2*nsubmatch_, static_cast<int>(ABSL_ARRAYSIZE(cap_)));
117   memset(cap_, 0, sizeof cap_);
118 
119   // We use submatch_[0] for our own bookkeeping,
120   // so it had better exist.
121   absl::string_view sp0;
122   if (nsubmatch < 1) {
123     submatch_ = &sp0;
124     nsubmatch_ = 1;
125   }
126   submatch_[0] = absl::string_view();
127 
128   // Allocate new visited_ bitmap -- size is proportional
129   // to text, so have to reallocate on each call to Search.
130   int nvisited = prog_->size() * static_cast<int>(text.size()+1);
131   nvisited = (nvisited + 31) / 32;
132   visited_ = PODArray<uint32_t>(nvisited);
133   memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
134 
135   // Anchored search must start at text.begin().
136   if (anchored_) {
137     cap_[0] = text.data();
138     return Visit(prog_->start(), text.data());
139   }
140 
141   // Unanchored search, starting from each possible text position.
142   // Notice that we have to try the empty string at the end of
143   // the text, so the loop condition is p <= text.end(), not p < text.end().
144   for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
145     cap_[0] = p;
146     if (Visit(prog_->start(), p))  // Match must be leftmost; done.
147       return true;
148     // Avoid invoking undefined behavior (arithmetic on a null pointer)
149     // by simply not continuing the loop.
150     if (p == NULL)
151       break;
152   }
153   return false;
154 }
155 
156 // Explores from instruction id at string position p looking for a match.
157 // Return true if found (so that caller can stop trying other possibilities).
Visit(int id,const char * p)158 bool Backtracker::Visit(int id, const char* p) {
159   // Check bitmap.  If we've already explored from here,
160   // either it didn't match or it did but we're hoping for a better match.
161   // Either way, don't go down that road again.
162   ABSL_CHECK(p <= text_.data() + text_.size());
163   int n = id * static_cast<int>(text_.size()+1) +
164           static_cast<int>(p-text_.data());
165   ABSL_CHECK_LT(n/32, visited_.size());
166   if (visited_[n/32] & (1 << (n&31)))
167     return false;
168   visited_[n/32] |= 1 << (n&31);
169 
170   Prog::Inst* ip = prog_->inst(id);
171   if (Try(id, p)) {
172     if (longest_ && !ip->last())
173       Visit(id+1, p);
174     return true;
175   }
176   if (!ip->last())
177     return Visit(id+1, p);
178   return false;
179 }
180 
181 // Tries instruction id at string position p.
182 // Returns true if a match is found.
Try(int id,const char * p)183 bool Backtracker::Try(int id, const char* p) {
184   // Pick out byte at current position.  If at end of string,
185   // have to explore in hope of finishing a match.  Use impossible byte -1.
186   int c = -1;
187   if (p < text_.data() + text_.size())
188     c = *p & 0xFF;
189 
190   Prog::Inst* ip = prog_->inst(id);
191   switch (ip->opcode()) {
192     default:
193       ABSL_LOG(FATAL) << "Unexpected opcode: " << ip->opcode();
194       return false;  // not reached
195 
196     case kInstAltMatch:
197       // Ignored.
198       return false;
199 
200     case kInstByteRange:
201       if (ip->Matches(c))
202         return Visit(ip->out(), p+1);
203       return false;
204 
205     case kInstCapture:
206       if (0 <= ip->cap() &&
207           ip->cap() < static_cast<int>(ABSL_ARRAYSIZE(cap_))) {
208         // Capture p to register, but save old value.
209         const char* q = cap_[ip->cap()];
210         cap_[ip->cap()] = p;
211         bool ret = Visit(ip->out(), p);
212         // Restore old value as we backtrack.
213         cap_[ip->cap()] = q;
214         return ret;
215       }
216       return Visit(ip->out(), p);
217 
218     case kInstEmptyWidth:
219       if (ip->empty() & ~Prog::EmptyFlags(context_, p))
220         return false;
221       return Visit(ip->out(), p);
222 
223     case kInstNop:
224       return Visit(ip->out(), p);
225 
226     case kInstMatch:
227       // We found a match.  If it's the best so far, record the
228       // parameters in the caller's submatch_ array.
229       if (endmatch_ && p != context_.data() + context_.size())
230         return false;
231       cap_[1] = p;
232       if (submatch_[0].data() == NULL ||
233           (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
234         // First match so far - or better match.
235         for (int i = 0; i < nsubmatch_; i++)
236           submatch_[i] = absl::string_view(
237               cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
238       }
239       return true;
240 
241     case kInstFail:
242       return false;
243   }
244 }
245 
246 // Runs a backtracking search.
UnsafeSearchBacktrack(absl::string_view text,absl::string_view context,Anchor anchor,MatchKind kind,absl::string_view * match,int nmatch)247 bool Prog::UnsafeSearchBacktrack(absl::string_view text,
248                                  absl::string_view context, Anchor anchor,
249                                  MatchKind kind, absl::string_view* match,
250                                  int nmatch) {
251   // If full match, we ask for an anchored longest match
252   // and then check that match[0] == text.
253   // So make sure match[0] exists.
254   absl::string_view sp0;
255   if (kind == kFullMatch) {
256     anchor = kAnchored;
257     if (nmatch < 1) {
258       match = &sp0;
259       nmatch = 1;
260     }
261   }
262 
263   // Run the search.
264   Backtracker b(this);
265   bool anchored = anchor == kAnchored;
266   bool longest = kind != kFirstMatch;
267   if (!b.Search(text, context, anchored, longest, match, nmatch))
268     return false;
269   if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
270     return false;
271   return true;
272 }
273 
274 }  // namespace re2
275