1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts. Similarly to
9 // testing/backtrack.cc, it allocates a bitmap with (count of
10 // lists) * (length of prog) bits to make sure it never explores the
11 // same (instruction list, character position) multiple times. This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <limits>
24 #include <utility>
25
26 #include "util/logging.h"
27 #include "re2/pod_array.h"
28 #include "re2/prog.h"
29 #include "re2/regexp.h"
30
31 namespace re2 {
32
33 struct Job {
34 int id;
35 int rle; // run length encoding
36 const char* p;
37 };
38
39 class BitState {
40 public:
41 explicit BitState(Prog* prog);
42
43 // The usual Search prototype.
44 // Can only call Search once per BitState.
45 bool Search(const StringPiece& text, const StringPiece& context,
46 bool anchored, bool longest,
47 StringPiece* submatch, int nsubmatch);
48
49 private:
50 inline bool ShouldVisit(int id, const char* p);
51 void Push(int id, const char* p);
52 void GrowStack();
53 bool TrySearch(int id, const char* p);
54
55 // Search parameters
56 Prog* prog_; // program being run
57 StringPiece text_; // text being searched
58 StringPiece context_; // greater context of text being searched
59 bool anchored_; // whether search is anchored at text.begin()
60 bool longest_; // whether search wants leftmost-longest match
61 bool endmatch_; // whether match must end at text.end()
62 StringPiece* submatch_; // submatches to fill in
63 int nsubmatch_; // # of submatches to fill in
64
65 // Search state
66 static const int VisitedBits = 32;
67 PODArray<uint32_t> visited_; // bitmap: (list ID, char*) pairs visited
68 PODArray<const char*> cap_; // capture registers
69 PODArray<Job> job_; // stack of text positions to explore
70 int njob_; // stack size
71 };
72
BitState(Prog * prog)73 BitState::BitState(Prog* prog)
74 : prog_(prog),
75 anchored_(false),
76 longest_(false),
77 endmatch_(false),
78 submatch_(NULL),
79 nsubmatch_(0),
80 njob_(0) {
81 }
82
83 // Given id, which *must* be a list head, we can look up its list ID.
84 // Then the question is: Should the search visit the (list ID, p) pair?
85 // If so, remember that it was visited so that the next time,
86 // we don't repeat the visit.
ShouldVisit(int id,const char * p)87 bool BitState::ShouldVisit(int id, const char* p) {
88 int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
89 static_cast<int>(p-text_.data());
90 if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
91 return false;
92 visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
93 return true;
94 }
95
96 // Grow the stack.
GrowStack()97 void BitState::GrowStack() {
98 PODArray<Job> tmp(2*job_.size());
99 memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
100 job_ = std::move(tmp);
101 }
102
103 // Push (id, p) onto the stack, growing it if necessary.
Push(int id,const char * p)104 void BitState::Push(int id, const char* p) {
105 if (njob_ >= job_.size()) {
106 GrowStack();
107 if (njob_ >= job_.size()) {
108 LOG(DFATAL) << "GrowStack() failed: "
109 << "njob_ = " << njob_ << ", "
110 << "job_.size() = " << job_.size();
111 return;
112 }
113 }
114
115 // If id < 0, it's undoing a Capture,
116 // so we mustn't interfere with that.
117 if (id >= 0 && njob_ > 0) {
118 Job* top = &job_[njob_-1];
119 if (id == top->id &&
120 p == top->p + top->rle + 1 &&
121 top->rle < std::numeric_limits<int>::max()) {
122 ++top->rle;
123 return;
124 }
125 }
126
127 Job* top = &job_[njob_++];
128 top->id = id;
129 top->rle = 0;
130 top->p = p;
131 }
132
133 // Try a search from instruction id0 in state p0.
134 // Return whether it succeeded.
TrySearch(int id0,const char * p0)135 bool BitState::TrySearch(int id0, const char* p0) {
136 bool matched = false;
137 const char* end = text_.data() + text_.size();
138 njob_ = 0;
139 // Push() no longer checks ShouldVisit(),
140 // so we must perform the check ourselves.
141 if (ShouldVisit(id0, p0))
142 Push(id0, p0);
143 while (njob_ > 0) {
144 // Pop job off stack.
145 --njob_;
146 int id = job_[njob_].id;
147 int& rle = job_[njob_].rle;
148 const char* p = job_[njob_].p;
149
150 if (id < 0) {
151 // Undo the Capture.
152 cap_[prog_->inst(-id)->cap()] = p;
153 continue;
154 }
155
156 if (rle > 0) {
157 p += rle;
158 // Revivify job on stack.
159 --rle;
160 ++njob_;
161 }
162
163 Loop:
164 // Visit id, p.
165 Prog::Inst* ip = prog_->inst(id);
166 switch (ip->opcode()) {
167 default:
168 LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
169 return false;
170
171 case kInstFail:
172 break;
173
174 case kInstAltMatch:
175 if (ip->greedy(prog_)) {
176 // out1 is the Match instruction.
177 id = ip->out1();
178 p = end;
179 goto Loop;
180 }
181 if (longest_) {
182 // ip must be non-greedy...
183 // out is the Match instruction.
184 id = ip->out();
185 p = end;
186 goto Loop;
187 }
188 goto Next;
189
190 case kInstByteRange: {
191 int c = -1;
192 if (p < end)
193 c = *p & 0xFF;
194 if (!ip->Matches(c))
195 goto Next;
196
197 if (ip->hint() != 0)
198 Push(id+ip->hint(), p); // try the next when we're done
199 id = ip->out();
200 p++;
201 goto CheckAndLoop;
202 }
203
204 case kInstCapture:
205 if (!ip->last())
206 Push(id+1, p); // try the next when we're done
207
208 if (0 <= ip->cap() && ip->cap() < cap_.size()) {
209 // Capture p to register, but save old value first.
210 Push(-id, cap_[ip->cap()]); // undo when we're done
211 cap_[ip->cap()] = p;
212 }
213
214 id = ip->out();
215 goto CheckAndLoop;
216
217 case kInstEmptyWidth:
218 if (ip->empty() & ~Prog::EmptyFlags(context_, p))
219 goto Next;
220
221 if (!ip->last())
222 Push(id+1, p); // try the next when we're done
223 id = ip->out();
224 goto CheckAndLoop;
225
226 case kInstNop:
227 if (!ip->last())
228 Push(id+1, p); // try the next when we're done
229 id = ip->out();
230
231 CheckAndLoop:
232 // Sanity check: id is the head of its list, which must
233 // be the case if id-1 is the last of *its* list. :)
234 DCHECK(id == 0 || prog_->inst(id-1)->last());
235 if (ShouldVisit(id, p))
236 goto Loop;
237 break;
238
239 case kInstMatch: {
240 if (endmatch_ && p != end)
241 goto Next;
242
243 // We found a match. If the caller doesn't care
244 // where the match is, no point going further.
245 if (nsubmatch_ == 0)
246 return true;
247
248 // Record best match so far.
249 // Only need to check end point, because this entire
250 // call is only considering one start position.
251 matched = true;
252 cap_[1] = p;
253 if (submatch_[0].data() == NULL ||
254 (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
255 for (int i = 0; i < nsubmatch_; i++)
256 submatch_[i] =
257 StringPiece(cap_[2 * i],
258 static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
259 }
260
261 // If going for first match, we're done.
262 if (!longest_)
263 return true;
264
265 // If we used the entire text, no longer match is possible.
266 if (p == end)
267 return true;
268
269 // Otherwise, continue on in hope of a longer match.
270 // Note the absence of the ShouldVisit() check here
271 // due to execution remaining in the same list.
272 Next:
273 if (!ip->last()) {
274 id++;
275 goto Loop;
276 }
277 break;
278 }
279 }
280 }
281 return matched;
282 }
283
284 // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)285 bool BitState::Search(const StringPiece& text, const StringPiece& context,
286 bool anchored, bool longest,
287 StringPiece* submatch, int nsubmatch) {
288 // Search parameters.
289 text_ = text;
290 context_ = context;
291 if (context_.data() == NULL)
292 context_ = text;
293 if (prog_->anchor_start() && context_.begin() != text.begin())
294 return false;
295 if (prog_->anchor_end() && context_.end() != text.end())
296 return false;
297 anchored_ = anchored || prog_->anchor_start();
298 longest_ = longest || prog_->anchor_end();
299 endmatch_ = prog_->anchor_end();
300 submatch_ = submatch;
301 nsubmatch_ = nsubmatch;
302 for (int i = 0; i < nsubmatch_; i++)
303 submatch_[i] = StringPiece();
304
305 // Allocate scratch space.
306 int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
307 nvisited = (nvisited + VisitedBits-1) / VisitedBits;
308 visited_ = PODArray<uint32_t>(nvisited);
309 memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
310
311 int ncap = 2*nsubmatch;
312 if (ncap < 2)
313 ncap = 2;
314 cap_ = PODArray<const char*>(ncap);
315 memset(cap_.data(), 0, ncap*sizeof cap_[0]);
316
317 // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
318 job_ = PODArray<Job>(64);
319
320 // Anchored search must start at text.begin().
321 if (anchored_) {
322 cap_[0] = text.data();
323 return TrySearch(prog_->start(), text.data());
324 }
325
326 // Unanchored search, starting from each possible text position.
327 // Notice that we have to try the empty string at the end of
328 // the text, so the loop condition is p <= text.end(), not p < text.end().
329 // This looks like it's quadratic in the size of the text,
330 // but we are not clearing visited_ between calls to TrySearch,
331 // so no work is duplicated and it ends up still being linear.
332 for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
333 // Try to use memchr to find the first byte quickly.
334 int fb = prog_->first_byte();
335 if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) {
336 p = reinterpret_cast<const char*>(
337 memchr(p, fb, text.data() + text.size() - p));
338 if (p == NULL)
339 p = text.data() + text.size();
340 }
341
342 cap_[0] = p;
343 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
344 return true;
345 // Avoid invoking undefined behavior (arithmetic on a null pointer)
346 // by simply not continuing the loop.
347 if (p == NULL)
348 break;
349 }
350 return false;
351 }
352
353 // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)354 bool Prog::SearchBitState(const StringPiece& text,
355 const StringPiece& context,
356 Anchor anchor,
357 MatchKind kind,
358 StringPiece* match,
359 int nmatch) {
360 // If full match, we ask for an anchored longest match
361 // and then check that match[0] == text.
362 // So make sure match[0] exists.
363 StringPiece sp0;
364 if (kind == kFullMatch) {
365 anchor = kAnchored;
366 if (nmatch < 1) {
367 match = &sp0;
368 nmatch = 1;
369 }
370 }
371
372 // Run the search.
373 BitState b(this);
374 bool anchored = anchor == kAnchored;
375 bool longest = kind != kFirstMatch;
376 if (!b.Search(text, context, anchored, longest, match, nmatch))
377 return false;
378 if (kind == kFullMatch && match[0].end() != text.end())
379 return false;
380 return true;
381 }
382
383 } // namespace re2
384