1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts. Similarly to
9 // testing/backtrack.cc, it allocates a bitmap with (count of
10 // lists) * (length of text) bits to make sure it never explores the
11 // same (instruction list, character position) multiple times. This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <string.h>
23 #include <limits>
24 #include <utility>
25
26 #include "util/logging.h"
27 #include "re2/pod_array.h"
28 #include "re2/prog.h"
29 #include "re2/regexp.h"
30
31 namespace re2 {
32
33 struct Job {
34 int id;
35 int rle; // run length encoding
36 const char* p;
37 };
38
39 class BitState {
40 public:
41 explicit BitState(Prog* prog);
42
43 // The usual Search prototype.
44 // Can only call Search once per BitState.
45 bool Search(const StringPiece& text, const StringPiece& context,
46 bool anchored, bool longest,
47 StringPiece* submatch, int nsubmatch);
48
49 private:
50 inline bool ShouldVisit(int id, const char* p);
51 void Push(int id, const char* p);
52 void GrowStack();
53 bool TrySearch(int id, const char* p);
54
55 // Search parameters
56 Prog* prog_; // program being run
57 StringPiece text_; // text being searched
58 StringPiece context_; // greater context of text being searched
59 bool anchored_; // whether search is anchored at text.begin()
60 bool longest_; // whether search wants leftmost-longest match
61 bool endmatch_; // whether match must end at text.end()
62 StringPiece* submatch_; // submatches to fill in
63 int nsubmatch_; // # of submatches to fill in
64
65 // Search state
66 static constexpr int kVisitedBits = 64;
67 PODArray<uint64_t> visited_; // bitmap: (list ID, char*) pairs visited
68 PODArray<const char*> cap_; // capture registers
69 PODArray<Job> job_; // stack of text positions to explore
70 int njob_; // stack size
71
72 BitState(const BitState&) = delete;
73 BitState& operator=(const BitState&) = delete;
74 };
75
BitState(Prog * prog)76 BitState::BitState(Prog* prog)
77 : prog_(prog),
78 anchored_(false),
79 longest_(false),
80 endmatch_(false),
81 submatch_(NULL),
82 nsubmatch_(0),
83 njob_(0) {
84 }
85
86 // Given id, which *must* be a list head, we can look up its list ID.
87 // Then the question is: Should the search visit the (list ID, p) pair?
88 // If so, remember that it was visited so that the next time,
89 // we don't repeat the visit.
ShouldVisit(int id,const char * p)90 bool BitState::ShouldVisit(int id, const char* p) {
91 int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
92 static_cast<int>(p-text_.data());
93 if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
94 return false;
95 visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
96 return true;
97 }
98
99 // Grow the stack.
GrowStack()100 void BitState::GrowStack() {
101 PODArray<Job> tmp(2*job_.size());
102 memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
103 job_ = std::move(tmp);
104 }
105
106 // Push (id, p) onto the stack, growing it if necessary.
Push(int id,const char * p)107 void BitState::Push(int id, const char* p) {
108 if (njob_ >= job_.size()) {
109 GrowStack();
110 if (njob_ >= job_.size()) {
111 LOG(DFATAL) << "GrowStack() failed: "
112 << "njob_ = " << njob_ << ", "
113 << "job_.size() = " << job_.size();
114 return;
115 }
116 }
117
118 // If id < 0, it's undoing a Capture,
119 // so we mustn't interfere with that.
120 if (id >= 0 && njob_ > 0) {
121 Job* top = &job_[njob_-1];
122 if (id == top->id &&
123 p == top->p + top->rle + 1 &&
124 top->rle < std::numeric_limits<int>::max()) {
125 ++top->rle;
126 return;
127 }
128 }
129
130 Job* top = &job_[njob_++];
131 top->id = id;
132 top->rle = 0;
133 top->p = p;
134 }
135
136 // Try a search from instruction id0 in state p0.
137 // Return whether it succeeded.
TrySearch(int id0,const char * p0)138 bool BitState::TrySearch(int id0, const char* p0) {
139 bool matched = false;
140 const char* end = text_.data() + text_.size();
141 njob_ = 0;
142 // Push() no longer checks ShouldVisit(),
143 // so we must perform the check ourselves.
144 if (ShouldVisit(id0, p0))
145 Push(id0, p0);
146 while (njob_ > 0) {
147 // Pop job off stack.
148 --njob_;
149 int id = job_[njob_].id;
150 int& rle = job_[njob_].rle;
151 const char* p = job_[njob_].p;
152
153 if (id < 0) {
154 // Undo the Capture.
155 cap_[prog_->inst(-id)->cap()] = p;
156 continue;
157 }
158
159 if (rle > 0) {
160 p += rle;
161 // Revivify job on stack.
162 --rle;
163 ++njob_;
164 }
165
166 Loop:
167 // Visit id, p.
168 Prog::Inst* ip = prog_->inst(id);
169 switch (ip->opcode()) {
170 default:
171 LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
172 return false;
173
174 case kInstFail:
175 break;
176
177 case kInstAltMatch:
178 if (ip->greedy(prog_)) {
179 // out1 is the Match instruction.
180 id = ip->out1();
181 p = end;
182 goto Loop;
183 }
184 if (longest_) {
185 // ip must be non-greedy...
186 // out is the Match instruction.
187 id = ip->out();
188 p = end;
189 goto Loop;
190 }
191 goto Next;
192
193 case kInstByteRange: {
194 int c = -1;
195 if (p < end)
196 c = *p & 0xFF;
197 if (!ip->Matches(c))
198 goto Next;
199
200 if (ip->hint() != 0)
201 Push(id+ip->hint(), p); // try the next when we're done
202 id = ip->out();
203 p++;
204 goto CheckAndLoop;
205 }
206
207 case kInstCapture:
208 if (!ip->last())
209 Push(id+1, p); // try the next when we're done
210
211 if (0 <= ip->cap() && ip->cap() < cap_.size()) {
212 // Capture p to register, but save old value first.
213 Push(-id, cap_[ip->cap()]); // undo when we're done
214 cap_[ip->cap()] = p;
215 }
216
217 id = ip->out();
218 goto CheckAndLoop;
219
220 case kInstEmptyWidth:
221 if (ip->empty() & ~Prog::EmptyFlags(context_, p))
222 goto Next;
223
224 if (!ip->last())
225 Push(id+1, p); // try the next when we're done
226 id = ip->out();
227 goto CheckAndLoop;
228
229 case kInstNop:
230 if (!ip->last())
231 Push(id+1, p); // try the next when we're done
232 id = ip->out();
233
234 CheckAndLoop:
235 // Sanity check: id is the head of its list, which must
236 // be the case if id-1 is the last of *its* list. :)
237 DCHECK(id == 0 || prog_->inst(id-1)->last());
238 if (ShouldVisit(id, p))
239 goto Loop;
240 break;
241
242 case kInstMatch: {
243 if (endmatch_ && p != end)
244 goto Next;
245
246 // We found a match. If the caller doesn't care
247 // where the match is, no point going further.
248 if (nsubmatch_ == 0)
249 return true;
250
251 // Record best match so far.
252 // Only need to check end point, because this entire
253 // call is only considering one start position.
254 matched = true;
255 cap_[1] = p;
256 if (submatch_[0].data() == NULL ||
257 (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
258 for (int i = 0; i < nsubmatch_; i++)
259 submatch_[i] =
260 StringPiece(cap_[2 * i],
261 static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
262 }
263
264 // If going for first match, we're done.
265 if (!longest_)
266 return true;
267
268 // If we used the entire text, no longer match is possible.
269 if (p == end)
270 return true;
271
272 // Otherwise, continue on in hope of a longer match.
273 // Note the absence of the ShouldVisit() check here
274 // due to execution remaining in the same list.
275 Next:
276 if (!ip->last()) {
277 id++;
278 goto Loop;
279 }
280 break;
281 }
282 }
283 }
284 return matched;
285 }
286
287 // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)288 bool BitState::Search(const StringPiece& text, const StringPiece& context,
289 bool anchored, bool longest,
290 StringPiece* submatch, int nsubmatch) {
291 // Search parameters.
292 text_ = text;
293 context_ = context;
294 if (context_.data() == NULL)
295 context_ = text;
296 if (prog_->anchor_start() && context_.begin() != text.begin())
297 return false;
298 if (prog_->anchor_end() && context_.end() != text.end())
299 return false;
300 anchored_ = anchored || prog_->anchor_start();
301 longest_ = longest || prog_->anchor_end();
302 endmatch_ = prog_->anchor_end();
303 submatch_ = submatch;
304 nsubmatch_ = nsubmatch;
305 for (int i = 0; i < nsubmatch_; i++)
306 submatch_[i] = StringPiece();
307
308 // Allocate scratch space.
309 int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
310 nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
311 visited_ = PODArray<uint64_t>(nvisited);
312 memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
313
314 int ncap = 2*nsubmatch;
315 if (ncap < 2)
316 ncap = 2;
317 cap_ = PODArray<const char*>(ncap);
318 memset(cap_.data(), 0, ncap*sizeof cap_[0]);
319
320 // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
321 job_ = PODArray<Job>(64);
322
323 // Anchored search must start at text.begin().
324 if (anchored_) {
325 cap_[0] = text.data();
326 return TrySearch(prog_->start(), text.data());
327 }
328
329 // Unanchored search, starting from each possible text position.
330 // Notice that we have to try the empty string at the end of
331 // the text, so the loop condition is p <= text.end(), not p < text.end().
332 // This looks like it's quadratic in the size of the text,
333 // but we are not clearing visited_ between calls to TrySearch,
334 // so no work is duplicated and it ends up still being linear.
335 const char* etext = text.data() + text.size();
336 for (const char* p = text.data(); p <= etext; p++) {
337 // Try to use prefix accel (e.g. memchr) to skip ahead.
338 if (p < etext && prog_->can_prefix_accel()) {
339 p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
340 if (p == NULL)
341 p = etext;
342 }
343
344 cap_[0] = p;
345 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
346 return true;
347 // Avoid invoking undefined behavior (arithmetic on a null pointer)
348 // by simply not continuing the loop.
349 if (p == NULL)
350 break;
351 }
352 return false;
353 }
354
355 // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)356 bool Prog::SearchBitState(const StringPiece& text,
357 const StringPiece& context,
358 Anchor anchor,
359 MatchKind kind,
360 StringPiece* match,
361 int nmatch) {
362 // If full match, we ask for an anchored longest match
363 // and then check that match[0] == text.
364 // So make sure match[0] exists.
365 StringPiece sp0;
366 if (kind == kFullMatch) {
367 anchor = kAnchored;
368 if (nmatch < 1) {
369 match = &sp0;
370 nmatch = 1;
371 }
372 }
373
374 // Run the search.
375 BitState b(this);
376 bool anchored = anchor == kAnchored;
377 bool longest = kind != kFirstMatch;
378 if (!b.Search(text, context, anchored, longest, match, nmatch))
379 return false;
380 if (kind == kFullMatch && match[0].end() != text.end())
381 return false;
382 return true;
383 }
384
385 } // namespace re2
386