1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #ifndef RE2_PROG_H_
6 #define RE2_PROG_H_
7
8 // Compiled representation of regular expressions.
9 // See regexp.h for the Regexp class, which represents a regular
10 // expression symbolically.
11
12 #include <stdint.h>
13
14 #include <functional>
15 #include <string>
16 #include <type_traits>
17 #include <vector>
18
19 #include "absl/base/call_once.h"
20 #include "absl/log/absl_check.h"
21 #include "absl/log/absl_log.h"
22 #include "absl/strings/string_view.h"
23 #include "re2/pod_array.h"
24 #include "re2/re2.h"
25 #include "re2/sparse_array.h"
26 #include "re2/sparse_set.h"
27
28 namespace re2 {
29
30 // Opcodes for Inst
31 enum InstOp {
32 kInstAlt = 0, // choose between out_ and out1_
33 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
34 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
35 kInstCapture, // capturing parenthesis number cap_
36 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
37 kInstMatch, // found a match!
38 kInstNop, // no-op; occasionally unavoidable
39 kInstFail, // never match; occasionally unavoidable
40 kNumInst,
41 };
42
43 // Bit flags for empty-width specials
44 enum EmptyOp {
45 kEmptyBeginLine = 1<<0, // ^ - beginning of line
46 kEmptyEndLine = 1<<1, // $ - end of line
47 kEmptyBeginText = 1<<2, // \A - beginning of text
48 kEmptyEndText = 1<<3, // \z - end of text
49 kEmptyWordBoundary = 1<<4, // \b - word boundary
50 kEmptyNonWordBoundary = 1<<5, // \B - not \b
51 kEmptyAllFlags = (1<<6)-1,
52 };
53
54 class DFA;
55 class Regexp;
56
57 // Compiled form of regexp program.
58 class Prog {
59 public:
60 Prog();
61 ~Prog();
62
63 // Single instruction in regexp program.
64 class Inst {
65 public:
66 // See the assertion below for why this is so.
67 Inst() = default;
68
69 // Copyable.
70 Inst(const Inst&) = default;
71 Inst& operator=(const Inst&) = default;
72
73 // Constructors per opcode
74 void InitAlt(uint32_t out, uint32_t out1);
75 void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
76 void InitCapture(int cap, uint32_t out);
77 void InitEmptyWidth(EmptyOp empty, uint32_t out);
78 void InitMatch(int id);
79 void InitNop(uint32_t out);
80 void InitFail();
81
82 // Getters
id(Prog * p)83 int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
opcode()84 InstOp opcode() { return static_cast<InstOp>(out_opcode_ & 7); }
last()85 int last() { return (out_opcode_ >> 3) & 1; }
out()86 int out() { return out_opcode_ >> 4; }
out1()87 int out1() {
88 ABSL_DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch);
89 return out1_;
90 }
cap()91 int cap() {
92 ABSL_DCHECK_EQ(opcode(), kInstCapture);
93 return cap_;
94 }
lo()95 int lo() {
96 ABSL_DCHECK_EQ(opcode(), kInstByteRange);
97 return lo_;
98 }
hi()99 int hi() {
100 ABSL_DCHECK_EQ(opcode(), kInstByteRange);
101 return hi_;
102 }
foldcase()103 int foldcase() {
104 ABSL_DCHECK_EQ(opcode(), kInstByteRange);
105 return hint_foldcase_ & 1;
106 }
hint()107 int hint() {
108 ABSL_DCHECK_EQ(opcode(), kInstByteRange);
109 return hint_foldcase_ >> 1;
110 }
match_id()111 int match_id() {
112 ABSL_DCHECK_EQ(opcode(), kInstMatch);
113 return match_id_;
114 }
empty()115 EmptyOp empty() {
116 ABSL_DCHECK_EQ(opcode(), kInstEmptyWidth);
117 return empty_;
118 }
119
greedy(Prog * p)120 bool greedy(Prog* p) {
121 ABSL_DCHECK_EQ(opcode(), kInstAltMatch);
122 return p->inst(out())->opcode() == kInstByteRange ||
123 (p->inst(out())->opcode() == kInstNop &&
124 p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
125 }
126
127 // Does this inst (an kInstByteRange) match c?
Matches(int c)128 inline bool Matches(int c) {
129 ABSL_DCHECK_EQ(opcode(), kInstByteRange);
130 if (foldcase() && 'A' <= c && c <= 'Z')
131 c += 'a' - 'A';
132 return lo_ <= c && c <= hi_;
133 }
134
135 // Returns string representation for debugging.
136 std::string Dump();
137
138 // Maximum instruction id.
139 // (Must fit in out_opcode_. PatchList/last steal another bit.)
140 static const int kMaxInst = (1<<28) - 1;
141
142 private:
set_opcode(InstOp opcode)143 void set_opcode(InstOp opcode) {
144 out_opcode_ = (out()<<4) | (last()<<3) | opcode;
145 }
146
set_last()147 void set_last() {
148 out_opcode_ = (out()<<4) | (1<<3) | opcode();
149 }
150
set_out(int out)151 void set_out(int out) {
152 out_opcode_ = (out<<4) | (last()<<3) | opcode();
153 }
154
set_out_opcode(int out,InstOp opcode)155 void set_out_opcode(int out, InstOp opcode) {
156 out_opcode_ = (out<<4) | (last()<<3) | opcode;
157 }
158
159 uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
160 union { // additional instruction arguments:
161 uint32_t out1_; // opcode == kInstAlt
162 // alternate next instruction
163
164 int32_t cap_; // opcode == kInstCapture
165 // Index of capture register (holds text
166 // position recorded by capturing parentheses).
167 // For \n (the submatch for the nth parentheses),
168 // the left parenthesis captures into register 2*n
169 // and the right one captures into register 2*n+1.
170
171 int32_t match_id_; // opcode == kInstMatch
172 // Match ID to identify this match (for re2::Set).
173
174 struct { // opcode == kInstByteRange
175 uint8_t lo_; // byte range is lo_-hi_ inclusive
176 uint8_t hi_; //
177 uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase
178 // hint to execution engines: the delta to the
179 // next instruction (in the current list) worth
180 // exploring iff this instruction matched; 0
181 // means there are no remaining possibilities,
182 // which is most likely for character classes.
183 // foldcase: A-Z -> a-z before checking range.
184 };
185
186 EmptyOp empty_; // opcode == kInstEmptyWidth
187 // empty_ is bitwise OR of kEmpty* flags above.
188 };
189
190 friend class Compiler;
191 friend struct PatchList;
192 friend class Prog;
193 };
194
195 // Inst must be trivial so that we can freely clear it with memset(3).
196 // Arrays of Inst are initialised by copying the initial elements with
197 // memmove(3) and then clearing any remaining elements with memset(3).
198 static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
199
200 // Whether to anchor the search.
201 enum Anchor {
202 kUnanchored, // match anywhere
203 kAnchored, // match only starting at beginning of text
204 };
205
206 // Kind of match to look for (for anchor != kFullMatch)
207 //
208 // kLongestMatch mode finds the overall longest
209 // match but still makes its submatch choices the way
210 // Perl would, not in the way prescribed by POSIX.
211 // The POSIX rules are much more expensive to implement,
212 // and no one has needed them.
213 //
214 // kFullMatch is not strictly necessary -- we could use
215 // kLongestMatch and then check the length of the match -- but
216 // the matching code can run faster if it knows to consider only
217 // full matches.
218 enum MatchKind {
219 kFirstMatch, // like Perl, PCRE
220 kLongestMatch, // like egrep or POSIX
221 kFullMatch, // match only entire text; implies anchor==kAnchored
222 kManyMatch // for SearchDFA, records set of matches
223 };
224
inst(int id)225 Inst *inst(int id) { return &inst_[id]; }
start()226 int start() { return start_; }
set_start(int start)227 void set_start(int start) { start_ = start; }
start_unanchored()228 int start_unanchored() { return start_unanchored_; }
set_start_unanchored(int start)229 void set_start_unanchored(int start) { start_unanchored_ = start; }
size()230 int size() { return size_; }
reversed()231 bool reversed() { return reversed_; }
set_reversed(bool reversed)232 void set_reversed(bool reversed) { reversed_ = reversed; }
list_count()233 int list_count() { return list_count_; }
inst_count(InstOp op)234 int inst_count(InstOp op) { return inst_count_[op]; }
list_heads()235 uint16_t* list_heads() { return list_heads_.data(); }
bit_state_text_max_size()236 size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
dfa_mem()237 int64_t dfa_mem() { return dfa_mem_; }
set_dfa_mem(int64_t dfa_mem)238 void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
anchor_start()239 bool anchor_start() { return anchor_start_; }
set_anchor_start(bool b)240 void set_anchor_start(bool b) { anchor_start_ = b; }
anchor_end()241 bool anchor_end() { return anchor_end_; }
set_anchor_end(bool b)242 void set_anchor_end(bool b) { anchor_end_ = b; }
bytemap_range()243 int bytemap_range() { return bytemap_range_; }
bytemap()244 const uint8_t* bytemap() { return bytemap_; }
can_prefix_accel()245 bool can_prefix_accel() { return prefix_size_ != 0; }
246
247 // Accelerates to the first likely occurrence of the prefix.
248 // Returns a pointer to the first byte or NULL if not found.
PrefixAccel(const void * data,size_t size)249 const void* PrefixAccel(const void* data, size_t size) {
250 ABSL_DCHECK(can_prefix_accel());
251 if (prefix_foldcase_) {
252 return PrefixAccel_ShiftDFA(data, size);
253 } else if (prefix_size_ != 1) {
254 return PrefixAccel_FrontAndBack(data, size);
255 } else {
256 return memchr(data, prefix_front_, size);
257 }
258 }
259
260 // Configures prefix accel using the analysis performed during compilation.
261 void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
262
263 // An implementation of prefix accel that uses prefix_dfa_ to perform
264 // case-insensitive search.
265 const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
266
267 // An implementation of prefix accel that looks for prefix_front_ and
268 // prefix_back_ to return fewer false positives than memchr(3) alone.
269 const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
270
271 // Returns string representation of program for debugging.
272 std::string Dump();
273 std::string DumpUnanchored();
274 std::string DumpByteMap();
275
276 // Returns the set of kEmpty flags that are in effect at
277 // position p within context.
278 static uint32_t EmptyFlags(absl::string_view context, const char* p);
279
280 // Returns whether byte c is a word character: ASCII only.
281 // Used by the implementation of \b and \B.
282 // This is not right for Unicode, but:
283 // - it's hard to get right in a byte-at-a-time matching world
284 // (the DFA has only one-byte lookahead).
285 // - even if the lookahead were possible, the Progs would be huge.
286 // This crude approximation is the same one PCRE uses.
IsWordChar(uint8_t c)287 static bool IsWordChar(uint8_t c) {
288 return ('A' <= c && c <= 'Z') ||
289 ('a' <= c && c <= 'z') ||
290 ('0' <= c && c <= '9') ||
291 c == '_';
292 }
293
294 // Execution engines. They all search for the regexp (run the prog)
295 // in text, which is in the larger context (used for ^ $ \b etc).
296 // Anchor and kind control the kind of search.
297 // Returns true if match found, false if not.
298 // If match found, fills match[0..nmatch-1] with submatch info.
299 // match[0] is overall match, match[1] is first set of parens, etc.
300 // If a particular submatch is not matched during the regexp match,
301 // it is set to NULL.
302 //
303 // Matching text == absl::string_view() is treated as any other empty
304 // string, but note that on return, it will not be possible to distinguish
305 // submatches that matched that empty string from submatches that didn't
306 // match anything. Either way, match[i] == NULL.
307
308 // Search using NFA: can find submatches but kind of slow.
309 bool SearchNFA(absl::string_view text, absl::string_view context,
310 Anchor anchor, MatchKind kind, absl::string_view* match,
311 int nmatch);
312
313 // Search using DFA: much faster than NFA but only finds
314 // end of match and can use a lot more memory.
315 // Returns whether a match was found.
316 // If the DFA runs out of memory, sets *failed to true and returns false.
317 // If matches != NULL and kind == kManyMatch and there is a match,
318 // SearchDFA fills matches with the match IDs of the final matching state.
319 bool SearchDFA(absl::string_view text, absl::string_view context,
320 Anchor anchor, MatchKind kind, absl::string_view* match0,
321 bool* failed, SparseSet* matches);
322
323 // The callback issued after building each DFA state with BuildEntireDFA().
324 // If next is null, then the memory budget has been exhausted and building
325 // will halt. Otherwise, the state has been built and next points to an array
326 // of bytemap_range()+1 slots holding the next states as per the bytemap and
327 // kByteEndText. The number of the state is implied by the callback sequence:
328 // the first callback is for state 0, the second callback is for state 1, ...
329 // match indicates whether the state is a matching state.
330 using DFAStateCallback = std::function<void(const int* next, bool match)>;
331
332 // Build the entire DFA for the given match kind.
333 // Usually the DFA is built out incrementally, as needed, which
334 // avoids lots of unnecessary work.
335 // If cb is not empty, it receives one callback per state built.
336 // Returns the number of states built.
337 // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
338 int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
339
340 // Compute bytemap.
341 void ComputeByteMap();
342
343 // Run peep-hole optimizer on program.
344 void Optimize();
345
346 // One-pass NFA: only correct if IsOnePass() is true,
347 // but much faster than NFA (competitive with PCRE)
348 // for those expressions.
349 bool IsOnePass();
350 bool SearchOnePass(absl::string_view text, absl::string_view context,
351 Anchor anchor, MatchKind kind, absl::string_view* match,
352 int nmatch);
353
354 // Bit-state backtracking. Fast on small cases but uses memory
355 // proportional to the product of the list count and the text size.
CanBitState()356 bool CanBitState() { return list_heads_.data() != NULL; }
357 bool SearchBitState(absl::string_view text, absl::string_view context,
358 Anchor anchor, MatchKind kind, absl::string_view* match,
359 int nmatch);
360
361 static const int kMaxOnePassCapture = 5; // $0 through $4
362
363 // Backtracking search: the gold standard against which the other
364 // implementations are checked. FOR TESTING ONLY.
365 // It allocates a ton of memory to avoid running forever.
366 // It is also recursive, so can't use in production (will overflow stacks).
367 // The name "Unsafe" here is supposed to be a flag that
368 // you should not be using this function.
369 bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
370 Anchor anchor, MatchKind kind,
371 absl::string_view* match, int nmatch);
372
373 // Computes range for any strings matching regexp. The min and max can in
374 // some cases be arbitrarily precise, so the caller gets to specify the
375 // maximum desired length of string returned.
376 //
377 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
378 // string s that is an anchored match for this regexp satisfies
379 // min <= s && s <= max.
380 //
381 // Note that PossibleMatchRange() will only consider the first copy of an
382 // infinitely repeated element (i.e., any regexp element followed by a '*' or
383 // '+' operator). Regexps with "{N}" constructions are not affected, as those
384 // do not compile down to infinite repetitions.
385 //
386 // Returns true on success, false on error.
387 bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
388
389 // Outputs the program fanout into the given sparse array.
390 void Fanout(SparseArray<int>* fanout);
391
392 // Compiles a collection of regexps to Prog. Each regexp will have
393 // its own Match instruction recording the index in the output vector.
394 static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
395
396 // Flattens the Prog from "tree" form to "list" form. This is an in-place
397 // operation in the sense that the old instructions are lost.
398 void Flatten();
399
400 // Walks the Prog; the "successor roots" or predecessors of the reachable
401 // instructions are marked in rootmap or predmap/predvec, respectively.
402 // reachable and stk are preallocated scratch structures.
403 void MarkSuccessors(SparseArray<int>* rootmap,
404 SparseArray<int>* predmap,
405 std::vector<std::vector<int>>* predvec,
406 SparseSet* reachable, std::vector<int>* stk);
407
408 // Walks the Prog from the given "root" instruction; the "dominator root"
409 // of the reachable instructions (if such exists) is marked in rootmap.
410 // reachable and stk are preallocated scratch structures.
411 void MarkDominator(int root, SparseArray<int>* rootmap,
412 SparseArray<int>* predmap,
413 std::vector<std::vector<int>>* predvec,
414 SparseSet* reachable, std::vector<int>* stk);
415
416 // Walks the Prog from the given "root" instruction; the reachable
417 // instructions are emitted in "list" form and appended to flat.
418 // reachable and stk are preallocated scratch structures.
419 void EmitList(int root, SparseArray<int>* rootmap,
420 std::vector<Inst>* flat,
421 SparseSet* reachable, std::vector<int>* stk);
422
423 // Computes hints for ByteRange instructions in [begin, end).
424 void ComputeHints(std::vector<Inst>* flat, int begin, int end);
425
426 // Controls whether the DFA should bail out early if the NFA would be faster.
427 // FOR TESTING ONLY.
428 static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
429
430 private:
431 friend class Compiler;
432
433 DFA* GetDFA(MatchKind kind);
434 void DeleteDFA(DFA* dfa);
435
436 bool anchor_start_; // regexp has explicit start anchor
437 bool anchor_end_; // regexp has explicit end anchor
438 bool reversed_; // whether program runs backward over input
439 bool did_flatten_; // has Flatten been called?
440 bool did_onepass_; // has IsOnePass been called?
441
442 int start_; // entry point for program
443 int start_unanchored_; // unanchored entry point for program
444 int size_; // number of instructions
445 int bytemap_range_; // bytemap_[x] < bytemap_range_
446
447 bool prefix_foldcase_; // whether prefix is case-insensitive
448 size_t prefix_size_; // size of prefix (0 if no prefix)
449 union {
450 uint64_t* prefix_dfa_; // "Shift DFA" for prefix
451 struct {
452 int prefix_front_; // first byte of prefix
453 int prefix_back_; // last byte of prefix
454 };
455 };
456
457 int list_count_; // count of lists (see above)
458 int inst_count_[kNumInst]; // count of instructions by opcode
459 PODArray<uint16_t> list_heads_; // sparse array enumerating list heads
460 // not populated if size_ is overly large
461 size_t bit_state_text_max_size_; // upper bound (inclusive) on text.size()
462
463 PODArray<Inst> inst_; // pointer to instruction array
464 PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes
465
466 int64_t dfa_mem_; // Maximum memory for DFAs.
467 DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
468 DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
469
470 uint8_t bytemap_[256]; // map from input bytes to byte classes
471
472 absl::once_flag dfa_first_once_;
473 absl::once_flag dfa_longest_once_;
474
475 Prog(const Prog&) = delete;
476 Prog& operator=(const Prog&) = delete;
477 };
478
479 // std::string_view in MSVC has iterators that aren't just pointers and
480 // that don't allow comparisons between different objects - not even if
481 // those objects are views into the same string! Thus, we provide these
482 // conversion functions for convenience.
BeginPtr(absl::string_view s)483 static inline const char* BeginPtr(absl::string_view s) {
484 return s.data();
485 }
EndPtr(absl::string_view s)486 static inline const char* EndPtr(absl::string_view s) {
487 return s.data() + s.size();
488 }
489
490 } // namespace re2
491
492 #endif // RE2_PROG_H_
493