• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_PROG_H_
6 #define RE2_PROG_H_
7 
8 // Compiled representation of regular expressions.
9 // See regexp.h for the Regexp class, which represents a regular
10 // expression symbolically.
11 
12 #include <stdint.h>
13 
14 #include <functional>
15 #include <string>
16 #include <type_traits>
17 #include <vector>
18 
19 #include "absl/base/call_once.h"
20 #include "absl/log/absl_check.h"
21 #include "absl/log/absl_log.h"
22 #include "absl/strings/string_view.h"
23 #include "re2/pod_array.h"
24 #include "re2/re2.h"
25 #include "re2/sparse_array.h"
26 #include "re2/sparse_set.h"
27 
28 namespace re2 {
29 
30 // Opcodes for Inst
31 enum InstOp {
32   kInstAlt = 0,      // choose between out_ and out1_
33   kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
34   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
35   kInstCapture,      // capturing parenthesis number cap_
36   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
37   kInstMatch,        // found a match!
38   kInstNop,          // no-op; occasionally unavoidable
39   kInstFail,         // never match; occasionally unavoidable
40   kNumInst,
41 };
42 
43 // Bit flags for empty-width specials
44 enum EmptyOp {
45   kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
46   kEmptyEndLine          = 1<<1,      // $ - end of line
47   kEmptyBeginText        = 1<<2,      // \A - beginning of text
48   kEmptyEndText          = 1<<3,      // \z - end of text
49   kEmptyWordBoundary     = 1<<4,      // \b - word boundary
50   kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
51   kEmptyAllFlags         = (1<<6)-1,
52 };
53 
54 class DFA;
55 class Regexp;
56 
57 // Compiled form of regexp program.
58 class Prog {
59  public:
60   Prog();
61   ~Prog();
62 
63   // Single instruction in regexp program.
64   class Inst {
65    public:
66     // See the assertion below for why this is so.
67     Inst() = default;
68 
69     // Copyable.
70     Inst(const Inst&) = default;
71     Inst& operator=(const Inst&) = default;
72 
73     // Constructors per opcode
74     void InitAlt(uint32_t out, uint32_t out1);
75     void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
76     void InitCapture(int cap, uint32_t out);
77     void InitEmptyWidth(EmptyOp empty, uint32_t out);
78     void InitMatch(int id);
79     void InitNop(uint32_t out);
80     void InitFail();
81 
82     // Getters
id(Prog * p)83     int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
opcode()84     InstOp opcode() { return static_cast<InstOp>(out_opcode_ & 7); }
last()85     int last() { return (out_opcode_ >> 3) & 1; }
out()86     int out() { return out_opcode_ >> 4; }
out1()87     int out1() {
88       ABSL_DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch);
89       return out1_;
90     }
cap()91     int cap() {
92       ABSL_DCHECK_EQ(opcode(), kInstCapture);
93       return cap_;
94     }
lo()95     int lo() {
96       ABSL_DCHECK_EQ(opcode(), kInstByteRange);
97       return lo_;
98     }
hi()99     int hi() {
100       ABSL_DCHECK_EQ(opcode(), kInstByteRange);
101       return hi_;
102     }
foldcase()103     int foldcase() {
104       ABSL_DCHECK_EQ(opcode(), kInstByteRange);
105       return hint_foldcase_ & 1;
106     }
hint()107     int hint() {
108       ABSL_DCHECK_EQ(opcode(), kInstByteRange);
109       return hint_foldcase_ >> 1;
110     }
match_id()111     int match_id() {
112       ABSL_DCHECK_EQ(opcode(), kInstMatch);
113       return match_id_;
114     }
empty()115     EmptyOp empty() {
116       ABSL_DCHECK_EQ(opcode(), kInstEmptyWidth);
117       return empty_;
118     }
119 
greedy(Prog * p)120     bool greedy(Prog* p) {
121       ABSL_DCHECK_EQ(opcode(), kInstAltMatch);
122       return p->inst(out())->opcode() == kInstByteRange ||
123              (p->inst(out())->opcode() == kInstNop &&
124               p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
125     }
126 
127     // Does this inst (an kInstByteRange) match c?
Matches(int c)128     inline bool Matches(int c) {
129       ABSL_DCHECK_EQ(opcode(), kInstByteRange);
130       if (foldcase() && 'A' <= c && c <= 'Z')
131         c += 'a' - 'A';
132       return lo_ <= c && c <= hi_;
133     }
134 
135     // Returns string representation for debugging.
136     std::string Dump();
137 
138     // Maximum instruction id.
139     // (Must fit in out_opcode_. PatchList/last steal another bit.)
140     static const int kMaxInst = (1<<28) - 1;
141 
142    private:
set_opcode(InstOp opcode)143     void set_opcode(InstOp opcode) {
144       out_opcode_ = (out()<<4) | (last()<<3) | opcode;
145     }
146 
set_last()147     void set_last() {
148       out_opcode_ = (out()<<4) | (1<<3) | opcode();
149     }
150 
set_out(int out)151     void set_out(int out) {
152       out_opcode_ = (out<<4) | (last()<<3) | opcode();
153     }
154 
set_out_opcode(int out,InstOp opcode)155     void set_out_opcode(int out, InstOp opcode) {
156       out_opcode_ = (out<<4) | (last()<<3) | opcode;
157     }
158 
159     uint32_t out_opcode_;  // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
160     union {                // additional instruction arguments:
161       uint32_t out1_;      // opcode == kInstAlt
162                            //   alternate next instruction
163 
164       int32_t cap_;        // opcode == kInstCapture
165                            //   Index of capture register (holds text
166                            //   position recorded by capturing parentheses).
167                            //   For \n (the submatch for the nth parentheses),
168                            //   the left parenthesis captures into register 2*n
169                            //   and the right one captures into register 2*n+1.
170 
171       int32_t match_id_;   // opcode == kInstMatch
172                            //   Match ID to identify this match (for re2::Set).
173 
174       struct {             // opcode == kInstByteRange
175         uint8_t lo_;       //   byte range is lo_-hi_ inclusive
176         uint8_t hi_;       //
177         uint16_t hint_foldcase_;  // 15 bits: hint, 1 (low) bit: foldcase
178                            //   hint to execution engines: the delta to the
179                            //   next instruction (in the current list) worth
180                            //   exploring iff this instruction matched; 0
181                            //   means there are no remaining possibilities,
182                            //   which is most likely for character classes.
183                            //   foldcase: A-Z -> a-z before checking range.
184       };
185 
186       EmptyOp empty_;       // opcode == kInstEmptyWidth
187                             //   empty_ is bitwise OR of kEmpty* flags above.
188     };
189 
190     friend class Compiler;
191     friend struct PatchList;
192     friend class Prog;
193   };
194 
195   // Inst must be trivial so that we can freely clear it with memset(3).
196   // Arrays of Inst are initialised by copying the initial elements with
197   // memmove(3) and then clearing any remaining elements with memset(3).
198   static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
199 
200   // Whether to anchor the search.
201   enum Anchor {
202     kUnanchored,  // match anywhere
203     kAnchored,    // match only starting at beginning of text
204   };
205 
206   // Kind of match to look for (for anchor != kFullMatch)
207   //
208   // kLongestMatch mode finds the overall longest
209   // match but still makes its submatch choices the way
210   // Perl would, not in the way prescribed by POSIX.
211   // The POSIX rules are much more expensive to implement,
212   // and no one has needed them.
213   //
214   // kFullMatch is not strictly necessary -- we could use
215   // kLongestMatch and then check the length of the match -- but
216   // the matching code can run faster if it knows to consider only
217   // full matches.
218   enum MatchKind {
219     kFirstMatch,     // like Perl, PCRE
220     kLongestMatch,   // like egrep or POSIX
221     kFullMatch,      // match only entire text; implies anchor==kAnchored
222     kManyMatch       // for SearchDFA, records set of matches
223   };
224 
inst(int id)225   Inst *inst(int id) { return &inst_[id]; }
start()226   int start() { return start_; }
set_start(int start)227   void set_start(int start) { start_ = start; }
start_unanchored()228   int start_unanchored() { return start_unanchored_; }
set_start_unanchored(int start)229   void set_start_unanchored(int start) { start_unanchored_ = start; }
size()230   int size() { return size_; }
reversed()231   bool reversed() { return reversed_; }
set_reversed(bool reversed)232   void set_reversed(bool reversed) { reversed_ = reversed; }
list_count()233   int list_count() { return list_count_; }
inst_count(InstOp op)234   int inst_count(InstOp op) { return inst_count_[op]; }
list_heads()235   uint16_t* list_heads() { return list_heads_.data(); }
bit_state_text_max_size()236   size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
dfa_mem()237   int64_t dfa_mem() { return dfa_mem_; }
set_dfa_mem(int64_t dfa_mem)238   void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
anchor_start()239   bool anchor_start() { return anchor_start_; }
set_anchor_start(bool b)240   void set_anchor_start(bool b) { anchor_start_ = b; }
anchor_end()241   bool anchor_end() { return anchor_end_; }
set_anchor_end(bool b)242   void set_anchor_end(bool b) { anchor_end_ = b; }
bytemap_range()243   int bytemap_range() { return bytemap_range_; }
bytemap()244   const uint8_t* bytemap() { return bytemap_; }
can_prefix_accel()245   bool can_prefix_accel() { return prefix_size_ != 0; }
246 
247   // Accelerates to the first likely occurrence of the prefix.
248   // Returns a pointer to the first byte or NULL if not found.
PrefixAccel(const void * data,size_t size)249   const void* PrefixAccel(const void* data, size_t size) {
250     ABSL_DCHECK(can_prefix_accel());
251     if (prefix_foldcase_) {
252       return PrefixAccel_ShiftDFA(data, size);
253     } else if (prefix_size_ != 1) {
254       return PrefixAccel_FrontAndBack(data, size);
255     } else {
256       return memchr(data, prefix_front_, size);
257     }
258   }
259 
260   // Configures prefix accel using the analysis performed during compilation.
261   void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
262 
263   // An implementation of prefix accel that uses prefix_dfa_ to perform
264   // case-insensitive search.
265   const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
266 
267   // An implementation of prefix accel that looks for prefix_front_ and
268   // prefix_back_ to return fewer false positives than memchr(3) alone.
269   const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
270 
271   // Returns string representation of program for debugging.
272   std::string Dump();
273   std::string DumpUnanchored();
274   std::string DumpByteMap();
275 
276   // Returns the set of kEmpty flags that are in effect at
277   // position p within context.
278   static uint32_t EmptyFlags(absl::string_view context, const char* p);
279 
280   // Returns whether byte c is a word character: ASCII only.
281   // Used by the implementation of \b and \B.
282   // This is not right for Unicode, but:
283   //   - it's hard to get right in a byte-at-a-time matching world
284   //     (the DFA has only one-byte lookahead).
285   //   - even if the lookahead were possible, the Progs would be huge.
286   // This crude approximation is the same one PCRE uses.
IsWordChar(uint8_t c)287   static bool IsWordChar(uint8_t c) {
288     return ('A' <= c && c <= 'Z') ||
289            ('a' <= c && c <= 'z') ||
290            ('0' <= c && c <= '9') ||
291            c == '_';
292   }
293 
294   // Execution engines.  They all search for the regexp (run the prog)
295   // in text, which is in the larger context (used for ^ $ \b etc).
296   // Anchor and kind control the kind of search.
297   // Returns true if match found, false if not.
298   // If match found, fills match[0..nmatch-1] with submatch info.
299   // match[0] is overall match, match[1] is first set of parens, etc.
300   // If a particular submatch is not matched during the regexp match,
301   // it is set to NULL.
302   //
303   // Matching text == absl::string_view() is treated as any other empty
304   // string, but note that on return, it will not be possible to distinguish
305   // submatches that matched that empty string from submatches that didn't
306   // match anything.  Either way, match[i] == NULL.
307 
308   // Search using NFA: can find submatches but kind of slow.
309   bool SearchNFA(absl::string_view text, absl::string_view context,
310                  Anchor anchor, MatchKind kind, absl::string_view* match,
311                  int nmatch);
312 
313   // Search using DFA: much faster than NFA but only finds
314   // end of match and can use a lot more memory.
315   // Returns whether a match was found.
316   // If the DFA runs out of memory, sets *failed to true and returns false.
317   // If matches != NULL and kind == kManyMatch and there is a match,
318   // SearchDFA fills matches with the match IDs of the final matching state.
319   bool SearchDFA(absl::string_view text, absl::string_view context,
320                  Anchor anchor, MatchKind kind, absl::string_view* match0,
321                  bool* failed, SparseSet* matches);
322 
323   // The callback issued after building each DFA state with BuildEntireDFA().
324   // If next is null, then the memory budget has been exhausted and building
325   // will halt. Otherwise, the state has been built and next points to an array
326   // of bytemap_range()+1 slots holding the next states as per the bytemap and
327   // kByteEndText. The number of the state is implied by the callback sequence:
328   // the first callback is for state 0, the second callback is for state 1, ...
329   // match indicates whether the state is a matching state.
330   using DFAStateCallback = std::function<void(const int* next, bool match)>;
331 
332   // Build the entire DFA for the given match kind.
333   // Usually the DFA is built out incrementally, as needed, which
334   // avoids lots of unnecessary work.
335   // If cb is not empty, it receives one callback per state built.
336   // Returns the number of states built.
337   // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
338   int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
339 
340   // Compute bytemap.
341   void ComputeByteMap();
342 
343   // Run peep-hole optimizer on program.
344   void Optimize();
345 
346   // One-pass NFA: only correct if IsOnePass() is true,
347   // but much faster than NFA (competitive with PCRE)
348   // for those expressions.
349   bool IsOnePass();
350   bool SearchOnePass(absl::string_view text, absl::string_view context,
351                      Anchor anchor, MatchKind kind, absl::string_view* match,
352                      int nmatch);
353 
354   // Bit-state backtracking.  Fast on small cases but uses memory
355   // proportional to the product of the list count and the text size.
CanBitState()356   bool CanBitState() { return list_heads_.data() != NULL; }
357   bool SearchBitState(absl::string_view text, absl::string_view context,
358                       Anchor anchor, MatchKind kind, absl::string_view* match,
359                       int nmatch);
360 
361   static const int kMaxOnePassCapture = 5;  // $0 through $4
362 
363   // Backtracking search: the gold standard against which the other
364   // implementations are checked.  FOR TESTING ONLY.
365   // It allocates a ton of memory to avoid running forever.
366   // It is also recursive, so can't use in production (will overflow stacks).
367   // The name "Unsafe" here is supposed to be a flag that
368   // you should not be using this function.
369   bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
370                              Anchor anchor, MatchKind kind,
371                              absl::string_view* match, int nmatch);
372 
373   // Computes range for any strings matching regexp. The min and max can in
374   // some cases be arbitrarily precise, so the caller gets to specify the
375   // maximum desired length of string returned.
376   //
377   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
378   // string s that is an anchored match for this regexp satisfies
379   //   min <= s && s <= max.
380   //
381   // Note that PossibleMatchRange() will only consider the first copy of an
382   // infinitely repeated element (i.e., any regexp element followed by a '*' or
383   // '+' operator). Regexps with "{N}" constructions are not affected, as those
384   // do not compile down to infinite repetitions.
385   //
386   // Returns true on success, false on error.
387   bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
388 
389   // Outputs the program fanout into the given sparse array.
390   void Fanout(SparseArray<int>* fanout);
391 
392   // Compiles a collection of regexps to Prog.  Each regexp will have
393   // its own Match instruction recording the index in the output vector.
394   static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
395 
396   // Flattens the Prog from "tree" form to "list" form. This is an in-place
397   // operation in the sense that the old instructions are lost.
398   void Flatten();
399 
400   // Walks the Prog; the "successor roots" or predecessors of the reachable
401   // instructions are marked in rootmap or predmap/predvec, respectively.
402   // reachable and stk are preallocated scratch structures.
403   void MarkSuccessors(SparseArray<int>* rootmap,
404                       SparseArray<int>* predmap,
405                       std::vector<std::vector<int>>* predvec,
406                       SparseSet* reachable, std::vector<int>* stk);
407 
408   // Walks the Prog from the given "root" instruction; the "dominator root"
409   // of the reachable instructions (if such exists) is marked in rootmap.
410   // reachable and stk are preallocated scratch structures.
411   void MarkDominator(int root, SparseArray<int>* rootmap,
412                      SparseArray<int>* predmap,
413                      std::vector<std::vector<int>>* predvec,
414                      SparseSet* reachable, std::vector<int>* stk);
415 
416   // Walks the Prog from the given "root" instruction; the reachable
417   // instructions are emitted in "list" form and appended to flat.
418   // reachable and stk are preallocated scratch structures.
419   void EmitList(int root, SparseArray<int>* rootmap,
420                 std::vector<Inst>* flat,
421                 SparseSet* reachable, std::vector<int>* stk);
422 
423   // Computes hints for ByteRange instructions in [begin, end).
424   void ComputeHints(std::vector<Inst>* flat, int begin, int end);
425 
426   // Controls whether the DFA should bail out early if the NFA would be faster.
427   // FOR TESTING ONLY.
428   static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
429 
430  private:
431   friend class Compiler;
432 
433   DFA* GetDFA(MatchKind kind);
434   void DeleteDFA(DFA* dfa);
435 
436   bool anchor_start_;       // regexp has explicit start anchor
437   bool anchor_end_;         // regexp has explicit end anchor
438   bool reversed_;           // whether program runs backward over input
439   bool did_flatten_;        // has Flatten been called?
440   bool did_onepass_;        // has IsOnePass been called?
441 
442   int start_;               // entry point for program
443   int start_unanchored_;    // unanchored entry point for program
444   int size_;                // number of instructions
445   int bytemap_range_;       // bytemap_[x] < bytemap_range_
446 
447   bool prefix_foldcase_;    // whether prefix is case-insensitive
448   size_t prefix_size_;      // size of prefix (0 if no prefix)
449   union {
450     uint64_t* prefix_dfa_;  // "Shift DFA" for prefix
451     struct {
452       int prefix_front_;    // first byte of prefix
453       int prefix_back_;     // last byte of prefix
454     };
455   };
456 
457   int list_count_;                  // count of lists (see above)
458   int inst_count_[kNumInst];        // count of instructions by opcode
459   PODArray<uint16_t> list_heads_;   // sparse array enumerating list heads
460                                     // not populated if size_ is overly large
461   size_t bit_state_text_max_size_;  // upper bound (inclusive) on text.size()
462 
463   PODArray<Inst> inst_;              // pointer to instruction array
464   PODArray<uint8_t> onepass_nodes_;  // data for OnePass nodes
465 
466   int64_t dfa_mem_;         // Maximum memory for DFAs.
467   DFA* dfa_first_;          // DFA cached for kFirstMatch/kManyMatch
468   DFA* dfa_longest_;        // DFA cached for kLongestMatch/kFullMatch
469 
470   uint8_t bytemap_[256];    // map from input bytes to byte classes
471 
472   absl::once_flag dfa_first_once_;
473   absl::once_flag dfa_longest_once_;
474 
475   Prog(const Prog&) = delete;
476   Prog& operator=(const Prog&) = delete;
477 };
478 
479 // std::string_view in MSVC has iterators that aren't just pointers and
480 // that don't allow comparisons between different objects - not even if
481 // those objects are views into the same string! Thus, we provide these
482 // conversion functions for convenience.
BeginPtr(absl::string_view s)483 static inline const char* BeginPtr(absl::string_view s) {
484   return s.data();
485 }
EndPtr(absl::string_view s)486 static inline const char* EndPtr(absl::string_view s) {
487   return s.data() + s.size();
488 }
489 
490 }  // namespace re2
491 
492 #endif  // RE2_PROG_H_
493