• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_RE2_H_
6 #define RE2_RE2_H_
7 
8 // C++ interface to the re2 regular-expression library.
9 // RE2 supports Perl-style regular expressions (with extensions like
10 // \d, \w, \s, ...).
11 //
12 // -----------------------------------------------------------------------
13 // REGEXP SYNTAX:
14 //
15 // This module uses the re2 library and hence supports
16 // its syntax for regular expressions, which is similar to Perl's with
17 // some of the more complicated things thrown away.  In particular,
18 // backreferences and generalized assertions are not available, nor is \Z.
19 //
20 // See https://github.com/google/re2/wiki/Syntax for the syntax
21 // supported by RE2, and a comparison with PCRE and PERL regexps.
22 //
23 // For those not familiar with Perl's regular expressions,
24 // here are some examples of the most commonly used extensions:
25 //
26 //   "hello (\\w+) world"  -- \w matches a "word" character
27 //   "version (\\d+)"      -- \d matches a digit
28 //   "hello\\s+world"      -- \s matches any whitespace character
29 //   "\\b(\\w+)\\b"        -- \b matches non-empty string at word boundary
30 //   "(?i)hello"           -- (?i) turns on case-insensitive matching
31 //   "/\\*(.*?)\\*/"       -- .*? matches . minimum no. of times possible
32 //
33 // The double backslashes are needed when writing C++ string literals.
34 // However, they should NOT be used when writing C++11 raw string literals:
35 //
36 //   R"(hello (\w+) world)"  -- \w matches a "word" character
37 //   R"(version (\d+))"      -- \d matches a digit
38 //   R"(hello\s+world)"      -- \s matches any whitespace character
39 //   R"(\b(\w+)\b)"          -- \b matches non-empty string at word boundary
40 //   R"((?i)hello)"          -- (?i) turns on case-insensitive matching
41 //   R"(/\*(.*?)\*/)"        -- .*? matches . minimum no. of times possible
42 //
43 // -----------------------------------------------------------------------
44 // MATCHING INTERFACE:
45 //
46 // The "FullMatch" operation checks that supplied text matches a
47 // supplied pattern exactly.
48 //
49 // Example: successful match
50 //    CHECK(RE2::FullMatch("hello", "h.*o"));
51 //
52 // Example: unsuccessful match (requires full match):
53 //    CHECK(!RE2::FullMatch("hello", "e"));
54 //
55 // -----------------------------------------------------------------------
56 // UTF-8 AND THE MATCHING INTERFACE:
57 //
58 // By default, the pattern and input text are interpreted as UTF-8.
59 // The RE2::Latin1 option causes them to be interpreted as Latin-1.
60 //
61 // Example:
62 //    CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
63 //    CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
64 //
65 // -----------------------------------------------------------------------
66 // MATCHING WITH SUBSTRING EXTRACTION:
67 //
68 // You can supply extra pointer arguments to extract matched substrings.
69 // On match failure, none of the pointees will have been modified.
70 // On match success, the substrings will be converted (as necessary) and
71 // their values will be assigned to their pointees until all conversions
72 // have succeeded or one conversion has failed.
73 // On conversion failure, the pointees will be in an indeterminate state
74 // because the caller has no way of knowing which conversion failed.
75 // However, conversion cannot fail for types like string and StringPiece
76 // that do not inspect the substring contents. Hence, in the common case
77 // where all of the pointees are of such types, failure is always due to
78 // match failure and thus none of the pointees will have been modified.
79 //
80 // Example: extracts "ruby" into "s" and 1234 into "i"
81 //    int i;
82 //    std::string s;
83 //    CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
84 //
85 // Example: fails because string cannot be stored in integer
86 //    CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
87 //
88 // Example: fails because there aren't enough sub-patterns
89 //    CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
90 //
91 // Example: does not try to extract any extra sub-patterns
92 //    CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
93 //
94 // Example: does not try to extract into NULL
95 //    CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
96 //
97 // Example: integer overflow causes failure
98 //    CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
99 //
100 // NOTE(rsc): Asking for substrings slows successful matches quite a bit.
101 // This may get a little faster in the future, but right now is slower
102 // than PCRE.  On the other hand, failed matches run *very* fast (faster
103 // than PCRE), as do matches without substring extraction.
104 //
105 // -----------------------------------------------------------------------
106 // PARTIAL MATCHES
107 //
108 // You can use the "PartialMatch" operation when you want the pattern
109 // to match any substring of the text.
110 //
111 // Example: simple search for a string:
112 //      CHECK(RE2::PartialMatch("hello", "ell"));
113 //
114 // Example: find first number in a string
115 //      int number;
116 //      CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
117 //      CHECK_EQ(number, 100);
118 //
119 // -----------------------------------------------------------------------
120 // PRE-COMPILED REGULAR EXPRESSIONS
121 //
122 // RE2 makes it easy to use any string as a regular expression, without
123 // requiring a separate compilation step.
124 //
125 // If speed is of the essence, you can create a pre-compiled "RE2"
126 // object from the pattern and use it multiple times.  If you do so,
127 // you can typically parse text faster than with sscanf.
128 //
129 // Example: precompile pattern for faster matching:
130 //    RE2 pattern("h.*o");
131 //    while (ReadLine(&str)) {
132 //      if (RE2::FullMatch(str, pattern)) ...;
133 //    }
134 //
135 // -----------------------------------------------------------------------
136 // SCANNING TEXT INCREMENTALLY
137 //
138 // The "Consume" operation may be useful if you want to repeatedly
139 // match regular expressions at the front of a string and skip over
140 // them as they match.  This requires use of the "StringPiece" type,
141 // which represents a sub-range of a real string.
142 //
143 // Example: read lines of the form "var = value" from a string.
144 //      std::string contents = ...;     // Fill string somehow
145 //      StringPiece input(contents);    // Wrap a StringPiece around it
146 //
147 //      std::string var;
148 //      int value;
149 //      while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
150 //        ...;
151 //      }
152 //
153 // Each successful call to "Consume" will set "var/value", and also
154 // advance "input" so it points past the matched text.  Note that if the
155 // regular expression matches an empty string, input will advance
156 // by 0 bytes.  If the regular expression being used might match
157 // an empty string, the loop body must check for this case and either
158 // advance the string or break out of the loop.
159 //
160 // The "FindAndConsume" operation is similar to "Consume" but does not
161 // anchor your match at the beginning of the string.  For example, you
162 // could extract all words from a string by repeatedly calling
163 //     RE2::FindAndConsume(&input, "(\\w+)", &word)
164 //
165 // -----------------------------------------------------------------------
166 // USING VARIABLE NUMBER OF ARGUMENTS
167 //
168 // The above operations require you to know the number of arguments
169 // when you write the code.  This is not always possible or easy (for
170 // example, the regular expression may be calculated at run time).
171 // You can use the "N" version of the operations when the number of
172 // match arguments are determined at run time.
173 //
174 // Example:
175 //   const RE2::Arg* args[10];
176 //   int n;
177 //   // ... populate args with pointers to RE2::Arg values ...
178 //   // ... set n to the number of RE2::Arg objects ...
179 //   bool match = RE2::FullMatchN(input, pattern, args, n);
180 //
181 // The last statement is equivalent to
182 //
183 //   bool match = RE2::FullMatch(input, pattern,
184 //                               *args[0], *args[1], ..., *args[n - 1]);
185 //
186 // -----------------------------------------------------------------------
187 // PARSING HEX/OCTAL/C-RADIX NUMBERS
188 //
189 // By default, if you pass a pointer to a numeric value, the
190 // corresponding text is interpreted as a base-10 number.  You can
191 // instead wrap the pointer with a call to one of the operators Hex(),
192 // Octal(), or CRadix() to interpret the text in another base.  The
193 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
194 // prefixes, but defaults to base-10.
195 //
196 // Example:
197 //   int a, b, c, d;
198 //   CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
199 //         RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
200 // will leave 64 in a, b, c, and d.
201 
202 #include <stddef.h>
203 #include <stdint.h>
204 #include <algorithm>
205 #include <map>
206 #include <mutex>
207 #include <string>
208 #include <vector>
209 
210 #if defined(__APPLE__)
211 #include <TargetConditionals.h>
212 #endif
213 
214 #include "re2/stringpiece.h"
215 
216 namespace re2 {
217 class Prog;
218 class Regexp;
219 }  // namespace re2
220 
221 namespace re2 {
222 
223 // Interface for regular expression matching.  Also corresponds to a
224 // pre-compiled regular expression.  An "RE2" object is safe for
225 // concurrent use by multiple threads.
226 class RE2 {
227  public:
228   // We convert user-passed pointers into special Arg objects
229   class Arg;
230   class Options;
231 
232   // Defined in set.h.
233   class Set;
234 
235   enum ErrorCode {
236     NoError = 0,
237 
238     // Unexpected error
239     ErrorInternal,
240 
241     // Parse errors
242     ErrorBadEscape,          // bad escape sequence
243     ErrorBadCharClass,       // bad character class
244     ErrorBadCharRange,       // bad character class range
245     ErrorMissingBracket,     // missing closing ]
246     ErrorMissingParen,       // missing closing )
247     ErrorTrailingBackslash,  // trailing \ at end of regexp
248     ErrorRepeatArgument,     // repeat argument missing, e.g. "*"
249     ErrorRepeatSize,         // bad repetition argument
250     ErrorRepeatOp,           // bad repetition operator
251     ErrorBadPerlOp,          // bad perl operator
252     ErrorBadUTF8,            // invalid UTF-8 in regexp
253     ErrorBadNamedCapture,    // bad named capture group
254     ErrorPatternTooLarge     // pattern too large (compile failed)
255   };
256 
257   // Predefined common options.
258   // If you need more complicated things, instantiate
259   // an Option class, possibly passing one of these to
260   // the Option constructor, change the settings, and pass that
261   // Option class to the RE2 constructor.
262   enum CannedOptions {
263     DefaultOptions = 0,
264     Latin1, // treat input as Latin-1 (default UTF-8)
265     POSIX, // POSIX syntax, leftmost-longest match
266     Quiet // do not log about regexp parse errors
267   };
268 
269   // Need to have the const char* and const std::string& forms for implicit
270   // conversions when passing string literals to FullMatch and PartialMatch.
271   // Otherwise the StringPiece form would be sufficient.
272 #ifndef SWIG
273   RE2(const char* pattern);
274   RE2(const std::string& pattern);
275 #endif
276   RE2(const StringPiece& pattern);
277   RE2(const StringPiece& pattern, const Options& options);
278   ~RE2();
279 
280   // Returns whether RE2 was created properly.
ok()281   bool ok() const { return error_code() == NoError; }
282 
283   // The string specification for this RE2.  E.g.
284   //   RE2 re("ab*c?d+");
285   //   re.pattern();    // "ab*c?d+"
pattern()286   const std::string& pattern() const { return pattern_; }
287 
288   // If RE2 could not be created properly, returns an error string.
289   // Else returns the empty string.
error()290   const std::string& error() const { return *error_; }
291 
292   // If RE2 could not be created properly, returns an error code.
293   // Else returns RE2::NoError (== 0).
error_code()294   ErrorCode error_code() const { return error_code_; }
295 
296   // If RE2 could not be created properly, returns the offending
297   // portion of the regexp.
error_arg()298   const std::string& error_arg() const { return error_arg_; }
299 
300   // Returns the program size, a very approximate measure of a regexp's "cost".
301   // Larger numbers are more expensive than smaller numbers.
302   int ProgramSize() const;
303   int ReverseProgramSize() const;
304 
305   // If histogram is not null, outputs the program fanout
306   // as a histogram bucketed by powers of 2.
307   // Returns the number of the largest non-empty bucket.
308   int ProgramFanout(std::vector<int>* histogram) const;
309   int ReverseProgramFanout(std::vector<int>* histogram) const;
310 
311   // Returns the underlying Regexp; not for general use.
312   // Returns entire_regexp_ so that callers don't need
313   // to know about prefix_ and prefix_foldcase_.
Regexp()314   re2::Regexp* Regexp() const { return entire_regexp_; }
315 
316   /***** The array-based matching interface ******/
317 
318   // The functions here have names ending in 'N' and are used to implement
319   // the functions whose names are the prefix before the 'N'. It is sometimes
320   // useful to invoke them directly, but the syntax is awkward, so the 'N'-less
321   // versions should be preferred.
322   static bool FullMatchN(const StringPiece& text, const RE2& re,
323                          const Arg* const args[], int n);
324   static bool PartialMatchN(const StringPiece& text, const RE2& re,
325                             const Arg* const args[], int n);
326   static bool ConsumeN(StringPiece* input, const RE2& re,
327                        const Arg* const args[], int n);
328   static bool FindAndConsumeN(StringPiece* input, const RE2& re,
329                               const Arg* const args[], int n);
330 
331 #ifndef SWIG
332  private:
333   template <typename F, typename SP>
Apply(F f,SP sp,const RE2 & re)334   static inline bool Apply(F f, SP sp, const RE2& re) {
335     return f(sp, re, NULL, 0);
336   }
337 
338   template <typename F, typename SP, typename... A>
Apply(F f,SP sp,const RE2 & re,const A &...a)339   static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
340     const Arg* const args[] = {&a...};
341     const int n = sizeof...(a);
342     return f(sp, re, args, n);
343   }
344 
345  public:
346   // In order to allow FullMatch() et al. to be called with a varying number
347   // of arguments of varying types, we use two layers of variadic templates.
348   // The first layer constructs the temporary Arg objects. The second layer
349   // (above) constructs the array of pointers to the temporary Arg objects.
350 
351   /***** The useful part: the matching interface *****/
352 
353   // Matches "text" against "re".  If pointer arguments are
354   // supplied, copies matched sub-patterns into them.
355   //
356   // You can pass in a "const char*" or a "std::string" for "text".
357   // You can pass in a "const char*" or a "std::string" or a "RE2" for "re".
358   //
359   // The provided pointer arguments can be pointers to any scalar numeric
360   // type, or one of:
361   //    std::string     (matched piece is copied to string)
362   //    StringPiece     (StringPiece is mutated to point to matched piece)
363   //    T               (where "bool T::ParseFrom(const char*, size_t)" exists)
364   //    (void*)NULL     (the corresponding matched sub-pattern is not copied)
365   //
366   // Returns true iff all of the following conditions are satisfied:
367   //   a. "text" matches "re" exactly
368   //   b. The number of matched sub-patterns is >= number of supplied pointers
369   //   c. The "i"th argument has a suitable type for holding the
370   //      string captured as the "i"th sub-pattern.  If you pass in
371   //      NULL for the "i"th argument, or pass fewer arguments than
372   //      number of sub-patterns, "i"th captured sub-pattern is
373   //      ignored.
374   //
375   // CAVEAT: An optional sub-pattern that does not exist in the
376   // matched string is assigned the empty string.  Therefore, the
377   // following will return false (because the empty string is not a
378   // valid number):
379   //    int number;
380   //    RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
381   template <typename... A>
FullMatch(const StringPiece & text,const RE2 & re,A &&...a)382   static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
383     return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
384   }
385 
386   // Exactly like FullMatch(), except that "re" is allowed to match
387   // a substring of "text".
388   template <typename... A>
PartialMatch(const StringPiece & text,const RE2 & re,A &&...a)389   static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
390     return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
391   }
392 
393   // Like FullMatch() and PartialMatch(), except that "re" has to match
394   // a prefix of the text, and "input" is advanced past the matched
395   // text.  Note: "input" is modified iff this routine returns true
396   // and "re" matched a non-empty substring of "text".
397   template <typename... A>
Consume(StringPiece * input,const RE2 & re,A &&...a)398   static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
399     return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
400   }
401 
402   // Like Consume(), but does not anchor the match at the beginning of
403   // the text.  That is, "re" need not start its match at the beginning
404   // of "input".  For example, "FindAndConsume(s, "(\\w+)", &word)" finds
405   // the next word in "s" and stores it in "word".
406   template <typename... A>
FindAndConsume(StringPiece * input,const RE2 & re,A &&...a)407   static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
408     return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
409   }
410 #endif
411 
412   // Replace the first match of "re" in "str" with "rewrite".
413   // Within "rewrite", backslash-escaped digits (\1 to \9) can be
414   // used to insert text matching corresponding parenthesized group
415   // from the pattern.  \0 in "rewrite" refers to the entire matching
416   // text.  E.g.,
417   //
418   //   std::string s = "yabba dabba doo";
419   //   CHECK(RE2::Replace(&s, "b+", "d"));
420   //
421   // will leave "s" containing "yada dabba doo"
422   //
423   // Returns true if the pattern matches and a replacement occurs,
424   // false otherwise.
425   static bool Replace(std::string* str,
426                       const RE2& re,
427                       const StringPiece& rewrite);
428 
429   // Like Replace(), except replaces successive non-overlapping occurrences
430   // of the pattern in the string with the rewrite. E.g.
431   //
432   //   std::string s = "yabba dabba doo";
433   //   CHECK(RE2::GlobalReplace(&s, "b+", "d"));
434   //
435   // will leave "s" containing "yada dada doo"
436   // Replacements are not subject to re-matching.
437   //
438   // Because GlobalReplace only replaces non-overlapping matches,
439   // replacing "ana" within "banana" makes only one replacement, not two.
440   //
441   // Returns the number of replacements made.
442   static int GlobalReplace(std::string* str,
443                            const RE2& re,
444                            const StringPiece& rewrite);
445 
446   // Like Replace, except that if the pattern matches, "rewrite"
447   // is copied into "out" with substitutions.  The non-matching
448   // portions of "text" are ignored.
449   //
450   // Returns true iff a match occurred and the extraction happened
451   // successfully;  if no match occurs, the string is left unaffected.
452   //
453   // REQUIRES: "text" must not alias any part of "*out".
454   static bool Extract(const StringPiece& text,
455                       const RE2& re,
456                       const StringPiece& rewrite,
457                       std::string* out);
458 
459   // Escapes all potentially meaningful regexp characters in
460   // 'unquoted'.  The returned string, used as a regular expression,
461   // will match exactly the original string.  For example,
462   //           1.5-2.0?
463   // may become:
464   //           1\.5\-2\.0\?
465   static std::string QuoteMeta(const StringPiece& unquoted);
466 
467   // Computes range for any strings matching regexp. The min and max can in
468   // some cases be arbitrarily precise, so the caller gets to specify the
469   // maximum desired length of string returned.
470   //
471   // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
472   // string s that is an anchored match for this regexp satisfies
473   //   min <= s && s <= max.
474   //
475   // Note that PossibleMatchRange() will only consider the first copy of an
476   // infinitely repeated element (i.e., any regexp element followed by a '*' or
477   // '+' operator). Regexps with "{N}" constructions are not affected, as those
478   // do not compile down to infinite repetitions.
479   //
480   // Returns true on success, false on error.
481   bool PossibleMatchRange(std::string* min, std::string* max,
482                           int maxlen) const;
483 
484   // Generic matching interface
485 
486   // Type of match.
487   enum Anchor {
488     UNANCHORED,         // No anchoring
489     ANCHOR_START,       // Anchor at start only
490     ANCHOR_BOTH         // Anchor at start and end
491   };
492 
493   // Return the number of capturing subpatterns, or -1 if the
494   // regexp wasn't valid on construction.  The overall match ($0)
495   // does not count: if the regexp is "(a)(b)", returns 2.
NumberOfCapturingGroups()496   int NumberOfCapturingGroups() const { return num_captures_; }
497 
498   // Return a map from names to capturing indices.
499   // The map records the index of the leftmost group
500   // with the given name.
501   // Only valid until the re is deleted.
502   const std::map<std::string, int>& NamedCapturingGroups() const;
503 
504   // Return a map from capturing indices to names.
505   // The map has no entries for unnamed groups.
506   // Only valid until the re is deleted.
507   const std::map<int, std::string>& CapturingGroupNames() const;
508 
509   // General matching routine.
510   // Match against text starting at offset startpos
511   // and stopping the search at offset endpos.
512   // Returns true if match found, false if not.
513   // On a successful match, fills in submatch[] (up to nsubmatch entries)
514   // with information about submatches.
515   // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, with
516   // submatch[0] = "barbaz", submatch[1].data() = NULL, submatch[2] = "bar",
517   // submatch[3].data() = NULL, ..., up to submatch[nsubmatch-1].data() = NULL.
518   // Caveat: submatch[] may be clobbered even on match failure.
519   //
520   // Don't ask for more match information than you will use:
521   // runs much faster with nsubmatch == 1 than nsubmatch > 1, and
522   // runs even faster if nsubmatch == 0.
523   // Doesn't make sense to use nsubmatch > 1 + NumberOfCapturingGroups(),
524   // but will be handled correctly.
525   //
526   // Passing text == StringPiece(NULL, 0) will be handled like any other
527   // empty string, but note that on return, it will not be possible to tell
528   // whether submatch i matched the empty string or did not match:
529   // either way, submatch[i].data() == NULL.
530   bool Match(const StringPiece& text,
531              size_t startpos,
532              size_t endpos,
533              Anchor re_anchor,
534              StringPiece* submatch,
535              int nsubmatch) const;
536 
537   // Check that the given rewrite string is suitable for use with this
538   // regular expression.  It checks that:
539   //   * The regular expression has enough parenthesized subexpressions
540   //     to satisfy all of the \N tokens in rewrite
541   //   * The rewrite string doesn't have any syntax errors.  E.g.,
542   //     '\' followed by anything other than a digit or '\'.
543   // A true return value guarantees that Replace() and Extract() won't
544   // fail because of a bad rewrite string.
545   bool CheckRewriteString(const StringPiece& rewrite,
546                           std::string* error) const;
547 
548   // Returns the maximum submatch needed for the rewrite to be done by
549   // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
550   static int MaxSubmatch(const StringPiece& rewrite);
551 
552   // Append the "rewrite" string, with backslash subsitutions from "vec",
553   // to string "out".
554   // Returns true on success.  This method can fail because of a malformed
555   // rewrite string.  CheckRewriteString guarantees that the rewrite will
556   // be sucessful.
557   bool Rewrite(std::string* out,
558                const StringPiece& rewrite,
559                const StringPiece* vec,
560                int veclen) const;
561 
562   // Constructor options
563   class Options {
564    public:
565     // The options are (defaults in parentheses):
566     //
567     //   utf8             (true)  text and pattern are UTF-8; otherwise Latin-1
568     //   posix_syntax     (false) restrict regexps to POSIX egrep syntax
569     //   longest_match    (false) search for longest match, not first match
570     //   log_errors       (true)  log syntax and execution errors to ERROR
571     //   max_mem          (see below)  approx. max memory footprint of RE2
572     //   literal          (false) interpret string as literal, not regexp
573     //   never_nl         (false) never match \n, even if it is in regexp
574     //   dot_nl           (false) dot matches everything including new line
575     //   never_capture    (false) parse all parens as non-capturing
576     //   case_sensitive   (true)  match is case-sensitive (regexp can override
577     //                              with (?i) unless in posix_syntax mode)
578     //
579     // The following options are only consulted when posix_syntax == true.
580     // When posix_syntax == false, these features are always enabled and
581     // cannot be turned off; to perform multi-line matching in that case,
582     // begin the regexp with (?m).
583     //   perl_classes     (false) allow Perl's \d \s \w \D \S \W
584     //   word_boundary    (false) allow Perl's \b \B (word boundary and not)
585     //   one_line         (false) ^ and $ only match beginning and end of text
586     //
587     // The max_mem option controls how much memory can be used
588     // to hold the compiled form of the regexp (the Prog) and
589     // its cached DFA graphs.  Code Search placed limits on the number
590     // of Prog instructions and DFA states: 10,000 for both.
591     // In RE2, those limits would translate to about 240 KB per Prog
592     // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
593     // better job of keeping them small than Code Search did).
594     // Each RE2 has two Progs (one forward, one reverse), and each Prog
595     // can have two DFAs (one first match, one longest match).
596     // That makes 4 DFAs:
597     //
598     //   forward, first-match    - used for UNANCHORED or ANCHOR_START searches
599     //                               if opt.longest_match() == false
600     //   forward, longest-match  - used for all ANCHOR_BOTH searches,
601     //                               and the other two kinds if
602     //                               opt.longest_match() == true
603     //   reverse, first-match    - never used
604     //   reverse, longest-match  - used as second phase for unanchored searches
605     //
606     // The RE2 memory budget is statically divided between the two
607     // Progs and then the DFAs: two thirds to the forward Prog
608     // and one third to the reverse Prog.  The forward Prog gives half
609     // of what it has left over to each of its DFAs.  The reverse Prog
610     // gives it all to its longest-match DFA.
611     //
612     // Once a DFA fills its budget, it flushes its cache and starts over.
613     // If this happens too often, RE2 falls back on the NFA implementation.
614 
615     // For now, make the default budget something close to Code Search.
616     static const int kDefaultMaxMem = 8<<20;
617 
618     enum Encoding {
619       EncodingUTF8 = 1,
620       EncodingLatin1
621     };
622 
Options()623     Options() :
624       encoding_(EncodingUTF8),
625       posix_syntax_(false),
626       longest_match_(false),
627       log_errors_(true),
628       max_mem_(kDefaultMaxMem),
629       literal_(false),
630       never_nl_(false),
631       dot_nl_(false),
632       never_capture_(false),
633       case_sensitive_(true),
634       perl_classes_(false),
635       word_boundary_(false),
636       one_line_(false) {
637     }
638 
639     /*implicit*/ Options(CannedOptions);
640 
encoding()641     Encoding encoding() const { return encoding_; }
set_encoding(Encoding encoding)642     void set_encoding(Encoding encoding) { encoding_ = encoding; }
643 
posix_syntax()644     bool posix_syntax() const { return posix_syntax_; }
set_posix_syntax(bool b)645     void set_posix_syntax(bool b) { posix_syntax_ = b; }
646 
longest_match()647     bool longest_match() const { return longest_match_; }
set_longest_match(bool b)648     void set_longest_match(bool b) { longest_match_ = b; }
649 
log_errors()650     bool log_errors() const { return log_errors_; }
set_log_errors(bool b)651     void set_log_errors(bool b) { log_errors_ = b; }
652 
max_mem()653     int64_t max_mem() const { return max_mem_; }
set_max_mem(int64_t m)654     void set_max_mem(int64_t m) { max_mem_ = m; }
655 
literal()656     bool literal() const { return literal_; }
set_literal(bool b)657     void set_literal(bool b) { literal_ = b; }
658 
never_nl()659     bool never_nl() const { return never_nl_; }
set_never_nl(bool b)660     void set_never_nl(bool b) { never_nl_ = b; }
661 
dot_nl()662     bool dot_nl() const { return dot_nl_; }
set_dot_nl(bool b)663     void set_dot_nl(bool b) { dot_nl_ = b; }
664 
never_capture()665     bool never_capture() const { return never_capture_; }
set_never_capture(bool b)666     void set_never_capture(bool b) { never_capture_ = b; }
667 
case_sensitive()668     bool case_sensitive() const { return case_sensitive_; }
set_case_sensitive(bool b)669     void set_case_sensitive(bool b) { case_sensitive_ = b; }
670 
perl_classes()671     bool perl_classes() const { return perl_classes_; }
set_perl_classes(bool b)672     void set_perl_classes(bool b) { perl_classes_ = b; }
673 
word_boundary()674     bool word_boundary() const { return word_boundary_; }
set_word_boundary(bool b)675     void set_word_boundary(bool b) { word_boundary_ = b; }
676 
one_line()677     bool one_line() const { return one_line_; }
set_one_line(bool b)678     void set_one_line(bool b) { one_line_ = b; }
679 
Copy(const Options & src)680     void Copy(const Options& src) {
681       *this = src;
682     }
683 
684     int ParseFlags() const;
685 
686    private:
687     Encoding encoding_;
688     bool posix_syntax_;
689     bool longest_match_;
690     bool log_errors_;
691     int64_t max_mem_;
692     bool literal_;
693     bool never_nl_;
694     bool dot_nl_;
695     bool never_capture_;
696     bool case_sensitive_;
697     bool perl_classes_;
698     bool word_boundary_;
699     bool one_line_;
700   };
701 
702   // Returns the options set in the constructor.
options()703   const Options& options() const { return options_; }
704 
705   // Argument converters; see below.
706   static inline Arg CRadix(short* x);
707   static inline Arg CRadix(unsigned short* x);
708   static inline Arg CRadix(int* x);
709   static inline Arg CRadix(unsigned int* x);
710   static inline Arg CRadix(long* x);
711   static inline Arg CRadix(unsigned long* x);
712   static inline Arg CRadix(long long* x);
713   static inline Arg CRadix(unsigned long long* x);
714 
715   static inline Arg Hex(short* x);
716   static inline Arg Hex(unsigned short* x);
717   static inline Arg Hex(int* x);
718   static inline Arg Hex(unsigned int* x);
719   static inline Arg Hex(long* x);
720   static inline Arg Hex(unsigned long* x);
721   static inline Arg Hex(long long* x);
722   static inline Arg Hex(unsigned long long* x);
723 
724   static inline Arg Octal(short* x);
725   static inline Arg Octal(unsigned short* x);
726   static inline Arg Octal(int* x);
727   static inline Arg Octal(unsigned int* x);
728   static inline Arg Octal(long* x);
729   static inline Arg Octal(unsigned long* x);
730   static inline Arg Octal(long long* x);
731   static inline Arg Octal(unsigned long long* x);
732 
733  private:
734   void Init(const StringPiece& pattern, const Options& options);
735 
736   bool DoMatch(const StringPiece& text,
737                Anchor re_anchor,
738                size_t* consumed,
739                const Arg* const args[],
740                int n) const;
741 
742   re2::Prog* ReverseProg() const;
743 
744   std::string pattern_;         // string regular expression
745   Options options_;             // option flags
746   re2::Regexp* entire_regexp_;  // parsed regular expression
747   const std::string* error_;    // error indicator (or points to empty string)
748   ErrorCode error_code_;        // error code
749   std::string error_arg_;       // fragment of regexp showing error
750   std::string prefix_;          // required prefix (before suffix_regexp_)
751   bool prefix_foldcase_;        // prefix_ is ASCII case-insensitive
752   re2::Regexp* suffix_regexp_;  // parsed regular expression, prefix_ removed
753   re2::Prog* prog_;             // compiled program for regexp
754   int num_captures_;            // number of capturing groups
755   bool is_one_pass_;            // can use prog_->SearchOnePass?
756 
757   // Reverse Prog for DFA execution only
758   mutable re2::Prog* rprog_;
759   // Map from capture names to indices
760   mutable const std::map<std::string, int>* named_groups_;
761   // Map from capture indices to names
762   mutable const std::map<int, std::string>* group_names_;
763 
764   mutable std::once_flag rprog_once_;
765   mutable std::once_flag named_groups_once_;
766   mutable std::once_flag group_names_once_;
767 
768   RE2(const RE2&) = delete;
769   RE2& operator=(const RE2&) = delete;
770 };
771 
772 /***** Implementation details *****/
773 
774 // Hex/Octal/Binary?
775 
776 // Special class for parsing into objects that define a ParseFrom() method
777 template <class T>
778 class _RE2_MatchObject {
779  public:
Parse(const char * str,size_t n,void * dest)780   static inline bool Parse(const char* str, size_t n, void* dest) {
781     if (dest == NULL) return true;
782     T* object = reinterpret_cast<T*>(dest);
783     return object->ParseFrom(str, n);
784   }
785 };
786 
787 class RE2::Arg {
788  public:
789   // Empty constructor so we can declare arrays of RE2::Arg
790   Arg();
791 
792   // Constructor specially designed for NULL arguments
793   Arg(void*);
794   Arg(std::nullptr_t);
795 
796   typedef bool (*Parser)(const char* str, size_t n, void* dest);
797 
798 // Type-specific parsers
799 #define MAKE_PARSER(type, name)            \
800   Arg(type* p) : arg_(p), parser_(name) {} \
801   Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
802 
MAKE_PARSER(char,parse_char)803   MAKE_PARSER(char,               parse_char)
804   MAKE_PARSER(signed char,        parse_schar)
805   MAKE_PARSER(unsigned char,      parse_uchar)
806   MAKE_PARSER(float,              parse_float)
807   MAKE_PARSER(double,             parse_double)
808   MAKE_PARSER(std::string,        parse_string)
809   MAKE_PARSER(StringPiece,        parse_stringpiece)
810 
811   MAKE_PARSER(short,              parse_short)
812   MAKE_PARSER(unsigned short,     parse_ushort)
813   MAKE_PARSER(int,                parse_int)
814   MAKE_PARSER(unsigned int,       parse_uint)
815   MAKE_PARSER(long,               parse_long)
816   MAKE_PARSER(unsigned long,      parse_ulong)
817   MAKE_PARSER(long long,          parse_longlong)
818   MAKE_PARSER(unsigned long long, parse_ulonglong)
819 
820 #undef MAKE_PARSER
821 
822   // Generic constructor templates
823   template <class T> Arg(T* p)
824       : arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
Arg(T * p,Parser parser)825   template <class T> Arg(T* p, Parser parser)
826       : arg_(p), parser_(parser) { }
827 
828   // Parse the data
829   bool Parse(const char* str, size_t n) const;
830 
831  private:
832   void*         arg_;
833   Parser        parser_;
834 
835   static bool parse_null          (const char* str, size_t n, void* dest);
836   static bool parse_char          (const char* str, size_t n, void* dest);
837   static bool parse_schar         (const char* str, size_t n, void* dest);
838   static bool parse_uchar         (const char* str, size_t n, void* dest);
839   static bool parse_float         (const char* str, size_t n, void* dest);
840   static bool parse_double        (const char* str, size_t n, void* dest);
841   static bool parse_string        (const char* str, size_t n, void* dest);
842   static bool parse_stringpiece   (const char* str, size_t n, void* dest);
843 
844 #define DECLARE_INTEGER_PARSER(name)                                       \
845  private:                                                                  \
846   static bool parse_##name(const char* str, size_t n, void* dest);         \
847   static bool parse_##name##_radix(const char* str, size_t n, void* dest,  \
848                                    int radix);                             \
849                                                                            \
850  public:                                                                   \
851   static bool parse_##name##_hex(const char* str, size_t n, void* dest);   \
852   static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
853   static bool parse_##name##_cradix(const char* str, size_t n, void* dest);
854 
855   DECLARE_INTEGER_PARSER(short)
856   DECLARE_INTEGER_PARSER(ushort)
857   DECLARE_INTEGER_PARSER(int)
858   DECLARE_INTEGER_PARSER(uint)
859   DECLARE_INTEGER_PARSER(long)
860   DECLARE_INTEGER_PARSER(ulong)
861   DECLARE_INTEGER_PARSER(longlong)
862   DECLARE_INTEGER_PARSER(ulonglong)
863 
864 #undef DECLARE_INTEGER_PARSER
865 
866 };
867 
Arg()868 inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
Arg(void * p)869 inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
Arg(std::nullptr_t p)870 inline RE2::Arg::Arg(std::nullptr_t p) : arg_(p), parser_(parse_null) { }
871 
Parse(const char * str,size_t n)872 inline bool RE2::Arg::Parse(const char* str, size_t n) const {
873   return (*parser_)(str, n, arg_);
874 }
875 
876 // This part of the parser, appropriate only for ints, deals with bases
877 #define MAKE_INTEGER_PARSER(type, name)                    \
878   inline RE2::Arg RE2::Hex(type* ptr) {                    \
879     return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex);    \
880   }                                                        \
881   inline RE2::Arg RE2::Octal(type* ptr) {                  \
882     return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal);  \
883   }                                                        \
884   inline RE2::Arg RE2::CRadix(type* ptr) {                 \
885     return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
886   }
887 
MAKE_INTEGER_PARSER(short,short)888 MAKE_INTEGER_PARSER(short,              short)
889 MAKE_INTEGER_PARSER(unsigned short,     ushort)
890 MAKE_INTEGER_PARSER(int,                int)
891 MAKE_INTEGER_PARSER(unsigned int,       uint)
892 MAKE_INTEGER_PARSER(long,               long)
893 MAKE_INTEGER_PARSER(unsigned long,      ulong)
894 MAKE_INTEGER_PARSER(long long,          longlong)
895 MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
896 
897 #undef MAKE_INTEGER_PARSER
898 
899 #ifndef SWIG
900 // Silence warnings about missing initializers for members of LazyRE2.
901 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6
902 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
903 #endif
904 
905 // Helper for writing global or static RE2s safely.
906 // Write
907 //     static LazyRE2 re = {".*"};
908 // and then use *re instead of writing
909 //     static RE2 re(".*");
910 // The former is more careful about multithreaded
911 // situations than the latter.
912 //
913 // N.B. This class never deletes the RE2 object that
914 // it constructs: that's a feature, so that it can be used
915 // for global and function static variables.
916 class LazyRE2 {
917  private:
918   struct NoArg {};
919 
920  public:
921   typedef RE2 element_type;  // support std::pointer_traits
922 
923   // Constructor omitted to preserve braced initialization in C++98.
924 
925   // Pretend to be a pointer to Type (never NULL due to on-demand creation):
926   RE2& operator*() const { return *get(); }
927   RE2* operator->() const { return get(); }
928 
929   // Named accessor/initializer:
930   RE2* get() const {
931     std::call_once(once_, &LazyRE2::Init, this);
932     return ptr_;
933   }
934 
935   // All data fields must be public to support {"foo"} initialization.
936   const char* pattern_;
937   RE2::CannedOptions options_;
938   NoArg barrier_against_excess_initializers_;
939 
940   mutable RE2* ptr_;
941   mutable std::once_flag once_;
942 
943  private:
944   static void Init(const LazyRE2* lazy_re2) {
945     lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
946   }
947 
948   void operator=(const LazyRE2&);  // disallowed
949 };
950 #endif
951 
952 namespace hooks {
953 
954 // Most platforms support thread_local. Older versions of iOS don't support
955 // thread_local, but for the sake of brevity, we lump together all versions
956 // of Apple platforms that aren't macOS. If an iOS application really needs
957 // the context pointee someday, we can get more specific then...
958 #define RE2_HAVE_THREAD_LOCAL
959 #if defined(__APPLE__) && !TARGET_OS_OSX
960 #undef RE2_HAVE_THREAD_LOCAL
961 #endif
962 
963 // A hook must not make any assumptions regarding the lifetime of the context
964 // pointee beyond the current invocation of the hook. Pointers and references
965 // obtained via the context pointee should be considered invalidated when the
966 // hook returns. Hence, any data about the context pointee (e.g. its pattern)
967 // would have to be copied in order for it to be kept for an indefinite time.
968 //
969 // A hook must not use RE2 for matching. Control flow reentering RE2::Match()
970 // could result in infinite mutual recursion. To discourage that possibility,
971 // RE2 will not maintain the context pointer correctly when used in that way.
972 #ifdef RE2_HAVE_THREAD_LOCAL
973 extern thread_local const RE2* context;
974 #endif
975 
976 struct DFAStateCacheReset {
977   int64_t state_budget;
978   size_t state_cache_size;
979 };
980 
981 struct DFASearchFailure {
982   // Nothing yet...
983 };
984 
985 #define DECLARE_HOOK(type)                  \
986   using type##Callback = void(const type&); \
987   void Set##type##Hook(type##Callback* cb); \
988   type##Callback* Get##type##Hook();
989 
990 DECLARE_HOOK(DFAStateCacheReset)
991 DECLARE_HOOK(DFASearchFailure)
992 
993 #undef DECLARE_HOOK
994 
995 }  // namespace hooks
996 
997 }  // namespace re2
998 
999 using re2::RE2;
1000 using re2::LazyRE2;
1001 
1002 #endif  // RE2_RE2_H_
1003