• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9 
10 #include "re2/re2.h"
11 
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #include <stdint.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <algorithm>
19 #include <iterator>
20 #include <mutex>
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "util/strutil.h"
28 #include "util/utf.h"
29 #include "re2/prog.h"
30 #include "re2/regexp.h"
31 #include "re2/sparse_array.h"
32 
33 namespace re2 {
34 
35 // Maximum number of args we can set
36 static const int kMaxArgs = 16;
37 static const int kVecSize = 1+kMaxArgs;
38 
39 const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
40 
Options(RE2::CannedOptions opt)41 RE2::Options::Options(RE2::CannedOptions opt)
42   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43     posix_syntax_(opt == RE2::POSIX),
44     longest_match_(opt == RE2::POSIX),
45     log_errors_(opt != RE2::Quiet),
46     max_mem_(kDefaultMaxMem),
47     literal_(false),
48     never_nl_(false),
49     dot_nl_(false),
50     never_capture_(false),
51     case_sensitive_(true),
52     perl_classes_(false),
53     word_boundary_(false),
54     one_line_(false) {
55 }
56 
57 // static empty objects for use as const references.
58 // To avoid global constructors, allocated in RE2::Init().
59 static const std::string* empty_string;
60 static const std::map<std::string, int>* empty_named_groups;
61 static const std::map<int, std::string>* empty_group_names;
62 
63 // Converts from Regexp error code to RE2 error code.
64 // Maybe some day they will diverge.  In any event, this
65 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)66 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67   switch (code) {
68     case re2::kRegexpSuccess:
69       return RE2::NoError;
70     case re2::kRegexpInternalError:
71       return RE2::ErrorInternal;
72     case re2::kRegexpBadEscape:
73       return RE2::ErrorBadEscape;
74     case re2::kRegexpBadCharClass:
75       return RE2::ErrorBadCharClass;
76     case re2::kRegexpBadCharRange:
77       return RE2::ErrorBadCharRange;
78     case re2::kRegexpMissingBracket:
79       return RE2::ErrorMissingBracket;
80     case re2::kRegexpMissingParen:
81       return RE2::ErrorMissingParen;
82     case re2::kRegexpTrailingBackslash:
83       return RE2::ErrorTrailingBackslash;
84     case re2::kRegexpRepeatArgument:
85       return RE2::ErrorRepeatArgument;
86     case re2::kRegexpRepeatSize:
87       return RE2::ErrorRepeatSize;
88     case re2::kRegexpRepeatOp:
89       return RE2::ErrorRepeatOp;
90     case re2::kRegexpBadPerlOp:
91       return RE2::ErrorBadPerlOp;
92     case re2::kRegexpBadUTF8:
93       return RE2::ErrorBadUTF8;
94     case re2::kRegexpBadNamedCapture:
95       return RE2::ErrorBadNamedCapture;
96   }
97   return RE2::ErrorInternal;
98 }
99 
trunc(const StringPiece & pattern)100 static std::string trunc(const StringPiece& pattern) {
101   if (pattern.size() < 100)
102     return std::string(pattern);
103   return std::string(pattern.substr(0, 100)) + "...";
104 }
105 
106 
RE2(const char * pattern)107 RE2::RE2(const char* pattern) {
108   Init(pattern, DefaultOptions);
109 }
110 
RE2(const std::string & pattern)111 RE2::RE2(const std::string& pattern) {
112   Init(pattern, DefaultOptions);
113 }
114 
RE2(const StringPiece & pattern)115 RE2::RE2(const StringPiece& pattern) {
116   Init(pattern, DefaultOptions);
117 }
118 
RE2(const StringPiece & pattern,const Options & options)119 RE2::RE2(const StringPiece& pattern, const Options& options) {
120   Init(pattern, options);
121 }
122 
ParseFlags() const123 int RE2::Options::ParseFlags() const {
124   int flags = Regexp::ClassNL;
125   switch (encoding()) {
126     default:
127       if (log_errors())
128         LOG(ERROR) << "Unknown encoding " << encoding();
129       break;
130     case RE2::Options::EncodingUTF8:
131       break;
132     case RE2::Options::EncodingLatin1:
133       flags |= Regexp::Latin1;
134       break;
135   }
136 
137   if (!posix_syntax())
138     flags |= Regexp::LikePerl;
139 
140   if (literal())
141     flags |= Regexp::Literal;
142 
143   if (never_nl())
144     flags |= Regexp::NeverNL;
145 
146   if (dot_nl())
147     flags |= Regexp::DotNL;
148 
149   if (never_capture())
150     flags |= Regexp::NeverCapture;
151 
152   if (!case_sensitive())
153     flags |= Regexp::FoldCase;
154 
155   if (perl_classes())
156     flags |= Regexp::PerlClasses;
157 
158   if (word_boundary())
159     flags |= Regexp::PerlB;
160 
161   if (one_line())
162     flags |= Regexp::OneLine;
163 
164   return flags;
165 }
166 
Init(const StringPiece & pattern,const Options & options)167 void RE2::Init(const StringPiece& pattern, const Options& options) {
168   static std::once_flag empty_once;
169   std::call_once(empty_once, []() {
170     empty_string = new std::string;
171     empty_named_groups = new std::map<std::string, int>;
172     empty_group_names = new std::map<int, std::string>;
173   });
174 
175   pattern_ = std::string(pattern);
176   options_.Copy(options);
177   entire_regexp_ = NULL;
178   suffix_regexp_ = NULL;
179   prog_ = NULL;
180   num_captures_ = -1;
181   rprog_ = NULL;
182   error_ = empty_string;
183   error_code_ = NoError;
184   named_groups_ = NULL;
185   group_names_ = NULL;
186 
187   RegexpStatus status;
188   entire_regexp_ = Regexp::Parse(
189     pattern_,
190     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191     &status);
192   if (entire_regexp_ == NULL) {
193     if (options_.log_errors()) {
194       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195                  << status.Text();
196     }
197     error_ = new std::string(status.Text());
198     error_code_ = RegexpErrorToRE2(status.code());
199     error_arg_ = std::string(status.error_arg());
200     return;
201   }
202 
203   re2::Regexp* suffix;
204   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205     suffix_regexp_ = suffix;
206   else
207     suffix_regexp_ = entire_regexp_->Incref();
208 
209   // Two thirds of the memory goes to the forward Prog,
210   // one third to the reverse prog, because the forward
211   // Prog has two DFAs but the reverse prog has one.
212   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
213   if (prog_ == NULL) {
214     if (options_.log_errors())
215       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216     error_ = new std::string("pattern too large - compile failed");
217     error_code_ = RE2::ErrorPatternTooLarge;
218     return;
219   }
220 
221   // We used to compute this lazily, but it's used during the
222   // typical control flow for a match call, so we now compute
223   // it eagerly, which avoids the overhead of std::once_flag.
224   num_captures_ = suffix_regexp_->NumCaptures();
225 
226   // Could delay this until the first match call that
227   // cares about submatch information, but the one-pass
228   // machine's memory gets cut from the DFA memory budget,
229   // and that is harder to do if the DFA has already
230   // been built.
231   is_one_pass_ = prog_->IsOnePass();
232 }
233 
234 // Returns rprog_, computing it if needed.
ReverseProg() const235 re2::Prog* RE2::ReverseProg() const {
236   std::call_once(rprog_once_, [](const RE2* re) {
237     re->rprog_ =
238         re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
239     if (re->rprog_ == NULL) {
240       if (re->options_.log_errors())
241         LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242       re->error_ =
243           new std::string("pattern too large - reverse compile failed");
244       re->error_code_ = RE2::ErrorPatternTooLarge;
245     }
246   }, this);
247   return rprog_;
248 }
249 
~RE2()250 RE2::~RE2() {
251   if (suffix_regexp_)
252     suffix_regexp_->Decref();
253   if (entire_regexp_)
254     entire_regexp_->Decref();
255   delete prog_;
256   delete rprog_;
257   if (error_ != empty_string)
258     delete error_;
259   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
260     delete named_groups_;
261   if (group_names_ != NULL &&  group_names_ != empty_group_names)
262     delete group_names_;
263 }
264 
ProgramSize() const265 int RE2::ProgramSize() const {
266   if (prog_ == NULL)
267     return -1;
268   return prog_->size();
269 }
270 
ReverseProgramSize() const271 int RE2::ReverseProgramSize() const {
272   if (prog_ == NULL)
273     return -1;
274   Prog* prog = ReverseProg();
275   if (prog == NULL)
276     return -1;
277   return prog->size();
278 }
279 
Fanout(Prog * prog,std::map<int,int> * histogram)280 static int Fanout(Prog* prog, std::map<int, int>* histogram) {
281   SparseArray<int> fanout(prog->size());
282   prog->Fanout(&fanout);
283   histogram->clear();
284   for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
285     // TODO(junyer): Optimise this?
286     int bucket = 0;
287     while (1 << bucket < i->value()) {
288       bucket++;
289     }
290     (*histogram)[bucket]++;
291   }
292   return histogram->rbegin()->first;
293 }
294 
ProgramFanout(std::map<int,int> * histogram) const295 int RE2::ProgramFanout(std::map<int, int>* histogram) const {
296   if (prog_ == NULL)
297     return -1;
298   return Fanout(prog_, histogram);
299 }
300 
ReverseProgramFanout(std::map<int,int> * histogram) const301 int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
302   if (prog_ == NULL)
303     return -1;
304   Prog* prog = ReverseProg();
305   if (prog == NULL)
306     return -1;
307   return Fanout(prog, histogram);
308 }
309 
310 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const311 const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
312   std::call_once(named_groups_once_, [](const RE2* re) {
313     if (re->suffix_regexp_ != NULL)
314       re->named_groups_ = re->suffix_regexp_->NamedCaptures();
315     if (re->named_groups_ == NULL)
316       re->named_groups_ = empty_named_groups;
317   }, this);
318   return *named_groups_;
319 }
320 
321 // Returns group_names_, computing it if needed.
CapturingGroupNames() const322 const std::map<int, std::string>& RE2::CapturingGroupNames() const {
323   std::call_once(group_names_once_, [](const RE2* re) {
324     if (re->suffix_regexp_ != NULL)
325       re->group_names_ = re->suffix_regexp_->CaptureNames();
326     if (re->group_names_ == NULL)
327       re->group_names_ = empty_group_names;
328   }, this);
329   return *group_names_;
330 }
331 
332 /***** Convenience interfaces *****/
333 
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)334 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
335                      const Arg* const args[], int n) {
336   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
337 }
338 
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)339 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
340                         const Arg* const args[], int n) {
341   return re.DoMatch(text, UNANCHORED, NULL, args, n);
342 }
343 
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)344 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
345                    const Arg* const args[], int n) {
346   size_t consumed;
347   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
348     input->remove_prefix(consumed);
349     return true;
350   } else {
351     return false;
352   }
353 }
354 
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)355 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
356                           const Arg* const args[], int n) {
357   size_t consumed;
358   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
359     input->remove_prefix(consumed);
360     return true;
361   } else {
362     return false;
363   }
364 }
365 
Replace(std::string * str,const RE2 & re,const StringPiece & rewrite)366 bool RE2::Replace(std::string* str,
367                   const RE2& re,
368                   const StringPiece& rewrite) {
369   StringPiece vec[kVecSize];
370   int nvec = 1 + MaxSubmatch(rewrite);
371   if (nvec > static_cast<int>(arraysize(vec)))
372     return false;
373   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
374     return false;
375 
376   std::string s;
377   if (!re.Rewrite(&s, rewrite, vec, nvec))
378     return false;
379 
380   assert(vec[0].data() >= str->data());
381   assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
382   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
383   return true;
384 }
385 
GlobalReplace(std::string * str,const RE2 & re,const StringPiece & rewrite)386 int RE2::GlobalReplace(std::string* str,
387                        const RE2& re,
388                        const StringPiece& rewrite) {
389   StringPiece vec[kVecSize];
390   int nvec = 1 + MaxSubmatch(rewrite);
391   if (nvec > static_cast<int>(arraysize(vec)))
392     return false;
393 
394   const char* p = str->data();
395   const char* ep = p + str->size();
396   const char* lastend = NULL;
397   std::string out;
398   int count = 0;
399 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
400   // Iterate just once when fuzzing. Otherwise, we easily get bogged down
401   // and coverage is unlikely to improve despite significant expense.
402   while (p == str->data()) {
403 #else
404   while (p <= ep) {
405 #endif
406     if (!re.Match(*str, static_cast<size_t>(p - str->data()),
407                   str->size(), UNANCHORED, vec, nvec))
408       break;
409     if (p < vec[0].data())
410       out.append(p, vec[0].data() - p);
411     if (vec[0].data() == lastend && vec[0].empty()) {
412       // Disallow empty match at end of last match: skip ahead.
413       //
414       // fullrune() takes int, not ptrdiff_t. However, it just looks
415       // at the leading byte and treats any length >= 4 the same.
416       if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
417           fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
418         // re is in UTF-8 mode and there is enough left of str
419         // to allow us to advance by up to UTFmax bytes.
420         Rune r;
421         int n = chartorune(&r, p);
422         // Some copies of chartorune have a bug that accepts
423         // encodings of values in (10FFFF, 1FFFFF] as valid.
424         if (r > Runemax) {
425           n = 1;
426           r = Runeerror;
427         }
428         if (!(n == 1 && r == Runeerror)) {  // no decoding error
429           out.append(p, n);
430           p += n;
431           continue;
432         }
433       }
434       // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435       // we fell through from above and the GIGO principle applies.
436       if (p < ep)
437         out.append(p, 1);
438       p++;
439       continue;
440     }
441     re.Rewrite(&out, rewrite, vec, nvec);
442     p = vec[0].data() + vec[0].size();
443     lastend = p;
444     count++;
445   }
446 
447   if (count == 0)
448     return 0;
449 
450   if (p < ep)
451     out.append(p, ep - p);
452   using std::swap;
453   swap(out, *str);
454   return count;
455 }
456 
457 bool RE2::Extract(const StringPiece& text,
458                   const RE2& re,
459                   const StringPiece& rewrite,
460                   std::string* out) {
461   StringPiece vec[kVecSize];
462   int nvec = 1 + MaxSubmatch(rewrite);
463   if (nvec > static_cast<int>(arraysize(vec)))
464     return false;
465 
466   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
467     return false;
468 
469   out->clear();
470   return re.Rewrite(out, rewrite, vec, nvec);
471 }
472 
473 std::string RE2::QuoteMeta(const StringPiece& unquoted) {
474   std::string result;
475   result.reserve(unquoted.size() << 1);
476 
477   // Escape any ascii character not in [A-Za-z_0-9].
478   //
479   // Note that it's legal to escape a character even if it has no
480   // special meaning in a regular expression -- so this function does
481   // that.  (This also makes it identical to the perl function of the
482   // same name except for the null-character special case;
483   // see `perldoc -f quotemeta`.)
484   for (size_t ii = 0; ii < unquoted.size(); ++ii) {
485     // Note that using 'isalnum' here raises the benchmark time from
486     // 32ns to 58ns:
487     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
488         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
489         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
490         unquoted[ii] != '_' &&
491         // If this is the part of a UTF8 or Latin1 character, we need
492         // to copy this byte without escaping.  Experimentally this is
493         // what works correctly with the regexp library.
494         !(unquoted[ii] & 128)) {
495       if (unquoted[ii] == '\0') {  // Special handling for null chars.
496         // Note that this special handling is not strictly required for RE2,
497         // but this quoting is required for other regexp libraries such as
498         // PCRE.
499         // Can't use "\\0" since the next character might be a digit.
500         result += "\\x00";
501         continue;
502       }
503       result += '\\';
504     }
505     result += unquoted[ii];
506   }
507 
508   return result;
509 }
510 
511 bool RE2::PossibleMatchRange(std::string* min, std::string* max,
512                              int maxlen) const {
513   if (prog_ == NULL)
514     return false;
515 
516   int n = static_cast<int>(prefix_.size());
517   if (n > maxlen)
518     n = maxlen;
519 
520   // Determine initial min max from prefix_ literal.
521   *min = prefix_.substr(0, n);
522   *max = prefix_.substr(0, n);
523   if (prefix_foldcase_) {
524     // prefix is ASCII lowercase; change *min to uppercase.
525     for (int i = 0; i < n; i++) {
526       char& c = (*min)[i];
527       if ('a' <= c && c <= 'z')
528         c += 'A' - 'a';
529     }
530   }
531 
532   // Add to prefix min max using PossibleMatchRange on regexp.
533   std::string dmin, dmax;
534   maxlen -= n;
535   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
536     min->append(dmin);
537     max->append(dmax);
538   } else if (!max->empty()) {
539     // prog_->PossibleMatchRange has failed us,
540     // but we still have useful information from prefix_.
541     // Round up *max to allow any possible suffix.
542     PrefixSuccessor(max);
543   } else {
544     // Nothing useful.
545     *min = "";
546     *max = "";
547     return false;
548   }
549 
550   return true;
551 }
552 
553 // Avoid possible locale nonsense in standard strcasecmp.
554 // The string a is known to be all lowercase.
555 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
556   const char* ae = a + len;
557 
558   for (; a < ae; a++, b++) {
559     uint8_t x = *a;
560     uint8_t y = *b;
561     if ('A' <= y && y <= 'Z')
562       y += 'a' - 'A';
563     if (x != y)
564       return x - y;
565   }
566   return 0;
567 }
568 
569 
570 /***** Actual matching and rewriting code *****/
571 
572 bool RE2::Match(const StringPiece& text,
573                 size_t startpos,
574                 size_t endpos,
575                 Anchor re_anchor,
576                 StringPiece* submatch,
577                 int nsubmatch) const {
578   if (!ok()) {
579     if (options_.log_errors())
580       LOG(ERROR) << "Invalid RE2: " << *error_;
581     return false;
582   }
583 
584   if (startpos > endpos || endpos > text.size()) {
585     if (options_.log_errors())
586       LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
587                  << "startpos: " << startpos << ", "
588                  << "endpos: " << endpos << ", "
589                  << "text size: " << text.size() << "]";
590     return false;
591   }
592 
593   StringPiece subtext = text;
594   subtext.remove_prefix(startpos);
595   subtext.remove_suffix(text.size() - endpos);
596 
597   // Use DFAs to find exact location of match, filter out non-matches.
598 
599   // Don't ask for the location if we won't use it.
600   // SearchDFA can do extra optimizations in that case.
601   StringPiece match;
602   StringPiece* matchp = &match;
603   if (nsubmatch == 0)
604     matchp = NULL;
605 
606   int ncap = 1 + NumberOfCapturingGroups();
607   if (ncap > nsubmatch)
608     ncap = nsubmatch;
609 
610   // If the regexp is anchored explicitly, must not be in middle of text.
611   if (prog_->anchor_start() && startpos != 0)
612     return false;
613 
614   // If the regexp is anchored explicitly, update re_anchor
615   // so that we can potentially fall into a faster case below.
616   if (prog_->anchor_start() && prog_->anchor_end())
617     re_anchor = ANCHOR_BOTH;
618   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
619     re_anchor = ANCHOR_START;
620 
621   // Check for the required prefix, if any.
622   size_t prefixlen = 0;
623   if (!prefix_.empty()) {
624     if (startpos != 0)
625       return false;
626     prefixlen = prefix_.size();
627     if (prefixlen > subtext.size())
628       return false;
629     if (prefix_foldcase_) {
630       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
631         return false;
632     } else {
633       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
634         return false;
635     }
636     subtext.remove_prefix(prefixlen);
637     // If there is a required prefix, the anchor must be at least ANCHOR_START.
638     if (re_anchor != ANCHOR_BOTH)
639       re_anchor = ANCHOR_START;
640   }
641 
642   Prog::Anchor anchor = Prog::kUnanchored;
643   Prog::MatchKind kind = Prog::kFirstMatch;
644   if (options_.longest_match())
645     kind = Prog::kLongestMatch;
646   bool skipped_test = false;
647 
648   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
649 
650   // BitState allocates a bitmap of size prog_->list_count() * text.size().
651   // It also allocates a stack of 3-word structures which could potentially
652   // grow as large as prog_->list_count() * text.size(), but in practice is
653   // much smaller.
654   const int kMaxBitStateBitmapSize = 256*1024;  // bitmap size <= max (bits)
655   bool can_bit_state = prog_->CanBitState();
656   size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
657 
658   bool dfa_failed = false;
659   switch (re_anchor) {
660     default:
661     case UNANCHORED: {
662       if (!prog_->SearchDFA(subtext, text, anchor, kind,
663                             matchp, &dfa_failed, NULL)) {
664         if (dfa_failed) {
665           if (options_.log_errors())
666             LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
667                        << "bytemap range " << prog_->bytemap_range() << ", "
668                        << "list count " << prog_->list_count();
669           // Fall back to NFA below.
670           skipped_test = true;
671           break;
672         }
673         return false;
674       }
675       if (matchp == NULL)  // Matched.  Don't care where
676         return true;
677       // SearchDFA set match[0].end() but didn't know where the
678       // match started.  Run the regexp backward from match[0].end()
679       // to find the longest possible match -- that's where it started.
680       Prog* prog = ReverseProg();
681       if (prog == NULL)
682         return false;
683       if (!prog->SearchDFA(match, text, Prog::kAnchored,
684                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
685         if (dfa_failed) {
686           if (options_.log_errors())
687             LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
688                        << "bytemap range " << prog->bytemap_range() << ", "
689                        << "list count " << prog->list_count();
690           // Fall back to NFA below.
691           skipped_test = true;
692           break;
693         }
694         if (options_.log_errors())
695           LOG(ERROR) << "SearchDFA inconsistency";
696         return false;
697       }
698       break;
699     }
700 
701     case ANCHOR_BOTH:
702     case ANCHOR_START:
703       if (re_anchor == ANCHOR_BOTH)
704         kind = Prog::kFullMatch;
705       anchor = Prog::kAnchored;
706 
707       // If only a small amount of text and need submatch
708       // information anyway and we're going to use OnePass or BitState
709       // to get it, we might as well not even bother with the DFA:
710       // OnePass or BitState will be fast enough.
711       // On tiny texts, OnePass outruns even the DFA, and
712       // it doesn't have the shared state and occasional mutex that
713       // the DFA does.
714       if (can_one_pass && text.size() <= 4096 &&
715           (ncap > 1 || text.size() <= 8)) {
716         skipped_test = true;
717         break;
718       }
719       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
720         skipped_test = true;
721         break;
722       }
723       if (!prog_->SearchDFA(subtext, text, anchor, kind,
724                             &match, &dfa_failed, NULL)) {
725         if (dfa_failed) {
726           if (options_.log_errors())
727             LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
728                        << "bytemap range " << prog_->bytemap_range() << ", "
729                        << "list count " << prog_->list_count();
730           // Fall back to NFA below.
731           skipped_test = true;
732           break;
733         }
734         return false;
735       }
736       break;
737   }
738 
739   if (!skipped_test && ncap <= 1) {
740     // We know exactly where it matches.  That's enough.
741     if (ncap == 1)
742       submatch[0] = match;
743   } else {
744     StringPiece subtext1;
745     if (skipped_test) {
746       // DFA ran out of memory or was skipped:
747       // need to search in entire original text.
748       subtext1 = subtext;
749     } else {
750       // DFA found the exact match location:
751       // let NFA run an anchored, full match search
752       // to find submatch locations.
753       subtext1 = match;
754       anchor = Prog::kAnchored;
755       kind = Prog::kFullMatch;
756     }
757 
758     if (can_one_pass && anchor != Prog::kUnanchored) {
759       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
760         if (!skipped_test && options_.log_errors())
761           LOG(ERROR) << "SearchOnePass inconsistency";
762         return false;
763       }
764     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
765       if (!prog_->SearchBitState(subtext1, text, anchor,
766                                  kind, submatch, ncap)) {
767         if (!skipped_test && options_.log_errors())
768           LOG(ERROR) << "SearchBitState inconsistency";
769         return false;
770       }
771     } else {
772       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
773         if (!skipped_test && options_.log_errors())
774           LOG(ERROR) << "SearchNFA inconsistency";
775         return false;
776       }
777     }
778   }
779 
780   // Adjust overall match for required prefix that we stripped off.
781   if (prefixlen > 0 && nsubmatch > 0)
782     submatch[0] = StringPiece(submatch[0].data() - prefixlen,
783                               submatch[0].size() + prefixlen);
784 
785   // Zero submatches that don't exist in the regexp.
786   for (int i = ncap; i < nsubmatch; i++)
787     submatch[i] = StringPiece();
788   return true;
789 }
790 
791 // Internal matcher - like Match() but takes Args not StringPieces.
792 bool RE2::DoMatch(const StringPiece& text,
793                   Anchor re_anchor,
794                   size_t* consumed,
795                   const Arg* const* args,
796                   int n) const {
797   if (!ok()) {
798     if (options_.log_errors())
799       LOG(ERROR) << "Invalid RE2: " << *error_;
800     return false;
801   }
802 
803   if (NumberOfCapturingGroups() < n) {
804     // RE has fewer capturing groups than number of Arg pointers passed in.
805     return false;
806   }
807 
808   // Count number of capture groups needed.
809   int nvec;
810   if (n == 0 && consumed == NULL)
811     nvec = 0;
812   else
813     nvec = n+1;
814 
815   StringPiece* vec;
816   StringPiece stkvec[kVecSize];
817   StringPiece* heapvec = NULL;
818 
819   if (nvec <= static_cast<int>(arraysize(stkvec))) {
820     vec = stkvec;
821   } else {
822     vec = new StringPiece[nvec];
823     heapvec = vec;
824   }
825 
826   if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
827     delete[] heapvec;
828     return false;
829   }
830 
831   if (consumed != NULL)
832     *consumed = static_cast<size_t>(vec[0].end() - text.begin());
833 
834   if (n == 0 || args == NULL) {
835     // We are not interested in results
836     delete[] heapvec;
837     return true;
838   }
839 
840   // If we got here, we must have matched the whole pattern.
841   for (int i = 0; i < n; i++) {
842     const StringPiece& s = vec[i+1];
843     if (!args[i]->Parse(s.data(), s.size())) {
844       // TODO: Should we indicate what the error was?
845       delete[] heapvec;
846       return false;
847     }
848   }
849 
850   delete[] heapvec;
851   return true;
852 }
853 
854 // Checks that the rewrite string is well-formed with respect to this
855 // regular expression.
856 bool RE2::CheckRewriteString(const StringPiece& rewrite,
857                              std::string* error) const {
858   int max_token = -1;
859   for (const char *s = rewrite.data(), *end = s + rewrite.size();
860        s < end; s++) {
861     int c = *s;
862     if (c != '\\') {
863       continue;
864     }
865     if (++s == end) {
866       *error = "Rewrite schema error: '\\' not allowed at end.";
867       return false;
868     }
869     c = *s;
870     if (c == '\\') {
871       continue;
872     }
873     if (!isdigit(c)) {
874       *error = "Rewrite schema error: "
875                "'\\' must be followed by a digit or '\\'.";
876       return false;
877     }
878     int n = (c - '0');
879     if (max_token < n) {
880       max_token = n;
881     }
882   }
883 
884   if (max_token > NumberOfCapturingGroups()) {
885     *error = StringPrintf(
886         "Rewrite schema requests %d matches, but the regexp only has %d "
887         "parenthesized subexpressions.",
888         max_token, NumberOfCapturingGroups());
889     return false;
890   }
891   return true;
892 }
893 
894 // Returns the maximum submatch needed for the rewrite to be done by Replace().
895 // E.g. if rewrite == "foo \\2,\\1", returns 2.
896 int RE2::MaxSubmatch(const StringPiece& rewrite) {
897   int max = 0;
898   for (const char *s = rewrite.data(), *end = s + rewrite.size();
899        s < end; s++) {
900     if (*s == '\\') {
901       s++;
902       int c = (s < end) ? *s : -1;
903       if (isdigit(c)) {
904         int n = (c - '0');
905         if (n > max)
906           max = n;
907       }
908     }
909   }
910   return max;
911 }
912 
913 // Append the "rewrite" string, with backslash subsitutions from "vec",
914 // to string "out".
915 bool RE2::Rewrite(std::string* out,
916                   const StringPiece& rewrite,
917                   const StringPiece* vec,
918                   int veclen) const {
919   for (const char *s = rewrite.data(), *end = s + rewrite.size();
920        s < end; s++) {
921     if (*s != '\\') {
922       out->push_back(*s);
923       continue;
924     }
925     s++;
926     int c = (s < end) ? *s : -1;
927     if (isdigit(c)) {
928       int n = (c - '0');
929       if (n >= veclen) {
930         if (options_.log_errors()) {
931           LOG(ERROR) << "requested group " << n
932                      << " in regexp " << rewrite.data();
933         }
934         return false;
935       }
936       StringPiece snip = vec[n];
937       if (!snip.empty())
938         out->append(snip.data(), snip.size());
939     } else if (c == '\\') {
940       out->push_back('\\');
941     } else {
942       if (options_.log_errors())
943         LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
944       return false;
945     }
946   }
947   return true;
948 }
949 
950 /***** Parsers for various types *****/
951 
952 bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
953   // We fail if somebody asked us to store into a non-NULL void* pointer
954   return (dest == NULL);
955 }
956 
957 bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
958   if (dest == NULL) return true;
959   reinterpret_cast<std::string*>(dest)->assign(str, n);
960   return true;
961 }
962 
963 bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
964   if (dest == NULL) return true;
965   *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
966   return true;
967 }
968 
969 bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
970   if (n != 1) return false;
971   if (dest == NULL) return true;
972   *(reinterpret_cast<char*>(dest)) = str[0];
973   return true;
974 }
975 
976 bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
977   if (n != 1) return false;
978   if (dest == NULL) return true;
979   *(reinterpret_cast<signed char*>(dest)) = str[0];
980   return true;
981 }
982 
983 bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
984   if (n != 1) return false;
985   if (dest == NULL) return true;
986   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
987   return true;
988 }
989 
990 // Largest number spec that we are willing to parse
991 static const int kMaxNumberLength = 32;
992 
993 // REQUIRES "buf" must have length at least nbuf.
994 // Copies "str" into "buf" and null-terminates.
995 // Overwrites *np with the new length.
996 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
997                                    size_t* np, bool accept_spaces) {
998   size_t n = *np;
999   if (n == 0) return "";
1000   if (n > 0 && isspace(*str)) {
1001     // We are less forgiving than the strtoxxx() routines and do not
1002     // allow leading spaces. We do allow leading spaces for floats.
1003     if (!accept_spaces) {
1004       return "";
1005     }
1006     while (n > 0 && isspace(*str)) {
1007       n--;
1008       str++;
1009     }
1010   }
1011 
1012   // Although buf has a fixed maximum size, we can still handle
1013   // arbitrarily large integers correctly by omitting leading zeros.
1014   // (Numbers that are still too long will be out of range.)
1015   // Before deciding whether str is too long,
1016   // remove leading zeros with s/000+/00/.
1017   // Leaving the leading two zeros in place means that
1018   // we don't change 0000x123 (invalid) into 0x123 (valid).
1019   // Skip over leading - before replacing.
1020   bool neg = false;
1021   if (n >= 1 && str[0] == '-') {
1022     neg = true;
1023     n--;
1024     str++;
1025   }
1026 
1027   if (n >= 3 && str[0] == '0' && str[1] == '0') {
1028     while (n >= 3 && str[2] == '0') {
1029       n--;
1030       str++;
1031     }
1032   }
1033 
1034   if (neg) {  // make room in buf for -
1035     n++;
1036     str--;
1037   }
1038 
1039   if (n > nbuf-1) return "";
1040 
1041   memmove(buf, str, n);
1042   if (neg) {
1043     buf[0] = '-';
1044   }
1045   buf[n] = '\0';
1046   *np = n;
1047   return buf;
1048 }
1049 
1050 bool RE2::Arg::parse_long_radix(const char* str,
1051                                 size_t n,
1052                                 void* dest,
1053                                 int radix) {
1054   if (n == 0) return false;
1055   char buf[kMaxNumberLength+1];
1056   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1057   char* end;
1058   errno = 0;
1059   long r = strtol(str, &end, radix);
1060   if (end != str + n) return false;   // Leftover junk
1061   if (errno) return false;
1062   if (dest == NULL) return true;
1063   *(reinterpret_cast<long*>(dest)) = r;
1064   return true;
1065 }
1066 
1067 bool RE2::Arg::parse_ulong_radix(const char* str,
1068                                  size_t n,
1069                                  void* dest,
1070                                  int radix) {
1071   if (n == 0) return false;
1072   char buf[kMaxNumberLength+1];
1073   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1074   if (str[0] == '-') {
1075     // strtoul() will silently accept negative numbers and parse
1076     // them.  This module is more strict and treats them as errors.
1077     return false;
1078   }
1079 
1080   char* end;
1081   errno = 0;
1082   unsigned long r = strtoul(str, &end, radix);
1083   if (end != str + n) return false;   // Leftover junk
1084   if (errno) return false;
1085   if (dest == NULL) return true;
1086   *(reinterpret_cast<unsigned long*>(dest)) = r;
1087   return true;
1088 }
1089 
1090 bool RE2::Arg::parse_short_radix(const char* str,
1091                                  size_t n,
1092                                  void* dest,
1093                                  int radix) {
1094   long r;
1095   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
1096   if ((short)r != r) return false;                         // Out of range
1097   if (dest == NULL) return true;
1098   *(reinterpret_cast<short*>(dest)) = (short)r;
1099   return true;
1100 }
1101 
1102 bool RE2::Arg::parse_ushort_radix(const char* str,
1103                                   size_t n,
1104                                   void* dest,
1105                                   int radix) {
1106   unsigned long r;
1107   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
1108   if ((unsigned short)r != r) return false;                 // Out of range
1109   if (dest == NULL) return true;
1110   *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1111   return true;
1112 }
1113 
1114 bool RE2::Arg::parse_int_radix(const char* str,
1115                                size_t n,
1116                                void* dest,
1117                                int radix) {
1118   long r;
1119   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
1120   if ((int)r != r) return false;                           // Out of range
1121   if (dest == NULL) return true;
1122   *(reinterpret_cast<int*>(dest)) = (int)r;
1123   return true;
1124 }
1125 
1126 bool RE2::Arg::parse_uint_radix(const char* str,
1127                                 size_t n,
1128                                 void* dest,
1129                                 int radix) {
1130   unsigned long r;
1131   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
1132   if ((unsigned int)r != r) return false;                   // Out of range
1133   if (dest == NULL) return true;
1134   *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1135   return true;
1136 }
1137 
1138 bool RE2::Arg::parse_longlong_radix(const char* str,
1139                                     size_t n,
1140                                     void* dest,
1141                                     int radix) {
1142   if (n == 0) return false;
1143   char buf[kMaxNumberLength+1];
1144   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1145   char* end;
1146   errno = 0;
1147   long long r = strtoll(str, &end, radix);
1148   if (end != str + n) return false;   // Leftover junk
1149   if (errno) return false;
1150   if (dest == NULL) return true;
1151   *(reinterpret_cast<long long*>(dest)) = r;
1152   return true;
1153 }
1154 
1155 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1156                                      size_t n,
1157                                      void* dest,
1158                                      int radix) {
1159   if (n == 0) return false;
1160   char buf[kMaxNumberLength+1];
1161   str = TerminateNumber(buf, sizeof buf, str, &n, false);
1162   if (str[0] == '-') {
1163     // strtoull() will silently accept negative numbers and parse
1164     // them.  This module is more strict and treats them as errors.
1165     return false;
1166   }
1167   char* end;
1168   errno = 0;
1169   unsigned long long r = strtoull(str, &end, radix);
1170   if (end != str + n) return false;   // Leftover junk
1171   if (errno) return false;
1172   if (dest == NULL) return true;
1173   *(reinterpret_cast<unsigned long long*>(dest)) = r;
1174   return true;
1175 }
1176 
1177 static bool parse_double_float(const char* str, size_t n, bool isfloat,
1178                                void* dest) {
1179   if (n == 0) return false;
1180   static const int kMaxLength = 200;
1181   char buf[kMaxLength+1];
1182   str = TerminateNumber(buf, sizeof buf, str, &n, true);
1183   char* end;
1184   errno = 0;
1185   double r;
1186   if (isfloat) {
1187     r = strtof(str, &end);
1188   } else {
1189     r = strtod(str, &end);
1190   }
1191   if (end != str + n) return false;   // Leftover junk
1192   if (errno) return false;
1193   if (dest == NULL) return true;
1194   if (isfloat) {
1195     *(reinterpret_cast<float*>(dest)) = (float)r;
1196   } else {
1197     *(reinterpret_cast<double*>(dest)) = r;
1198   }
1199   return true;
1200 }
1201 
1202 bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1203   return parse_double_float(str, n, false, dest);
1204 }
1205 
1206 bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1207   return parse_double_float(str, n, true, dest);
1208 }
1209 
1210 #define DEFINE_INTEGER_PARSER(name)                                            \
1211   bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) {         \
1212     return parse_##name##_radix(str, n, dest, 10);                             \
1213   }                                                                            \
1214   bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) {   \
1215     return parse_##name##_radix(str, n, dest, 16);                             \
1216   }                                                                            \
1217   bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1218     return parse_##name##_radix(str, n, dest, 8);                              \
1219   }                                                                            \
1220   bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n,              \
1221                                        void* dest) {                           \
1222     return parse_##name##_radix(str, n, dest, 0);                              \
1223   }
1224 
1225 DEFINE_INTEGER_PARSER(short);
1226 DEFINE_INTEGER_PARSER(ushort);
1227 DEFINE_INTEGER_PARSER(int);
1228 DEFINE_INTEGER_PARSER(uint);
1229 DEFINE_INTEGER_PARSER(long);
1230 DEFINE_INTEGER_PARSER(ulong);
1231 DEFINE_INTEGER_PARSER(longlong);
1232 DEFINE_INTEGER_PARSER(ulonglong);
1233 
1234 #undef DEFINE_INTEGER_PARSER
1235 
1236 }  // namespace re2
1237