• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2003-2009 Google Inc.  All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
6 // The main changes are the addition of the HitLimit method and
7 // compilation as PCRE in namespace re2.
8 
9 #include <assert.h>
10 #include <ctype.h>
11 #include <errno.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <limits>
15 #include <string>
16 #include <utility>
17 
18 #include "absl/flags/flag.h"
19 #include "absl/log/absl_check.h"
20 #include "absl/log/absl_log.h"
21 #include "absl/strings/str_format.h"
22 #include "util/pcre.h"
23 
24 // Silence warnings about the wacky formatting in the operator() functions.
25 #if defined(__GNUC__)
26 #pragma GCC diagnostic ignored "-Wmisleading-indentation"
27 #endif
28 
29 #define PCREPORT(level) ABSL_LOG(level)
30 
31 // Default PCRE limits.
32 // Defaults chosen to allow a plausible amount of CPU and
33 // not exceed main thread stacks.  Note that other threads
34 // often have smaller stacks, and therefore tightening
35 // regexp_stack_limit may frequently be necessary.
36 ABSL_FLAG(int, regexp_stack_limit, 256 << 10,
37           "default PCRE stack limit (bytes)");
38 ABSL_FLAG(int, regexp_match_limit, 1000000,
39           "default PCRE match limit (function calls)");
40 
41 #ifndef USEPCRE
42 
43 // Fake just enough of the PCRE API to allow this file to build. :)
44 
45 struct pcre_extra {
46   int flags;
47   int match_limit;
48   int match_limit_recursion;
49 };
50 
51 #define PCRE_EXTRA_MATCH_LIMIT 0
52 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
53 #define PCRE_ANCHORED 0
54 #define PCRE_NOTEMPTY 0
55 #define PCRE_ERROR_NOMATCH 1
56 #define PCRE_ERROR_MATCHLIMIT 2
57 #define PCRE_ERROR_RECURSIONLIMIT 3
58 #define PCRE_INFO_CAPTURECOUNT 0
59 
pcre_free(void *)60 void pcre_free(void*) {
61 }
62 
pcre_compile(const char *,int,const char **,int *,const unsigned char *)63 pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
64   return NULL;
65 }
66 
pcre_exec(const pcre *,const pcre_extra *,const char *,int,int,int,int *,int)67 int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
68   return 0;
69 }
70 
pcre_fullinfo(const pcre *,const pcre_extra *,int,void *)71 int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
72   return 0;
73 }
74 
75 #endif
76 
77 namespace re2 {
78 
79 // Maximum number of args we can set
80 static const int kMaxArgs = 16;
81 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
82 
83 // Approximate size of a recursive invocation of PCRE's
84 // internal "match()" frame.  This varies depending on the
85 // compiler and architecture, of course, so the constant is
86 // just a conservative estimate.  To find the exact number,
87 // run regexp_unittest with --regexp_stack_limit=0 under
88 // a debugger and look at the frames when it crashes.
89 // The exact frame size was 656 in production on 2008/02/03.
90 static const int kPCREFrameSize = 700;
91 
92 // Special name for missing C++ arguments.
93 PCRE::Arg PCRE::no_more_args((void*)NULL);
94 
95 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
96 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
97 const PCRE::ConsumeFunctor PCRE::Consume = { };
98 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
99 
100 // If a regular expression has no error, its error_ field points here
101 static const std::string empty_string;
102 
Init(const char * pattern,Option options,int match_limit,int stack_limit,bool report_errors)103 void PCRE::Init(const char* pattern, Option options, int match_limit,
104               int stack_limit, bool report_errors) {
105   pattern_ = pattern;
106   options_ = options;
107   match_limit_ = match_limit;
108   stack_limit_ = stack_limit;
109   hit_limit_ = false;
110   error_ = &empty_string;
111   report_errors_ = report_errors;
112   re_full_ = NULL;
113   re_partial_ = NULL;
114 
115   if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
116     error_ = new std::string("illegal regexp option");
117     PCREPORT(ERROR)
118         << "Error compiling '" << pattern << "': illegal regexp option";
119   } else {
120     re_partial_ = Compile(UNANCHORED);
121     if (re_partial_ != NULL) {
122       re_full_ = Compile(ANCHOR_BOTH);
123     }
124   }
125 }
126 
PCRE(const char * pattern)127 PCRE::PCRE(const char* pattern) {
128   Init(pattern, None, 0, 0, true);
129 }
PCRE(const char * pattern,Option option)130 PCRE::PCRE(const char* pattern, Option option) {
131   Init(pattern, option, 0, 0, true);
132 }
PCRE(const std::string & pattern)133 PCRE::PCRE(const std::string& pattern) {
134   Init(pattern.c_str(), None, 0, 0, true);
135 }
PCRE(const std::string & pattern,Option option)136 PCRE::PCRE(const std::string& pattern, Option option) {
137   Init(pattern.c_str(), option, 0, 0, true);
138 }
PCRE(const std::string & pattern,const PCRE_Options & re_option)139 PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) {
140   Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
141        re_option.stack_limit(), re_option.report_errors());
142 }
143 
PCRE(const char * pattern,const PCRE_Options & re_option)144 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
145   Init(pattern, re_option.option(), re_option.match_limit(),
146        re_option.stack_limit(), re_option.report_errors());
147 }
148 
~PCRE()149 PCRE::~PCRE() {
150   if (re_full_ != NULL)         pcre_free(re_full_);
151   if (re_partial_ != NULL)      pcre_free(re_partial_);
152   if (error_ != &empty_string)  delete error_;
153 }
154 
Compile(Anchor anchor)155 pcre* PCRE::Compile(Anchor anchor) {
156   // Special treatment for anchoring.  This is needed because at
157   // runtime pcre only provides an option for anchoring at the
158   // beginning of a string.
159   //
160   // There are three types of anchoring we want:
161   //    UNANCHORED      Compile the original pattern, and use
162   //                    a pcre unanchored match.
163   //    ANCHOR_START    Compile the original pattern, and use
164   //                    a pcre anchored match.
165   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
166   //                    and use a pcre anchored match.
167 
168   const char* error = "";
169   int eoffset;
170   pcre* re;
171   if (anchor != ANCHOR_BOTH) {
172     re = pcre_compile(pattern_.c_str(),
173                       (options_ & EnabledCompileOptions),
174                       &error, &eoffset, NULL);
175   } else {
176     // Tack a '\z' at the end of PCRE.  Parenthesize it first so that
177     // the '\z' applies to all top-level alternatives in the regexp.
178     std::string wrapped = "(?:";  // A non-counting grouping operator
179     wrapped += pattern_;
180     wrapped += ")\\z";
181     re = pcre_compile(wrapped.c_str(),
182                       (options_ & EnabledCompileOptions),
183                       &error, &eoffset, NULL);
184   }
185   if (re == NULL) {
186     if (error_ == &empty_string) error_ = new std::string(error);
187     PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
188   }
189   return re;
190 }
191 
192 /***** Convenience interfaces *****/
193 
operator ()(absl::string_view text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const194 bool PCRE::FullMatchFunctor::operator()(
195     absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
196     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
197     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
198     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
199   const Arg* args[kMaxArgs];
200   int n = 0;
201   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
202   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
203   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
204   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
205   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
206   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
207   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
208   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
209   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
210   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
211   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
212   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
213   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
214   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
215   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
216   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
217 done:
218 
219   size_t consumed;
220   int vec[kVecSize] = {};
221   return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
222 }
223 
operator ()(absl::string_view text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const224 bool PCRE::PartialMatchFunctor::operator()(
225     absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
226     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
227     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
228     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
229   const Arg* args[kMaxArgs];
230   int n = 0;
231   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
232   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
233   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
234   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
235   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
236   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
237   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
238   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
239   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
240   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
241   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
242   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
243   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
244   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
245   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
246   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
247 done:
248 
249   size_t consumed;
250   int vec[kVecSize] = {};
251   return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
252 }
253 
operator ()(absl::string_view * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const254 bool PCRE::ConsumeFunctor::operator()(
255     absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
256     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
257     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
258     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
259   const Arg* args[kMaxArgs];
260   int n = 0;
261   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
262   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
263   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
264   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
265   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
266   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
267   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
268   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
269   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
270   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
271   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
272   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
273   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
274   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
275   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
276   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
277 done:
278 
279   size_t consumed;
280   int vec[kVecSize] = {};
281   if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
282                           args, n, vec, kVecSize)) {
283     input->remove_prefix(consumed);
284     return true;
285   } else {
286     return false;
287   }
288 }
289 
operator ()(absl::string_view * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const290 bool PCRE::FindAndConsumeFunctor::operator()(
291     absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
292     const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
293     const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
294     const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
295   const Arg* args[kMaxArgs];
296   int n = 0;
297   if (&a0 == &no_more_args)  goto done; args[n++] = &a0;
298   if (&a1 == &no_more_args)  goto done; args[n++] = &a1;
299   if (&a2 == &no_more_args)  goto done; args[n++] = &a2;
300   if (&a3 == &no_more_args)  goto done; args[n++] = &a3;
301   if (&a4 == &no_more_args)  goto done; args[n++] = &a4;
302   if (&a5 == &no_more_args)  goto done; args[n++] = &a5;
303   if (&a6 == &no_more_args)  goto done; args[n++] = &a6;
304   if (&a7 == &no_more_args)  goto done; args[n++] = &a7;
305   if (&a8 == &no_more_args)  goto done; args[n++] = &a8;
306   if (&a9 == &no_more_args)  goto done; args[n++] = &a9;
307   if (&a10 == &no_more_args) goto done; args[n++] = &a10;
308   if (&a11 == &no_more_args) goto done; args[n++] = &a11;
309   if (&a12 == &no_more_args) goto done; args[n++] = &a12;
310   if (&a13 == &no_more_args) goto done; args[n++] = &a13;
311   if (&a14 == &no_more_args) goto done; args[n++] = &a14;
312   if (&a15 == &no_more_args) goto done; args[n++] = &a15;
313 done:
314 
315   size_t consumed;
316   int vec[kVecSize] = {};
317   if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
318                           args, n, vec, kVecSize)) {
319     input->remove_prefix(consumed);
320     return true;
321   } else {
322     return false;
323   }
324 }
325 
Replace(std::string * str,const PCRE & pattern,absl::string_view rewrite)326 bool PCRE::Replace(std::string* str, const PCRE& pattern,
327                    absl::string_view rewrite) {
328   int vec[kVecSize] = {};
329   int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
330   if (matches == 0)
331     return false;
332 
333   std::string s;
334   if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
335     return false;
336 
337   assert(vec[0] >= 0);
338   assert(vec[1] >= 0);
339   str->replace(vec[0], vec[1] - vec[0], s);
340   return true;
341 }
342 
GlobalReplace(std::string * str,const PCRE & pattern,absl::string_view rewrite)343 int PCRE::GlobalReplace(std::string* str, const PCRE& pattern,
344                         absl::string_view rewrite) {
345   int count = 0;
346   int vec[kVecSize] = {};
347   std::string out;
348   size_t start = 0;
349   bool last_match_was_empty_string = false;
350 
351   while (start <= str->size()) {
352     // If the previous match was for the empty string, we shouldn't
353     // just match again: we'll match in the same way and get an
354     // infinite loop.  Instead, we do the match in a special way:
355     // anchored -- to force another try at the same position --
356     // and with a flag saying that this time, ignore empty matches.
357     // If this special match returns, that means there's a non-empty
358     // match at this position as well, and we can continue.  If not,
359     // we do what perl does, and just advance by one.
360     // Notice that perl prints '@@@' for this;
361     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
362     int matches;
363     if (last_match_was_empty_string) {
364       matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
365                                  vec, kVecSize);
366       if (matches <= 0) {
367         if (start < str->size())
368           out.push_back((*str)[start]);
369         start++;
370         last_match_was_empty_string = false;
371         continue;
372       }
373     } else {
374       matches = pattern.TryMatch(*str, start, UNANCHORED, true,
375                                  vec, kVecSize);
376       if (matches <= 0)
377         break;
378     }
379     size_t matchstart = vec[0], matchend = vec[1];
380     assert(matchstart >= start);
381     assert(matchend >= matchstart);
382 
383     out.append(*str, start, matchstart - start);
384     pattern.Rewrite(&out, rewrite, *str, vec, matches);
385     start = matchend;
386     count++;
387     last_match_was_empty_string = (matchstart == matchend);
388   }
389 
390   if (count == 0)
391     return 0;
392 
393   if (start < str->size())
394     out.append(*str, start, str->size() - start);
395   using std::swap;
396   swap(out, *str);
397   return count;
398 }
399 
Extract(absl::string_view text,const PCRE & pattern,absl::string_view rewrite,std::string * out)400 bool PCRE::Extract(absl::string_view text, const PCRE& pattern,
401                    absl::string_view rewrite, std::string* out) {
402   int vec[kVecSize] = {};
403   int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
404   if (matches == 0)
405     return false;
406   out->clear();
407   return pattern.Rewrite(out, rewrite, text, vec, matches);
408 }
409 
QuoteMeta(absl::string_view unquoted)410 std::string PCRE::QuoteMeta(absl::string_view unquoted) {
411   std::string result;
412   result.reserve(unquoted.size() << 1);
413 
414   // Escape any ascii character not in [A-Za-z_0-9].
415   //
416   // Note that it's legal to escape a character even if it has no
417   // special meaning in a regular expression -- so this function does
418   // that.  (This also makes it identical to the perl function of the
419   // same name except for the null-character special case;
420   // see `perldoc -f quotemeta`.)
421   for (size_t ii = 0; ii < unquoted.size(); ++ii) {
422     // Note that using 'isalnum' here raises the benchmark time from
423     // 32ns to 58ns:
424     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
425         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
426         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
427         unquoted[ii] != '_' &&
428         // If this is the part of a UTF8 or Latin1 character, we need
429         // to copy this byte without escaping.  Experimentally this is
430         // what works correctly with the regexp library.
431         !(unquoted[ii] & 128)) {
432       if (unquoted[ii] == '\0') {  // Special handling for null chars.
433         // Can't use "\\0" since the next character might be a digit.
434         result += "\\x00";
435         continue;
436       }
437       result += '\\';
438     }
439     result += unquoted[ii];
440   }
441 
442   return result;
443 }
444 
445 /***** Actual matching and rewriting code *****/
446 
HitLimit()447 bool PCRE::HitLimit() {
448   return hit_limit_ != 0;
449 }
450 
ClearHitLimit()451 void PCRE::ClearHitLimit() {
452   hit_limit_ = 0;
453 }
454 
TryMatch(absl::string_view text,size_t startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const455 int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
456                    bool empty_ok, int* vec, int vecsize) const {
457   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
458   if (re == NULL) {
459     PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
460     return 0;
461   }
462 
463   int match_limit = match_limit_;
464   if (match_limit <= 0) {
465     match_limit = absl::GetFlag(FLAGS_regexp_match_limit);
466   }
467 
468   int stack_limit = stack_limit_;
469   if (stack_limit <= 0) {
470     stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit);
471   }
472 
473   pcre_extra extra = { 0 };
474   if (match_limit > 0) {
475     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
476     extra.match_limit = match_limit;
477   }
478   if (stack_limit > 0) {
479     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
480     extra.match_limit_recursion = stack_limit / kPCREFrameSize;
481   }
482 
483   int options = 0;
484   if (anchor != UNANCHORED)
485     options |= PCRE_ANCHORED;
486   if (!empty_ok)
487     options |= PCRE_NOTEMPTY;
488 
489   int rc = pcre_exec(re,              // The regular expression object
490                      &extra,
491                      (text.data() == NULL) ? "" : text.data(),
492                      static_cast<int>(text.size()),
493                      static_cast<int>(startpos),
494                      options,
495                      vec,
496                      vecsize);
497 
498   // Handle errors
499   if (rc == 0) {
500     // pcre_exec() returns 0 as a special case when the number of
501     // capturing subpatterns exceeds the size of the vector.
502     // When this happens, there is a match and the output vector
503     // is filled, but we miss out on the positions of the extra subpatterns.
504     rc = vecsize / 2;
505   } else if (rc < 0) {
506     switch (rc) {
507       case PCRE_ERROR_NOMATCH:
508         return 0;
509       case PCRE_ERROR_MATCHLIMIT:
510         // Writing to hit_limit is not safe if multiple threads
511         // are using the PCRE, but the flag is only intended
512         // for use by unit tests anyway, so we let it go.
513         hit_limit_ = true;
514         PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
515                         << " when matching '" << pattern_ << "'"
516                         << " against text that is " << text.size() << " bytes.";
517         return 0;
518       case PCRE_ERROR_RECURSIONLIMIT:
519         // See comment about hit_limit above.
520         hit_limit_ = true;
521         PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
522                         << " when matching '" << pattern_ << "'"
523                         << " against text that is " << text.size() << " bytes.";
524         return 0;
525       default:
526         // There are other return codes from pcre.h :
527         // PCRE_ERROR_NULL           (-2)
528         // PCRE_ERROR_BADOPTION      (-3)
529         // PCRE_ERROR_BADMAGIC       (-4)
530         // PCRE_ERROR_UNKNOWN_NODE   (-5)
531         // PCRE_ERROR_NOMEMORY       (-6)
532         // PCRE_ERROR_NOSUBSTRING    (-7)
533         // ...
534         PCREPORT(ERROR) << "Unexpected return code: " << rc
535                       << " when matching '" << pattern_ << "'"
536                       << ", re=" << re
537                       << ", text=" << text
538                       << ", vec=" << vec
539                       << ", vecsize=" << vecsize;
540         return 0;
541     }
542   }
543 
544   return rc;
545 }
546 
DoMatchImpl(absl::string_view text,Anchor anchor,size_t * consumed,const Arg * const * args,int n,int * vec,int vecsize) const547 bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
548                        const Arg* const* args, int n, int* vec,
549                        int vecsize) const {
550   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
551   if (NumberOfCapturingGroups() < n) {
552     // RE has fewer capturing groups than number of Arg pointers passed in.
553     return false;
554   }
555 
556   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
557   assert(matches >= 0);  // TryMatch never returns negatives
558   if (matches == 0)
559     return false;
560 
561   *consumed = vec[1];
562 
563   if (n == 0 || args == NULL) {
564     // We are not interested in results
565     return true;
566   }
567 
568   // If we got here, we must have matched the whole pattern.
569   // We do not need (can not do) any more checks on the value of 'matches' here
570   // -- see the comment for TryMatch.
571   for (int i = 0; i < n; i++) {
572     const int start = vec[2*(i+1)];
573     const int limit = vec[2*(i+1)+1];
574 
575     // Avoid invoking undefined behavior when text.data() happens
576     // to be null and start happens to be -1, the latter being the
577     // case for an unmatched subexpression. Even if text.data() is
578     // not null, pointing one byte before was a longstanding bug.
579     const char* addr = NULL;
580     if (start != -1) {
581       addr = text.data() + start;
582     }
583 
584     if (!args[i]->Parse(addr, limit-start)) {
585       // TODO: Should we indicate what the error was?
586       return false;
587     }
588   }
589 
590   return true;
591 }
592 
DoMatch(absl::string_view text,Anchor anchor,size_t * consumed,const Arg * const args[],int n) const593 bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
594                    const Arg* const args[], int n) const {
595   assert(n >= 0);
596   const int vecsize = (1 + n) * 3;  // results + PCRE workspace
597                                     // (as for kVecSize)
598   int* vec = new int[vecsize];
599   bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
600   delete[] vec;
601   return b;
602 }
603 
Rewrite(std::string * out,absl::string_view rewrite,absl::string_view text,int * vec,int veclen) const604 bool PCRE::Rewrite(std::string* out, absl::string_view rewrite,
605                    absl::string_view text, int* vec, int veclen) const {
606   int number_of_capturing_groups = NumberOfCapturingGroups();
607   for (const char *s = rewrite.data(), *end = s + rewrite.size();
608        s < end; s++) {
609     int c = *s;
610     if (c == '\\') {
611       c = *++s;
612       if (isdigit(c)) {
613         int n = (c - '0');
614         if (n >= veclen) {
615           if (n <= number_of_capturing_groups) {
616             // unmatched optional capturing group. treat
617             // its value as empty string; i.e., nothing to append.
618           } else {
619             PCREPORT(ERROR) << "requested group " << n
620                           << " in regexp " << rewrite.data();
621             return false;
622           }
623         }
624         int start = vec[2 * n];
625         if (start >= 0)
626           out->append(text.data() + start, vec[2 * n + 1] - start);
627       } else if (c == '\\') {
628         out->push_back('\\');
629       } else {
630         PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
631         return false;
632       }
633     } else {
634       out->push_back(c);
635     }
636   }
637   return true;
638 }
639 
CheckRewriteString(absl::string_view rewrite,std::string * error) const640 bool PCRE::CheckRewriteString(absl::string_view rewrite,
641                               std::string* error) const {
642   int max_token = -1;
643   for (const char *s = rewrite.data(), *end = s + rewrite.size();
644        s < end; s++) {
645     int c = *s;
646     if (c != '\\') {
647       continue;
648     }
649     if (++s == end) {
650       *error = "Rewrite schema error: '\\' not allowed at end.";
651       return false;
652     }
653     c = *s;
654     if (c == '\\') {
655       continue;
656     }
657     if (!isdigit(c)) {
658       *error = "Rewrite schema error: "
659                "'\\' must be followed by a digit or '\\'.";
660       return false;
661     }
662     int n = (c - '0');
663     if (max_token < n) {
664       max_token = n;
665     }
666   }
667 
668   if (max_token > NumberOfCapturingGroups()) {
669     *error = absl::StrFormat(
670         "Rewrite schema requests %d matches, but the regexp only has %d "
671         "parenthesized subexpressions.",
672         max_token, NumberOfCapturingGroups());
673     return false;
674   }
675   return true;
676 }
677 
678 // Return the number of capturing subpatterns, or -1 if the
679 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const680 int PCRE::NumberOfCapturingGroups() const {
681   if (re_partial_ == NULL) return -1;
682 
683   int result;
684   int rc = pcre_fullinfo(re_partial_,       // The regular expression object
685                          NULL,              // We did not study the pattern
686                          PCRE_INFO_CAPTURECOUNT,
687                          &result);
688   if (rc != 0) {
689     PCREPORT(ERROR) << "Unexpected return code: " << rc;
690     return -1;
691   }
692   return result;
693 }
694 
695 
696 /***** Parsers for various types *****/
697 
parse_null(const char * str,size_t n,void * dest)698 bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
699   // We fail if somebody asked us to store into a non-NULL void* pointer
700   return (dest == NULL);
701 }
702 
parse_string(const char * str,size_t n,void * dest)703 bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
704   if (dest == NULL) return true;
705   reinterpret_cast<std::string*>(dest)->assign(str, n);
706   return true;
707 }
708 
parse_string_view(const char * str,size_t n,void * dest)709 bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) {
710   if (dest == NULL) return true;
711   *(reinterpret_cast<absl::string_view*>(dest)) = absl::string_view(str, n);
712   return true;
713 }
714 
parse_char(const char * str,size_t n,void * dest)715 bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
716   if (n != 1) return false;
717   if (dest == NULL) return true;
718   *(reinterpret_cast<char*>(dest)) = str[0];
719   return true;
720 }
721 
parse_schar(const char * str,size_t n,void * dest)722 bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
723   if (n != 1) return false;
724   if (dest == NULL) return true;
725   *(reinterpret_cast<signed char*>(dest)) = str[0];
726   return true;
727 }
728 
parse_uchar(const char * str,size_t n,void * dest)729 bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
730   if (n != 1) return false;
731   if (dest == NULL) return true;
732   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
733   return true;
734 }
735 
736 // Largest number spec that we are willing to parse
737 static const int kMaxNumberLength = 32;
738 
739 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
740 // PCREQUIPCRES "n > 0"
741 // Copies "str" into "buf" and null-terminates if necessary.
742 // Returns one of:
743 //      a. "str" if no termination is needed
744 //      b. "buf" if the string was copied and null-terminated
745 //      c. "" if the input was invalid and has no hope of being parsed
TerminateNumber(char * buf,const char * str,size_t n)746 static const char* TerminateNumber(char* buf, const char* str, size_t n) {
747   if ((n > 0) && isspace(*str)) {
748     // We are less forgiving than the strtoxxx() routines and do not
749     // allow leading spaces.
750     return "";
751   }
752 
753   // See if the character right after the input text may potentially
754   // look like a digit.
755   if (isdigit(str[n]) ||
756       ((str[n] >= 'a') && (str[n] <= 'f')) ||
757       ((str[n] >= 'A') && (str[n] <= 'F'))) {
758     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
759     memcpy(buf, str, n);
760     buf[n] = '\0';
761     return buf;
762   } else {
763     // We can parse right out of the supplied string, so return it.
764     return str;
765   }
766 }
767 
parse_long_radix(const char * str,size_t n,void * dest,int radix)768 bool PCRE::Arg::parse_long_radix(const char* str,
769                                  size_t n,
770                                  void* dest,
771                                  int radix) {
772   if (n == 0) return false;
773   char buf[kMaxNumberLength+1];
774   str = TerminateNumber(buf, str, n);
775   char* end;
776   errno = 0;
777   long r = strtol(str, &end, radix);
778   if (end != str + n) return false;   // Leftover junk
779   if (errno) return false;
780   if (dest == NULL) return true;
781   *(reinterpret_cast<long*>(dest)) = r;
782   return true;
783 }
784 
parse_ulong_radix(const char * str,size_t n,void * dest,int radix)785 bool PCRE::Arg::parse_ulong_radix(const char* str,
786                                   size_t n,
787                                   void* dest,
788                                   int radix) {
789   if (n == 0) return false;
790   char buf[kMaxNumberLength+1];
791   str = TerminateNumber(buf, str, n);
792   if (str[0] == '-') {
793     // strtoul() will silently accept negative numbers and parse
794     // them.  This module is more strict and treats them as errors.
795     return false;
796   }
797 
798   char* end;
799   errno = 0;
800   unsigned long r = strtoul(str, &end, radix);
801   if (end != str + n) return false;   // Leftover junk
802   if (errno) return false;
803   if (dest == NULL) return true;
804   *(reinterpret_cast<unsigned long*>(dest)) = r;
805   return true;
806 }
807 
parse_short_radix(const char * str,size_t n,void * dest,int radix)808 bool PCRE::Arg::parse_short_radix(const char* str,
809                                   size_t n,
810                                   void* dest,
811                                   int radix) {
812   long r;
813   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
814   if ((short)r != r) return false;                         // Out of range
815   if (dest == NULL) return true;
816   *(reinterpret_cast<short*>(dest)) = (short)r;
817   return true;
818 }
819 
parse_ushort_radix(const char * str,size_t n,void * dest,int radix)820 bool PCRE::Arg::parse_ushort_radix(const char* str,
821                                    size_t n,
822                                    void* dest,
823                                    int radix) {
824   unsigned long r;
825   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
826   if ((unsigned short)r != r) return false;                 // Out of range
827   if (dest == NULL) return true;
828   *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
829   return true;
830 }
831 
parse_int_radix(const char * str,size_t n,void * dest,int radix)832 bool PCRE::Arg::parse_int_radix(const char* str,
833                                 size_t n,
834                                 void* dest,
835                                 int radix) {
836   long r;
837   if (!parse_long_radix(str, n, &r, radix)) return false;  // Could not parse
838   if ((int)r != r) return false;                           // Out of range
839   if (dest == NULL) return true;
840   *(reinterpret_cast<int*>(dest)) = (int)r;
841   return true;
842 }
843 
parse_uint_radix(const char * str,size_t n,void * dest,int radix)844 bool PCRE::Arg::parse_uint_radix(const char* str,
845                                  size_t n,
846                                  void* dest,
847                                  int radix) {
848   unsigned long r;
849   if (!parse_ulong_radix(str, n, &r, radix)) return false;  // Could not parse
850   if ((unsigned int)r != r) return false;                   // Out of range
851   if (dest == NULL) return true;
852   *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
853   return true;
854 }
855 
parse_longlong_radix(const char * str,size_t n,void * dest,int radix)856 bool PCRE::Arg::parse_longlong_radix(const char* str,
857                                      size_t n,
858                                      void* dest,
859                                      int radix) {
860   if (n == 0) return false;
861   char buf[kMaxNumberLength+1];
862   str = TerminateNumber(buf, str, n);
863   char* end;
864   errno = 0;
865   long long r = strtoll(str, &end, radix);
866   if (end != str + n) return false;   // Leftover junk
867   if (errno) return false;
868   if (dest == NULL) return true;
869   *(reinterpret_cast<long long*>(dest)) = r;
870   return true;
871 }
872 
parse_ulonglong_radix(const char * str,size_t n,void * dest,int radix)873 bool PCRE::Arg::parse_ulonglong_radix(const char* str,
874                                       size_t n,
875                                       void* dest,
876                                       int radix) {
877   if (n == 0) return false;
878   char buf[kMaxNumberLength+1];
879   str = TerminateNumber(buf, str, n);
880   if (str[0] == '-') {
881     // strtoull() will silently accept negative numbers and parse
882     // them.  This module is more strict and treats them as errors.
883     return false;
884   }
885   char* end;
886   errno = 0;
887   unsigned long long r = strtoull(str, &end, radix);
888   if (end != str + n) return false;   // Leftover junk
889   if (errno) return false;
890   if (dest == NULL) return true;
891   *(reinterpret_cast<unsigned long long*>(dest)) = r;
892   return true;
893 }
894 
parse_double_float(const char * str,size_t n,bool isfloat,void * dest)895 static bool parse_double_float(const char* str, size_t n, bool isfloat,
896                                void* dest) {
897   if (n == 0) return false;
898   static const int kMaxLength = 200;
899   char buf[kMaxLength];
900   if (n >= kMaxLength) return false;
901   memcpy(buf, str, n);
902   buf[n] = '\0';
903   char* end;
904   errno = 0;
905   double r;
906   if (isfloat) {
907     r = strtof(buf, &end);
908   } else {
909     r = strtod(buf, &end);
910   }
911   if (end != buf + n) return false;   // Leftover junk
912   if (errno) return false;
913   if (dest == NULL) return true;
914   if (isfloat) {
915     *(reinterpret_cast<float*>(dest)) = (float)r;
916   } else {
917     *(reinterpret_cast<double*>(dest)) = r;
918   }
919   return true;
920 }
921 
parse_double(const char * str,size_t n,void * dest)922 bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
923   return parse_double_float(str, n, false, dest);
924 }
925 
parse_float(const char * str,size_t n,void * dest)926 bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
927   return parse_double_float(str, n, true, dest);
928 }
929 
930 #define DEFINE_INTEGER_PARSER(name)                                           \
931   bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) {       \
932     return parse_##name##_radix(str, n, dest, 10);                            \
933   }                                                                           \
934   bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
935     return parse_##name##_radix(str, n, dest, 16);                            \
936   }                                                                           \
937   bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n,             \
938                                        void* dest) {                          \
939     return parse_##name##_radix(str, n, dest, 8);                             \
940   }                                                                           \
941   bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n,            \
942                                         void* dest) {                         \
943     return parse_##name##_radix(str, n, dest, 0);                             \
944   }
945 
946 DEFINE_INTEGER_PARSER(short);
947 DEFINE_INTEGER_PARSER(ushort);
948 DEFINE_INTEGER_PARSER(int);
949 DEFINE_INTEGER_PARSER(uint);
950 DEFINE_INTEGER_PARSER(long);
951 DEFINE_INTEGER_PARSER(ulong);
952 DEFINE_INTEGER_PARSER(longlong);
953 DEFINE_INTEGER_PARSER(ulonglong);
954 
955 #undef DEFINE_INTEGER_PARSER
956 
957 }  // namespace re2
958