1 // Copyright 2003-2009 Google Inc. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // This is a variant of PCRE's pcrecpp.cc, originally written at Google.
6 // The main changes are the addition of the HitLimit method and
7 // compilation as PCRE in namespace re2.
8
9 #include <assert.h>
10 #include <ctype.h>
11 #include <errno.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <limits>
15 #include <string>
16 #include <utility>
17
18 #include "absl/flags/flag.h"
19 #include "absl/log/absl_check.h"
20 #include "absl/log/absl_log.h"
21 #include "absl/strings/str_format.h"
22 #include "util/pcre.h"
23
24 // Silence warnings about the wacky formatting in the operator() functions.
25 #if defined(__GNUC__)
26 #pragma GCC diagnostic ignored "-Wmisleading-indentation"
27 #endif
28
29 #define PCREPORT(level) ABSL_LOG(level)
30
31 // Default PCRE limits.
32 // Defaults chosen to allow a plausible amount of CPU and
33 // not exceed main thread stacks. Note that other threads
34 // often have smaller stacks, and therefore tightening
35 // regexp_stack_limit may frequently be necessary.
36 ABSL_FLAG(int, regexp_stack_limit, 256 << 10,
37 "default PCRE stack limit (bytes)");
38 ABSL_FLAG(int, regexp_match_limit, 1000000,
39 "default PCRE match limit (function calls)");
40
41 #ifndef USEPCRE
42
43 // Fake just enough of the PCRE API to allow this file to build. :)
44
45 struct pcre_extra {
46 int flags;
47 int match_limit;
48 int match_limit_recursion;
49 };
50
51 #define PCRE_EXTRA_MATCH_LIMIT 0
52 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
53 #define PCRE_ANCHORED 0
54 #define PCRE_NOTEMPTY 0
55 #define PCRE_ERROR_NOMATCH 1
56 #define PCRE_ERROR_MATCHLIMIT 2
57 #define PCRE_ERROR_RECURSIONLIMIT 3
58 #define PCRE_INFO_CAPTURECOUNT 0
59
pcre_free(void *)60 void pcre_free(void*) {
61 }
62
pcre_compile(const char *,int,const char **,int *,const unsigned char *)63 pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
64 return NULL;
65 }
66
pcre_exec(const pcre *,const pcre_extra *,const char *,int,int,int,int *,int)67 int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
68 return 0;
69 }
70
pcre_fullinfo(const pcre *,const pcre_extra *,int,void *)71 int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
72 return 0;
73 }
74
75 #endif
76
77 namespace re2 {
78
79 // Maximum number of args we can set
80 static const int kMaxArgs = 16;
81 static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
82
83 // Approximate size of a recursive invocation of PCRE's
84 // internal "match()" frame. This varies depending on the
85 // compiler and architecture, of course, so the constant is
86 // just a conservative estimate. To find the exact number,
87 // run regexp_unittest with --regexp_stack_limit=0 under
88 // a debugger and look at the frames when it crashes.
89 // The exact frame size was 656 in production on 2008/02/03.
90 static const int kPCREFrameSize = 700;
91
92 // Special name for missing C++ arguments.
93 PCRE::Arg PCRE::no_more_args((void*)NULL);
94
95 const PCRE::PartialMatchFunctor PCRE::PartialMatch = { };
96 const PCRE::FullMatchFunctor PCRE::FullMatch = { } ;
97 const PCRE::ConsumeFunctor PCRE::Consume = { };
98 const PCRE::FindAndConsumeFunctor PCRE::FindAndConsume = { };
99
100 // If a regular expression has no error, its error_ field points here
101 static const std::string empty_string;
102
Init(const char * pattern,Option options,int match_limit,int stack_limit,bool report_errors)103 void PCRE::Init(const char* pattern, Option options, int match_limit,
104 int stack_limit, bool report_errors) {
105 pattern_ = pattern;
106 options_ = options;
107 match_limit_ = match_limit;
108 stack_limit_ = stack_limit;
109 hit_limit_ = false;
110 error_ = &empty_string;
111 report_errors_ = report_errors;
112 re_full_ = NULL;
113 re_partial_ = NULL;
114
115 if (options & ~(EnabledCompileOptions | EnabledExecOptions)) {
116 error_ = new std::string("illegal regexp option");
117 PCREPORT(ERROR)
118 << "Error compiling '" << pattern << "': illegal regexp option";
119 } else {
120 re_partial_ = Compile(UNANCHORED);
121 if (re_partial_ != NULL) {
122 re_full_ = Compile(ANCHOR_BOTH);
123 }
124 }
125 }
126
PCRE(const char * pattern)127 PCRE::PCRE(const char* pattern) {
128 Init(pattern, None, 0, 0, true);
129 }
PCRE(const char * pattern,Option option)130 PCRE::PCRE(const char* pattern, Option option) {
131 Init(pattern, option, 0, 0, true);
132 }
PCRE(const std::string & pattern)133 PCRE::PCRE(const std::string& pattern) {
134 Init(pattern.c_str(), None, 0, 0, true);
135 }
PCRE(const std::string & pattern,Option option)136 PCRE::PCRE(const std::string& pattern, Option option) {
137 Init(pattern.c_str(), option, 0, 0, true);
138 }
PCRE(const std::string & pattern,const PCRE_Options & re_option)139 PCRE::PCRE(const std::string& pattern, const PCRE_Options& re_option) {
140 Init(pattern.c_str(), re_option.option(), re_option.match_limit(),
141 re_option.stack_limit(), re_option.report_errors());
142 }
143
PCRE(const char * pattern,const PCRE_Options & re_option)144 PCRE::PCRE(const char *pattern, const PCRE_Options& re_option) {
145 Init(pattern, re_option.option(), re_option.match_limit(),
146 re_option.stack_limit(), re_option.report_errors());
147 }
148
~PCRE()149 PCRE::~PCRE() {
150 if (re_full_ != NULL) pcre_free(re_full_);
151 if (re_partial_ != NULL) pcre_free(re_partial_);
152 if (error_ != &empty_string) delete error_;
153 }
154
Compile(Anchor anchor)155 pcre* PCRE::Compile(Anchor anchor) {
156 // Special treatment for anchoring. This is needed because at
157 // runtime pcre only provides an option for anchoring at the
158 // beginning of a string.
159 //
160 // There are three types of anchoring we want:
161 // UNANCHORED Compile the original pattern, and use
162 // a pcre unanchored match.
163 // ANCHOR_START Compile the original pattern, and use
164 // a pcre anchored match.
165 // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
166 // and use a pcre anchored match.
167
168 const char* error = "";
169 int eoffset;
170 pcre* re;
171 if (anchor != ANCHOR_BOTH) {
172 re = pcre_compile(pattern_.c_str(),
173 (options_ & EnabledCompileOptions),
174 &error, &eoffset, NULL);
175 } else {
176 // Tack a '\z' at the end of PCRE. Parenthesize it first so that
177 // the '\z' applies to all top-level alternatives in the regexp.
178 std::string wrapped = "(?:"; // A non-counting grouping operator
179 wrapped += pattern_;
180 wrapped += ")\\z";
181 re = pcre_compile(wrapped.c_str(),
182 (options_ & EnabledCompileOptions),
183 &error, &eoffset, NULL);
184 }
185 if (re == NULL) {
186 if (error_ == &empty_string) error_ = new std::string(error);
187 PCREPORT(ERROR) << "Error compiling '" << pattern_ << "': " << error;
188 }
189 return re;
190 }
191
192 /***** Convenience interfaces *****/
193
operator ()(absl::string_view text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const194 bool PCRE::FullMatchFunctor::operator()(
195 absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
196 const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
197 const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
198 const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
199 const Arg* args[kMaxArgs];
200 int n = 0;
201 if (&a0 == &no_more_args) goto done; args[n++] = &a0;
202 if (&a1 == &no_more_args) goto done; args[n++] = &a1;
203 if (&a2 == &no_more_args) goto done; args[n++] = &a2;
204 if (&a3 == &no_more_args) goto done; args[n++] = &a3;
205 if (&a4 == &no_more_args) goto done; args[n++] = &a4;
206 if (&a5 == &no_more_args) goto done; args[n++] = &a5;
207 if (&a6 == &no_more_args) goto done; args[n++] = &a6;
208 if (&a7 == &no_more_args) goto done; args[n++] = &a7;
209 if (&a8 == &no_more_args) goto done; args[n++] = &a8;
210 if (&a9 == &no_more_args) goto done; args[n++] = &a9;
211 if (&a10 == &no_more_args) goto done; args[n++] = &a10;
212 if (&a11 == &no_more_args) goto done; args[n++] = &a11;
213 if (&a12 == &no_more_args) goto done; args[n++] = &a12;
214 if (&a13 == &no_more_args) goto done; args[n++] = &a13;
215 if (&a14 == &no_more_args) goto done; args[n++] = &a14;
216 if (&a15 == &no_more_args) goto done; args[n++] = &a15;
217 done:
218
219 size_t consumed;
220 int vec[kVecSize] = {};
221 return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
222 }
223
operator ()(absl::string_view text,const PCRE & re,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const224 bool PCRE::PartialMatchFunctor::operator()(
225 absl::string_view text, const PCRE& re, const Arg& a0, const Arg& a1,
226 const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
227 const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
228 const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
229 const Arg* args[kMaxArgs];
230 int n = 0;
231 if (&a0 == &no_more_args) goto done; args[n++] = &a0;
232 if (&a1 == &no_more_args) goto done; args[n++] = &a1;
233 if (&a2 == &no_more_args) goto done; args[n++] = &a2;
234 if (&a3 == &no_more_args) goto done; args[n++] = &a3;
235 if (&a4 == &no_more_args) goto done; args[n++] = &a4;
236 if (&a5 == &no_more_args) goto done; args[n++] = &a5;
237 if (&a6 == &no_more_args) goto done; args[n++] = &a6;
238 if (&a7 == &no_more_args) goto done; args[n++] = &a7;
239 if (&a8 == &no_more_args) goto done; args[n++] = &a8;
240 if (&a9 == &no_more_args) goto done; args[n++] = &a9;
241 if (&a10 == &no_more_args) goto done; args[n++] = &a10;
242 if (&a11 == &no_more_args) goto done; args[n++] = &a11;
243 if (&a12 == &no_more_args) goto done; args[n++] = &a12;
244 if (&a13 == &no_more_args) goto done; args[n++] = &a13;
245 if (&a14 == &no_more_args) goto done; args[n++] = &a14;
246 if (&a15 == &no_more_args) goto done; args[n++] = &a15;
247 done:
248
249 size_t consumed;
250 int vec[kVecSize] = {};
251 return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
252 }
253
operator ()(absl::string_view * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const254 bool PCRE::ConsumeFunctor::operator()(
255 absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
256 const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
257 const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
258 const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
259 const Arg* args[kMaxArgs];
260 int n = 0;
261 if (&a0 == &no_more_args) goto done; args[n++] = &a0;
262 if (&a1 == &no_more_args) goto done; args[n++] = &a1;
263 if (&a2 == &no_more_args) goto done; args[n++] = &a2;
264 if (&a3 == &no_more_args) goto done; args[n++] = &a3;
265 if (&a4 == &no_more_args) goto done; args[n++] = &a4;
266 if (&a5 == &no_more_args) goto done; args[n++] = &a5;
267 if (&a6 == &no_more_args) goto done; args[n++] = &a6;
268 if (&a7 == &no_more_args) goto done; args[n++] = &a7;
269 if (&a8 == &no_more_args) goto done; args[n++] = &a8;
270 if (&a9 == &no_more_args) goto done; args[n++] = &a9;
271 if (&a10 == &no_more_args) goto done; args[n++] = &a10;
272 if (&a11 == &no_more_args) goto done; args[n++] = &a11;
273 if (&a12 == &no_more_args) goto done; args[n++] = &a12;
274 if (&a13 == &no_more_args) goto done; args[n++] = &a13;
275 if (&a14 == &no_more_args) goto done; args[n++] = &a14;
276 if (&a15 == &no_more_args) goto done; args[n++] = &a15;
277 done:
278
279 size_t consumed;
280 int vec[kVecSize] = {};
281 if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
282 args, n, vec, kVecSize)) {
283 input->remove_prefix(consumed);
284 return true;
285 } else {
286 return false;
287 }
288 }
289
operator ()(absl::string_view * input,const PCRE & pattern,const Arg & a0,const Arg & a1,const Arg & a2,const Arg & a3,const Arg & a4,const Arg & a5,const Arg & a6,const Arg & a7,const Arg & a8,const Arg & a9,const Arg & a10,const Arg & a11,const Arg & a12,const Arg & a13,const Arg & a14,const Arg & a15) const290 bool PCRE::FindAndConsumeFunctor::operator()(
291 absl::string_view* input, const PCRE& pattern, const Arg& a0, const Arg& a1,
292 const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5, const Arg& a6,
293 const Arg& a7, const Arg& a8, const Arg& a9, const Arg& a10, const Arg& a11,
294 const Arg& a12, const Arg& a13, const Arg& a14, const Arg& a15) const {
295 const Arg* args[kMaxArgs];
296 int n = 0;
297 if (&a0 == &no_more_args) goto done; args[n++] = &a0;
298 if (&a1 == &no_more_args) goto done; args[n++] = &a1;
299 if (&a2 == &no_more_args) goto done; args[n++] = &a2;
300 if (&a3 == &no_more_args) goto done; args[n++] = &a3;
301 if (&a4 == &no_more_args) goto done; args[n++] = &a4;
302 if (&a5 == &no_more_args) goto done; args[n++] = &a5;
303 if (&a6 == &no_more_args) goto done; args[n++] = &a6;
304 if (&a7 == &no_more_args) goto done; args[n++] = &a7;
305 if (&a8 == &no_more_args) goto done; args[n++] = &a8;
306 if (&a9 == &no_more_args) goto done; args[n++] = &a9;
307 if (&a10 == &no_more_args) goto done; args[n++] = &a10;
308 if (&a11 == &no_more_args) goto done; args[n++] = &a11;
309 if (&a12 == &no_more_args) goto done; args[n++] = &a12;
310 if (&a13 == &no_more_args) goto done; args[n++] = &a13;
311 if (&a14 == &no_more_args) goto done; args[n++] = &a14;
312 if (&a15 == &no_more_args) goto done; args[n++] = &a15;
313 done:
314
315 size_t consumed;
316 int vec[kVecSize] = {};
317 if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
318 args, n, vec, kVecSize)) {
319 input->remove_prefix(consumed);
320 return true;
321 } else {
322 return false;
323 }
324 }
325
Replace(std::string * str,const PCRE & pattern,absl::string_view rewrite)326 bool PCRE::Replace(std::string* str, const PCRE& pattern,
327 absl::string_view rewrite) {
328 int vec[kVecSize] = {};
329 int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
330 if (matches == 0)
331 return false;
332
333 std::string s;
334 if (!pattern.Rewrite(&s, rewrite, *str, vec, matches))
335 return false;
336
337 assert(vec[0] >= 0);
338 assert(vec[1] >= 0);
339 str->replace(vec[0], vec[1] - vec[0], s);
340 return true;
341 }
342
GlobalReplace(std::string * str,const PCRE & pattern,absl::string_view rewrite)343 int PCRE::GlobalReplace(std::string* str, const PCRE& pattern,
344 absl::string_view rewrite) {
345 int count = 0;
346 int vec[kVecSize] = {};
347 std::string out;
348 size_t start = 0;
349 bool last_match_was_empty_string = false;
350
351 while (start <= str->size()) {
352 // If the previous match was for the empty string, we shouldn't
353 // just match again: we'll match in the same way and get an
354 // infinite loop. Instead, we do the match in a special way:
355 // anchored -- to force another try at the same position --
356 // and with a flag saying that this time, ignore empty matches.
357 // If this special match returns, that means there's a non-empty
358 // match at this position as well, and we can continue. If not,
359 // we do what perl does, and just advance by one.
360 // Notice that perl prints '@@@' for this;
361 // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
362 int matches;
363 if (last_match_was_empty_string) {
364 matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
365 vec, kVecSize);
366 if (matches <= 0) {
367 if (start < str->size())
368 out.push_back((*str)[start]);
369 start++;
370 last_match_was_empty_string = false;
371 continue;
372 }
373 } else {
374 matches = pattern.TryMatch(*str, start, UNANCHORED, true,
375 vec, kVecSize);
376 if (matches <= 0)
377 break;
378 }
379 size_t matchstart = vec[0], matchend = vec[1];
380 assert(matchstart >= start);
381 assert(matchend >= matchstart);
382
383 out.append(*str, start, matchstart - start);
384 pattern.Rewrite(&out, rewrite, *str, vec, matches);
385 start = matchend;
386 count++;
387 last_match_was_empty_string = (matchstart == matchend);
388 }
389
390 if (count == 0)
391 return 0;
392
393 if (start < str->size())
394 out.append(*str, start, str->size() - start);
395 using std::swap;
396 swap(out, *str);
397 return count;
398 }
399
Extract(absl::string_view text,const PCRE & pattern,absl::string_view rewrite,std::string * out)400 bool PCRE::Extract(absl::string_view text, const PCRE& pattern,
401 absl::string_view rewrite, std::string* out) {
402 int vec[kVecSize] = {};
403 int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
404 if (matches == 0)
405 return false;
406 out->clear();
407 return pattern.Rewrite(out, rewrite, text, vec, matches);
408 }
409
QuoteMeta(absl::string_view unquoted)410 std::string PCRE::QuoteMeta(absl::string_view unquoted) {
411 std::string result;
412 result.reserve(unquoted.size() << 1);
413
414 // Escape any ascii character not in [A-Za-z_0-9].
415 //
416 // Note that it's legal to escape a character even if it has no
417 // special meaning in a regular expression -- so this function does
418 // that. (This also makes it identical to the perl function of the
419 // same name except for the null-character special case;
420 // see `perldoc -f quotemeta`.)
421 for (size_t ii = 0; ii < unquoted.size(); ++ii) {
422 // Note that using 'isalnum' here raises the benchmark time from
423 // 32ns to 58ns:
424 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
425 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
426 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
427 unquoted[ii] != '_' &&
428 // If this is the part of a UTF8 or Latin1 character, we need
429 // to copy this byte without escaping. Experimentally this is
430 // what works correctly with the regexp library.
431 !(unquoted[ii] & 128)) {
432 if (unquoted[ii] == '\0') { // Special handling for null chars.
433 // Can't use "\\0" since the next character might be a digit.
434 result += "\\x00";
435 continue;
436 }
437 result += '\\';
438 }
439 result += unquoted[ii];
440 }
441
442 return result;
443 }
444
445 /***** Actual matching and rewriting code *****/
446
HitLimit()447 bool PCRE::HitLimit() {
448 return hit_limit_ != 0;
449 }
450
ClearHitLimit()451 void PCRE::ClearHitLimit() {
452 hit_limit_ = 0;
453 }
454
TryMatch(absl::string_view text,size_t startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const455 int PCRE::TryMatch(absl::string_view text, size_t startpos, Anchor anchor,
456 bool empty_ok, int* vec, int vecsize) const {
457 pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
458 if (re == NULL) {
459 PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
460 return 0;
461 }
462
463 int match_limit = match_limit_;
464 if (match_limit <= 0) {
465 match_limit = absl::GetFlag(FLAGS_regexp_match_limit);
466 }
467
468 int stack_limit = stack_limit_;
469 if (stack_limit <= 0) {
470 stack_limit = absl::GetFlag(FLAGS_regexp_stack_limit);
471 }
472
473 pcre_extra extra = { 0 };
474 if (match_limit > 0) {
475 extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
476 extra.match_limit = match_limit;
477 }
478 if (stack_limit > 0) {
479 extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
480 extra.match_limit_recursion = stack_limit / kPCREFrameSize;
481 }
482
483 int options = 0;
484 if (anchor != UNANCHORED)
485 options |= PCRE_ANCHORED;
486 if (!empty_ok)
487 options |= PCRE_NOTEMPTY;
488
489 int rc = pcre_exec(re, // The regular expression object
490 &extra,
491 (text.data() == NULL) ? "" : text.data(),
492 static_cast<int>(text.size()),
493 static_cast<int>(startpos),
494 options,
495 vec,
496 vecsize);
497
498 // Handle errors
499 if (rc == 0) {
500 // pcre_exec() returns 0 as a special case when the number of
501 // capturing subpatterns exceeds the size of the vector.
502 // When this happens, there is a match and the output vector
503 // is filled, but we miss out on the positions of the extra subpatterns.
504 rc = vecsize / 2;
505 } else if (rc < 0) {
506 switch (rc) {
507 case PCRE_ERROR_NOMATCH:
508 return 0;
509 case PCRE_ERROR_MATCHLIMIT:
510 // Writing to hit_limit is not safe if multiple threads
511 // are using the PCRE, but the flag is only intended
512 // for use by unit tests anyway, so we let it go.
513 hit_limit_ = true;
514 PCREPORT(WARNING) << "Exceeded match limit of " << match_limit
515 << " when matching '" << pattern_ << "'"
516 << " against text that is " << text.size() << " bytes.";
517 return 0;
518 case PCRE_ERROR_RECURSIONLIMIT:
519 // See comment about hit_limit above.
520 hit_limit_ = true;
521 PCREPORT(WARNING) << "Exceeded stack limit of " << stack_limit
522 << " when matching '" << pattern_ << "'"
523 << " against text that is " << text.size() << " bytes.";
524 return 0;
525 default:
526 // There are other return codes from pcre.h :
527 // PCRE_ERROR_NULL (-2)
528 // PCRE_ERROR_BADOPTION (-3)
529 // PCRE_ERROR_BADMAGIC (-4)
530 // PCRE_ERROR_UNKNOWN_NODE (-5)
531 // PCRE_ERROR_NOMEMORY (-6)
532 // PCRE_ERROR_NOSUBSTRING (-7)
533 // ...
534 PCREPORT(ERROR) << "Unexpected return code: " << rc
535 << " when matching '" << pattern_ << "'"
536 << ", re=" << re
537 << ", text=" << text
538 << ", vec=" << vec
539 << ", vecsize=" << vecsize;
540 return 0;
541 }
542 }
543
544 return rc;
545 }
546
DoMatchImpl(absl::string_view text,Anchor anchor,size_t * consumed,const Arg * const * args,int n,int * vec,int vecsize) const547 bool PCRE::DoMatchImpl(absl::string_view text, Anchor anchor, size_t* consumed,
548 const Arg* const* args, int n, int* vec,
549 int vecsize) const {
550 assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
551 if (NumberOfCapturingGroups() < n) {
552 // RE has fewer capturing groups than number of Arg pointers passed in.
553 return false;
554 }
555
556 int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
557 assert(matches >= 0); // TryMatch never returns negatives
558 if (matches == 0)
559 return false;
560
561 *consumed = vec[1];
562
563 if (n == 0 || args == NULL) {
564 // We are not interested in results
565 return true;
566 }
567
568 // If we got here, we must have matched the whole pattern.
569 // We do not need (can not do) any more checks on the value of 'matches' here
570 // -- see the comment for TryMatch.
571 for (int i = 0; i < n; i++) {
572 const int start = vec[2*(i+1)];
573 const int limit = vec[2*(i+1)+1];
574
575 // Avoid invoking undefined behavior when text.data() happens
576 // to be null and start happens to be -1, the latter being the
577 // case for an unmatched subexpression. Even if text.data() is
578 // not null, pointing one byte before was a longstanding bug.
579 const char* addr = NULL;
580 if (start != -1) {
581 addr = text.data() + start;
582 }
583
584 if (!args[i]->Parse(addr, limit-start)) {
585 // TODO: Should we indicate what the error was?
586 return false;
587 }
588 }
589
590 return true;
591 }
592
DoMatch(absl::string_view text,Anchor anchor,size_t * consumed,const Arg * const args[],int n) const593 bool PCRE::DoMatch(absl::string_view text, Anchor anchor, size_t* consumed,
594 const Arg* const args[], int n) const {
595 assert(n >= 0);
596 const int vecsize = (1 + n) * 3; // results + PCRE workspace
597 // (as for kVecSize)
598 int* vec = new int[vecsize];
599 bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
600 delete[] vec;
601 return b;
602 }
603
Rewrite(std::string * out,absl::string_view rewrite,absl::string_view text,int * vec,int veclen) const604 bool PCRE::Rewrite(std::string* out, absl::string_view rewrite,
605 absl::string_view text, int* vec, int veclen) const {
606 int number_of_capturing_groups = NumberOfCapturingGroups();
607 for (const char *s = rewrite.data(), *end = s + rewrite.size();
608 s < end; s++) {
609 int c = *s;
610 if (c == '\\') {
611 c = *++s;
612 if (isdigit(c)) {
613 int n = (c - '0');
614 if (n >= veclen) {
615 if (n <= number_of_capturing_groups) {
616 // unmatched optional capturing group. treat
617 // its value as empty string; i.e., nothing to append.
618 } else {
619 PCREPORT(ERROR) << "requested group " << n
620 << " in regexp " << rewrite.data();
621 return false;
622 }
623 }
624 int start = vec[2 * n];
625 if (start >= 0)
626 out->append(text.data() + start, vec[2 * n + 1] - start);
627 } else if (c == '\\') {
628 out->push_back('\\');
629 } else {
630 PCREPORT(ERROR) << "invalid rewrite pattern: " << rewrite.data();
631 return false;
632 }
633 } else {
634 out->push_back(c);
635 }
636 }
637 return true;
638 }
639
CheckRewriteString(absl::string_view rewrite,std::string * error) const640 bool PCRE::CheckRewriteString(absl::string_view rewrite,
641 std::string* error) const {
642 int max_token = -1;
643 for (const char *s = rewrite.data(), *end = s + rewrite.size();
644 s < end; s++) {
645 int c = *s;
646 if (c != '\\') {
647 continue;
648 }
649 if (++s == end) {
650 *error = "Rewrite schema error: '\\' not allowed at end.";
651 return false;
652 }
653 c = *s;
654 if (c == '\\') {
655 continue;
656 }
657 if (!isdigit(c)) {
658 *error = "Rewrite schema error: "
659 "'\\' must be followed by a digit or '\\'.";
660 return false;
661 }
662 int n = (c - '0');
663 if (max_token < n) {
664 max_token = n;
665 }
666 }
667
668 if (max_token > NumberOfCapturingGroups()) {
669 *error = absl::StrFormat(
670 "Rewrite schema requests %d matches, but the regexp only has %d "
671 "parenthesized subexpressions.",
672 max_token, NumberOfCapturingGroups());
673 return false;
674 }
675 return true;
676 }
677
678 // Return the number of capturing subpatterns, or -1 if the
679 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const680 int PCRE::NumberOfCapturingGroups() const {
681 if (re_partial_ == NULL) return -1;
682
683 int result;
684 int rc = pcre_fullinfo(re_partial_, // The regular expression object
685 NULL, // We did not study the pattern
686 PCRE_INFO_CAPTURECOUNT,
687 &result);
688 if (rc != 0) {
689 PCREPORT(ERROR) << "Unexpected return code: " << rc;
690 return -1;
691 }
692 return result;
693 }
694
695
696 /***** Parsers for various types *****/
697
parse_null(const char * str,size_t n,void * dest)698 bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
699 // We fail if somebody asked us to store into a non-NULL void* pointer
700 return (dest == NULL);
701 }
702
parse_string(const char * str,size_t n,void * dest)703 bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
704 if (dest == NULL) return true;
705 reinterpret_cast<std::string*>(dest)->assign(str, n);
706 return true;
707 }
708
parse_string_view(const char * str,size_t n,void * dest)709 bool PCRE::Arg::parse_string_view(const char* str, size_t n, void* dest) {
710 if (dest == NULL) return true;
711 *(reinterpret_cast<absl::string_view*>(dest)) = absl::string_view(str, n);
712 return true;
713 }
714
parse_char(const char * str,size_t n,void * dest)715 bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
716 if (n != 1) return false;
717 if (dest == NULL) return true;
718 *(reinterpret_cast<char*>(dest)) = str[0];
719 return true;
720 }
721
parse_schar(const char * str,size_t n,void * dest)722 bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
723 if (n != 1) return false;
724 if (dest == NULL) return true;
725 *(reinterpret_cast<signed char*>(dest)) = str[0];
726 return true;
727 }
728
parse_uchar(const char * str,size_t n,void * dest)729 bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
730 if (n != 1) return false;
731 if (dest == NULL) return true;
732 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
733 return true;
734 }
735
736 // Largest number spec that we are willing to parse
737 static const int kMaxNumberLength = 32;
738
739 // PCREQUIPCRES "buf" must have length at least kMaxNumberLength+1
740 // PCREQUIPCRES "n > 0"
741 // Copies "str" into "buf" and null-terminates if necessary.
742 // Returns one of:
743 // a. "str" if no termination is needed
744 // b. "buf" if the string was copied and null-terminated
745 // c. "" if the input was invalid and has no hope of being parsed
TerminateNumber(char * buf,const char * str,size_t n)746 static const char* TerminateNumber(char* buf, const char* str, size_t n) {
747 if ((n > 0) && isspace(*str)) {
748 // We are less forgiving than the strtoxxx() routines and do not
749 // allow leading spaces.
750 return "";
751 }
752
753 // See if the character right after the input text may potentially
754 // look like a digit.
755 if (isdigit(str[n]) ||
756 ((str[n] >= 'a') && (str[n] <= 'f')) ||
757 ((str[n] >= 'A') && (str[n] <= 'F'))) {
758 if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
759 memcpy(buf, str, n);
760 buf[n] = '\0';
761 return buf;
762 } else {
763 // We can parse right out of the supplied string, so return it.
764 return str;
765 }
766 }
767
parse_long_radix(const char * str,size_t n,void * dest,int radix)768 bool PCRE::Arg::parse_long_radix(const char* str,
769 size_t n,
770 void* dest,
771 int radix) {
772 if (n == 0) return false;
773 char buf[kMaxNumberLength+1];
774 str = TerminateNumber(buf, str, n);
775 char* end;
776 errno = 0;
777 long r = strtol(str, &end, radix);
778 if (end != str + n) return false; // Leftover junk
779 if (errno) return false;
780 if (dest == NULL) return true;
781 *(reinterpret_cast<long*>(dest)) = r;
782 return true;
783 }
784
parse_ulong_radix(const char * str,size_t n,void * dest,int radix)785 bool PCRE::Arg::parse_ulong_radix(const char* str,
786 size_t n,
787 void* dest,
788 int radix) {
789 if (n == 0) return false;
790 char buf[kMaxNumberLength+1];
791 str = TerminateNumber(buf, str, n);
792 if (str[0] == '-') {
793 // strtoul() will silently accept negative numbers and parse
794 // them. This module is more strict and treats them as errors.
795 return false;
796 }
797
798 char* end;
799 errno = 0;
800 unsigned long r = strtoul(str, &end, radix);
801 if (end != str + n) return false; // Leftover junk
802 if (errno) return false;
803 if (dest == NULL) return true;
804 *(reinterpret_cast<unsigned long*>(dest)) = r;
805 return true;
806 }
807
parse_short_radix(const char * str,size_t n,void * dest,int radix)808 bool PCRE::Arg::parse_short_radix(const char* str,
809 size_t n,
810 void* dest,
811 int radix) {
812 long r;
813 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
814 if ((short)r != r) return false; // Out of range
815 if (dest == NULL) return true;
816 *(reinterpret_cast<short*>(dest)) = (short)r;
817 return true;
818 }
819
parse_ushort_radix(const char * str,size_t n,void * dest,int radix)820 bool PCRE::Arg::parse_ushort_radix(const char* str,
821 size_t n,
822 void* dest,
823 int radix) {
824 unsigned long r;
825 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
826 if ((unsigned short)r != r) return false; // Out of range
827 if (dest == NULL) return true;
828 *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
829 return true;
830 }
831
parse_int_radix(const char * str,size_t n,void * dest,int radix)832 bool PCRE::Arg::parse_int_radix(const char* str,
833 size_t n,
834 void* dest,
835 int radix) {
836 long r;
837 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
838 if ((int)r != r) return false; // Out of range
839 if (dest == NULL) return true;
840 *(reinterpret_cast<int*>(dest)) = (int)r;
841 return true;
842 }
843
parse_uint_radix(const char * str,size_t n,void * dest,int radix)844 bool PCRE::Arg::parse_uint_radix(const char* str,
845 size_t n,
846 void* dest,
847 int radix) {
848 unsigned long r;
849 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
850 if ((unsigned int)r != r) return false; // Out of range
851 if (dest == NULL) return true;
852 *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
853 return true;
854 }
855
parse_longlong_radix(const char * str,size_t n,void * dest,int radix)856 bool PCRE::Arg::parse_longlong_radix(const char* str,
857 size_t n,
858 void* dest,
859 int radix) {
860 if (n == 0) return false;
861 char buf[kMaxNumberLength+1];
862 str = TerminateNumber(buf, str, n);
863 char* end;
864 errno = 0;
865 long long r = strtoll(str, &end, radix);
866 if (end != str + n) return false; // Leftover junk
867 if (errno) return false;
868 if (dest == NULL) return true;
869 *(reinterpret_cast<long long*>(dest)) = r;
870 return true;
871 }
872
parse_ulonglong_radix(const char * str,size_t n,void * dest,int radix)873 bool PCRE::Arg::parse_ulonglong_radix(const char* str,
874 size_t n,
875 void* dest,
876 int radix) {
877 if (n == 0) return false;
878 char buf[kMaxNumberLength+1];
879 str = TerminateNumber(buf, str, n);
880 if (str[0] == '-') {
881 // strtoull() will silently accept negative numbers and parse
882 // them. This module is more strict and treats them as errors.
883 return false;
884 }
885 char* end;
886 errno = 0;
887 unsigned long long r = strtoull(str, &end, radix);
888 if (end != str + n) return false; // Leftover junk
889 if (errno) return false;
890 if (dest == NULL) return true;
891 *(reinterpret_cast<unsigned long long*>(dest)) = r;
892 return true;
893 }
894
parse_double_float(const char * str,size_t n,bool isfloat,void * dest)895 static bool parse_double_float(const char* str, size_t n, bool isfloat,
896 void* dest) {
897 if (n == 0) return false;
898 static const int kMaxLength = 200;
899 char buf[kMaxLength];
900 if (n >= kMaxLength) return false;
901 memcpy(buf, str, n);
902 buf[n] = '\0';
903 char* end;
904 errno = 0;
905 double r;
906 if (isfloat) {
907 r = strtof(buf, &end);
908 } else {
909 r = strtod(buf, &end);
910 }
911 if (end != buf + n) return false; // Leftover junk
912 if (errno) return false;
913 if (dest == NULL) return true;
914 if (isfloat) {
915 *(reinterpret_cast<float*>(dest)) = (float)r;
916 } else {
917 *(reinterpret_cast<double*>(dest)) = r;
918 }
919 return true;
920 }
921
parse_double(const char * str,size_t n,void * dest)922 bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
923 return parse_double_float(str, n, false, dest);
924 }
925
parse_float(const char * str,size_t n,void * dest)926 bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
927 return parse_double_float(str, n, true, dest);
928 }
929
930 #define DEFINE_INTEGER_PARSER(name) \
931 bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
932 return parse_##name##_radix(str, n, dest, 10); \
933 } \
934 bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
935 return parse_##name##_radix(str, n, dest, 16); \
936 } \
937 bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
938 void* dest) { \
939 return parse_##name##_radix(str, n, dest, 8); \
940 } \
941 bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
942 void* dest) { \
943 return parse_##name##_radix(str, n, dest, 0); \
944 }
945
946 DEFINE_INTEGER_PARSER(short);
947 DEFINE_INTEGER_PARSER(ushort);
948 DEFINE_INTEGER_PARSER(int);
949 DEFINE_INTEGER_PARSER(uint);
950 DEFINE_INTEGER_PARSER(long);
951 DEFINE_INTEGER_PARSER(ulong);
952 DEFINE_INTEGER_PARSER(longlong);
953 DEFINE_INTEGER_PARSER(ulonglong);
954
955 #undef DEFINE_INTEGER_PARSER
956
957 } // namespace re2
958