1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #ifdef _MSC_VER
16 #include <intrin.h>
17 #endif
18 #include <stdint.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <algorithm>
22 #include <atomic>
23 #include <iterator>
24 #include <mutex>
25 #include <string>
26 #include <utility>
27 #include <vector>
28
29 #include "util/util.h"
30 #include "util/logging.h"
31 #include "util/strutil.h"
32 #include "util/utf.h"
33 #include "re2/prog.h"
34 #include "re2/regexp.h"
35 #include "re2/sparse_array.h"
36
37 namespace re2 {
38
39 // Maximum number of args we can set
40 static const int kMaxArgs = 16;
41 static const int kVecSize = 1+kMaxArgs;
42
43 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
44
Options(RE2::CannedOptions opt)45 RE2::Options::Options(RE2::CannedOptions opt)
46 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
47 posix_syntax_(opt == RE2::POSIX),
48 longest_match_(opt == RE2::POSIX),
49 log_errors_(opt != RE2::Quiet),
50 max_mem_(kDefaultMaxMem),
51 literal_(false),
52 never_nl_(false),
53 dot_nl_(false),
54 never_capture_(false),
55 case_sensitive_(true),
56 perl_classes_(false),
57 word_boundary_(false),
58 one_line_(false) {
59 }
60
61 // static empty objects for use as const references.
62 // To avoid global constructors, allocated in RE2::Init().
63 static const std::string* empty_string;
64 static const std::map<std::string, int>* empty_named_groups;
65 static const std::map<int, std::string>* empty_group_names;
66
67 // Converts from Regexp error code to RE2 error code.
68 // Maybe some day they will diverge. In any event, this
69 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)70 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
71 switch (code) {
72 case re2::kRegexpSuccess:
73 return RE2::NoError;
74 case re2::kRegexpInternalError:
75 return RE2::ErrorInternal;
76 case re2::kRegexpBadEscape:
77 return RE2::ErrorBadEscape;
78 case re2::kRegexpBadCharClass:
79 return RE2::ErrorBadCharClass;
80 case re2::kRegexpBadCharRange:
81 return RE2::ErrorBadCharRange;
82 case re2::kRegexpMissingBracket:
83 return RE2::ErrorMissingBracket;
84 case re2::kRegexpMissingParen:
85 return RE2::ErrorMissingParen;
86 case re2::kRegexpTrailingBackslash:
87 return RE2::ErrorTrailingBackslash;
88 case re2::kRegexpRepeatArgument:
89 return RE2::ErrorRepeatArgument;
90 case re2::kRegexpRepeatSize:
91 return RE2::ErrorRepeatSize;
92 case re2::kRegexpRepeatOp:
93 return RE2::ErrorRepeatOp;
94 case re2::kRegexpBadPerlOp:
95 return RE2::ErrorBadPerlOp;
96 case re2::kRegexpBadUTF8:
97 return RE2::ErrorBadUTF8;
98 case re2::kRegexpBadNamedCapture:
99 return RE2::ErrorBadNamedCapture;
100 }
101 return RE2::ErrorInternal;
102 }
103
trunc(const StringPiece & pattern)104 static std::string trunc(const StringPiece& pattern) {
105 if (pattern.size() < 100)
106 return std::string(pattern);
107 return std::string(pattern.substr(0, 100)) + "...";
108 }
109
110
RE2(const char * pattern)111 RE2::RE2(const char* pattern) {
112 Init(pattern, DefaultOptions);
113 }
114
RE2(const std::string & pattern)115 RE2::RE2(const std::string& pattern) {
116 Init(pattern, DefaultOptions);
117 }
118
RE2(const StringPiece & pattern)119 RE2::RE2(const StringPiece& pattern) {
120 Init(pattern, DefaultOptions);
121 }
122
RE2(const StringPiece & pattern,const Options & options)123 RE2::RE2(const StringPiece& pattern, const Options& options) {
124 Init(pattern, options);
125 }
126
ParseFlags() const127 int RE2::Options::ParseFlags() const {
128 int flags = Regexp::ClassNL;
129 switch (encoding()) {
130 default:
131 if (log_errors())
132 LOG(ERROR) << "Unknown encoding " << encoding();
133 break;
134 case RE2::Options::EncodingUTF8:
135 break;
136 case RE2::Options::EncodingLatin1:
137 flags |= Regexp::Latin1;
138 break;
139 }
140
141 if (!posix_syntax())
142 flags |= Regexp::LikePerl;
143
144 if (literal())
145 flags |= Regexp::Literal;
146
147 if (never_nl())
148 flags |= Regexp::NeverNL;
149
150 if (dot_nl())
151 flags |= Regexp::DotNL;
152
153 if (never_capture())
154 flags |= Regexp::NeverCapture;
155
156 if (!case_sensitive())
157 flags |= Regexp::FoldCase;
158
159 if (perl_classes())
160 flags |= Regexp::PerlClasses;
161
162 if (word_boundary())
163 flags |= Regexp::PerlB;
164
165 if (one_line())
166 flags |= Regexp::OneLine;
167
168 return flags;
169 }
170
Init(const StringPiece & pattern,const Options & options)171 void RE2::Init(const StringPiece& pattern, const Options& options) {
172 static std::once_flag empty_once;
173 std::call_once(empty_once, []() {
174 empty_string = new std::string;
175 empty_named_groups = new std::map<std::string, int>;
176 empty_group_names = new std::map<int, std::string>;
177 });
178
179 pattern_.assign(pattern.data(), pattern.size());
180 options_.Copy(options);
181 entire_regexp_ = NULL;
182 error_ = empty_string;
183 error_code_ = NoError;
184 error_arg_.clear();
185 prefix_.clear();
186 prefix_foldcase_ = false;
187 suffix_regexp_ = NULL;
188 prog_ = NULL;
189 num_captures_ = -1;
190 is_one_pass_ = false;
191
192 rprog_ = NULL;
193 named_groups_ = NULL;
194 group_names_ = NULL;
195
196 RegexpStatus status;
197 entire_regexp_ = Regexp::Parse(
198 pattern_,
199 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
200 &status);
201 if (entire_regexp_ == NULL) {
202 if (options_.log_errors()) {
203 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
204 << status.Text();
205 }
206 error_ = new std::string(status.Text());
207 error_code_ = RegexpErrorToRE2(status.code());
208 error_arg_ = std::string(status.error_arg());
209 return;
210 }
211
212 re2::Regexp* suffix;
213 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
214 suffix_regexp_ = suffix;
215 else
216 suffix_regexp_ = entire_regexp_->Incref();
217
218 // Two thirds of the memory goes to the forward Prog,
219 // one third to the reverse prog, because the forward
220 // Prog has two DFAs but the reverse prog has one.
221 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
222 if (prog_ == NULL) {
223 if (options_.log_errors())
224 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
225 error_ = new std::string("pattern too large - compile failed");
226 error_code_ = RE2::ErrorPatternTooLarge;
227 return;
228 }
229
230 // We used to compute this lazily, but it's used during the
231 // typical control flow for a match call, so we now compute
232 // it eagerly, which avoids the overhead of std::once_flag.
233 num_captures_ = suffix_regexp_->NumCaptures();
234
235 // Could delay this until the first match call that
236 // cares about submatch information, but the one-pass
237 // machine's memory gets cut from the DFA memory budget,
238 // and that is harder to do if the DFA has already
239 // been built.
240 is_one_pass_ = prog_->IsOnePass();
241 }
242
243 // Returns rprog_, computing it if needed.
ReverseProg() const244 re2::Prog* RE2::ReverseProg() const {
245 std::call_once(rprog_once_, [](const RE2* re) {
246 re->rprog_ =
247 re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
248 if (re->rprog_ == NULL) {
249 if (re->options_.log_errors())
250 LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
251 // We no longer touch error_ and error_code_ because failing to compile
252 // the reverse Prog is not a showstopper: falling back to NFA execution
253 // is fine. More importantly, an RE2 object is supposed to be logically
254 // immutable: whatever ok() would have returned after Init() completed,
255 // it should continue to return that no matter what ReverseProg() does.
256 }
257 }, this);
258 return rprog_;
259 }
260
~RE2()261 RE2::~RE2() {
262 if (suffix_regexp_)
263 suffix_regexp_->Decref();
264 if (entire_regexp_)
265 entire_regexp_->Decref();
266 delete prog_;
267 delete rprog_;
268 if (error_ != empty_string)
269 delete error_;
270 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
271 delete named_groups_;
272 if (group_names_ != NULL && group_names_ != empty_group_names)
273 delete group_names_;
274 }
275
ProgramSize() const276 int RE2::ProgramSize() const {
277 if (prog_ == NULL)
278 return -1;
279 return prog_->size();
280 }
281
ReverseProgramSize() const282 int RE2::ReverseProgramSize() const {
283 if (prog_ == NULL)
284 return -1;
285 Prog* prog = ReverseProg();
286 if (prog == NULL)
287 return -1;
288 return prog->size();
289 }
290
291 // Finds the most significant non-zero bit in n.
FindMSBSet(uint32_t n)292 static int FindMSBSet(uint32_t n) {
293 DCHECK_NE(n, 0);
294 #if defined(__GNUC__)
295 return 31 ^ __builtin_clz(n);
296 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
297 unsigned long c;
298 _BitScanReverse(&c, n);
299 return static_cast<int>(c);
300 #else
301 int c = 0;
302 for (int shift = 1 << 4; shift != 0; shift >>= 1) {
303 uint32_t word = n >> shift;
304 if (word != 0) {
305 n = word;
306 c += shift;
307 }
308 }
309 return c;
310 #endif
311 }
312
Fanout(Prog * prog,std::vector<int> * histogram)313 static int Fanout(Prog* prog, std::vector<int>* histogram) {
314 SparseArray<int> fanout(prog->size());
315 prog->Fanout(&fanout);
316 int data[32] = {};
317 int size = 0;
318 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
319 if (i->value() == 0)
320 continue;
321 uint32_t value = i->value();
322 int bucket = FindMSBSet(value);
323 bucket += value & (value-1) ? 1 : 0;
324 ++data[bucket];
325 size = std::max(size, bucket+1);
326 }
327 if (histogram != NULL)
328 histogram->assign(data, data+size);
329 return size-1;
330 }
331
ProgramFanout(std::vector<int> * histogram) const332 int RE2::ProgramFanout(std::vector<int>* histogram) const {
333 if (prog_ == NULL)
334 return -1;
335 return Fanout(prog_, histogram);
336 }
337
ReverseProgramFanout(std::vector<int> * histogram) const338 int RE2::ReverseProgramFanout(std::vector<int>* histogram) const {
339 if (prog_ == NULL)
340 return -1;
341 Prog* prog = ReverseProg();
342 if (prog == NULL)
343 return -1;
344 return Fanout(prog, histogram);
345 }
346
347 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const348 const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
349 std::call_once(named_groups_once_, [](const RE2* re) {
350 if (re->suffix_regexp_ != NULL)
351 re->named_groups_ = re->suffix_regexp_->NamedCaptures();
352 if (re->named_groups_ == NULL)
353 re->named_groups_ = empty_named_groups;
354 }, this);
355 return *named_groups_;
356 }
357
358 // Returns group_names_, computing it if needed.
CapturingGroupNames() const359 const std::map<int, std::string>& RE2::CapturingGroupNames() const {
360 std::call_once(group_names_once_, [](const RE2* re) {
361 if (re->suffix_regexp_ != NULL)
362 re->group_names_ = re->suffix_regexp_->CaptureNames();
363 if (re->group_names_ == NULL)
364 re->group_names_ = empty_group_names;
365 }, this);
366 return *group_names_;
367 }
368
369 /***** Convenience interfaces *****/
370
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)371 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
372 const Arg* const args[], int n) {
373 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
374 }
375
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)376 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
377 const Arg* const args[], int n) {
378 return re.DoMatch(text, UNANCHORED, NULL, args, n);
379 }
380
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)381 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
382 const Arg* const args[], int n) {
383 size_t consumed;
384 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
385 input->remove_prefix(consumed);
386 return true;
387 } else {
388 return false;
389 }
390 }
391
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)392 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
393 const Arg* const args[], int n) {
394 size_t consumed;
395 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
396 input->remove_prefix(consumed);
397 return true;
398 } else {
399 return false;
400 }
401 }
402
Replace(std::string * str,const RE2 & re,const StringPiece & rewrite)403 bool RE2::Replace(std::string* str,
404 const RE2& re,
405 const StringPiece& rewrite) {
406 StringPiece vec[kVecSize];
407 int nvec = 1 + MaxSubmatch(rewrite);
408 if (nvec > 1 + re.NumberOfCapturingGroups())
409 return false;
410 if (nvec > static_cast<int>(arraysize(vec)))
411 return false;
412 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
413 return false;
414
415 std::string s;
416 if (!re.Rewrite(&s, rewrite, vec, nvec))
417 return false;
418
419 assert(vec[0].data() >= str->data());
420 assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
421 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
422 return true;
423 }
424
GlobalReplace(std::string * str,const RE2 & re,const StringPiece & rewrite)425 int RE2::GlobalReplace(std::string* str,
426 const RE2& re,
427 const StringPiece& rewrite) {
428 StringPiece vec[kVecSize];
429 int nvec = 1 + MaxSubmatch(rewrite);
430 if (nvec > 1 + re.NumberOfCapturingGroups())
431 return false;
432 if (nvec > static_cast<int>(arraysize(vec)))
433 return false;
434
435 const char* p = str->data();
436 const char* ep = p + str->size();
437 const char* lastend = NULL;
438 std::string out;
439 int count = 0;
440 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
441 // Iterate just once when fuzzing. Otherwise, we easily get bogged down
442 // and coverage is unlikely to improve despite significant expense.
443 while (p == str->data()) {
444 #else
445 while (p <= ep) {
446 #endif
447 if (!re.Match(*str, static_cast<size_t>(p - str->data()),
448 str->size(), UNANCHORED, vec, nvec))
449 break;
450 if (p < vec[0].data())
451 out.append(p, vec[0].data() - p);
452 if (vec[0].data() == lastend && vec[0].empty()) {
453 // Disallow empty match at end of last match: skip ahead.
454 //
455 // fullrune() takes int, not ptrdiff_t. However, it just looks
456 // at the leading byte and treats any length >= 4 the same.
457 if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
458 fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
459 // re is in UTF-8 mode and there is enough left of str
460 // to allow us to advance by up to UTFmax bytes.
461 Rune r;
462 int n = chartorune(&r, p);
463 // Some copies of chartorune have a bug that accepts
464 // encodings of values in (10FFFF, 1FFFFF] as valid.
465 if (r > Runemax) {
466 n = 1;
467 r = Runeerror;
468 }
469 if (!(n == 1 && r == Runeerror)) { // no decoding error
470 out.append(p, n);
471 p += n;
472 continue;
473 }
474 }
475 // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
476 // we fell through from above and the GIGO principle applies.
477 if (p < ep)
478 out.append(p, 1);
479 p++;
480 continue;
481 }
482 re.Rewrite(&out, rewrite, vec, nvec);
483 p = vec[0].data() + vec[0].size();
484 lastend = p;
485 count++;
486 }
487
488 if (count == 0)
489 return 0;
490
491 if (p < ep)
492 out.append(p, ep - p);
493 using std::swap;
494 swap(out, *str);
495 return count;
496 }
497
498 bool RE2::Extract(const StringPiece& text,
499 const RE2& re,
500 const StringPiece& rewrite,
501 std::string* out) {
502 StringPiece vec[kVecSize];
503 int nvec = 1 + MaxSubmatch(rewrite);
504 if (nvec > 1 + re.NumberOfCapturingGroups())
505 return false;
506 if (nvec > static_cast<int>(arraysize(vec)))
507 return false;
508 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
509 return false;
510
511 out->clear();
512 return re.Rewrite(out, rewrite, vec, nvec);
513 }
514
515 std::string RE2::QuoteMeta(const StringPiece& unquoted) {
516 std::string result;
517 result.reserve(unquoted.size() << 1);
518
519 // Escape any ascii character not in [A-Za-z_0-9].
520 //
521 // Note that it's legal to escape a character even if it has no
522 // special meaning in a regular expression -- so this function does
523 // that. (This also makes it identical to the perl function of the
524 // same name except for the null-character special case;
525 // see `perldoc -f quotemeta`.)
526 for (size_t ii = 0; ii < unquoted.size(); ++ii) {
527 // Note that using 'isalnum' here raises the benchmark time from
528 // 32ns to 58ns:
529 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
530 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
531 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
532 unquoted[ii] != '_' &&
533 // If this is the part of a UTF8 or Latin1 character, we need
534 // to copy this byte without escaping. Experimentally this is
535 // what works correctly with the regexp library.
536 !(unquoted[ii] & 128)) {
537 if (unquoted[ii] == '\0') { // Special handling for null chars.
538 // Note that this special handling is not strictly required for RE2,
539 // but this quoting is required for other regexp libraries such as
540 // PCRE.
541 // Can't use "\\0" since the next character might be a digit.
542 result += "\\x00";
543 continue;
544 }
545 result += '\\';
546 }
547 result += unquoted[ii];
548 }
549
550 return result;
551 }
552
553 bool RE2::PossibleMatchRange(std::string* min, std::string* max,
554 int maxlen) const {
555 if (prog_ == NULL)
556 return false;
557
558 int n = static_cast<int>(prefix_.size());
559 if (n > maxlen)
560 n = maxlen;
561
562 // Determine initial min max from prefix_ literal.
563 *min = prefix_.substr(0, n);
564 *max = prefix_.substr(0, n);
565 if (prefix_foldcase_) {
566 // prefix is ASCII lowercase; change *min to uppercase.
567 for (int i = 0; i < n; i++) {
568 char& c = (*min)[i];
569 if ('a' <= c && c <= 'z')
570 c += 'A' - 'a';
571 }
572 }
573
574 // Add to prefix min max using PossibleMatchRange on regexp.
575 std::string dmin, dmax;
576 maxlen -= n;
577 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
578 min->append(dmin);
579 max->append(dmax);
580 } else if (!max->empty()) {
581 // prog_->PossibleMatchRange has failed us,
582 // but we still have useful information from prefix_.
583 // Round up *max to allow any possible suffix.
584 PrefixSuccessor(max);
585 } else {
586 // Nothing useful.
587 *min = "";
588 *max = "";
589 return false;
590 }
591
592 return true;
593 }
594
595 // Avoid possible locale nonsense in standard strcasecmp.
596 // The string a is known to be all lowercase.
597 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
598 const char* ae = a + len;
599
600 for (; a < ae; a++, b++) {
601 uint8_t x = *a;
602 uint8_t y = *b;
603 if ('A' <= y && y <= 'Z')
604 y += 'a' - 'A';
605 if (x != y)
606 return x - y;
607 }
608 return 0;
609 }
610
611
612 /***** Actual matching and rewriting code *****/
613
614 bool RE2::Match(const StringPiece& text,
615 size_t startpos,
616 size_t endpos,
617 Anchor re_anchor,
618 StringPiece* submatch,
619 int nsubmatch) const {
620 if (!ok()) {
621 if (options_.log_errors())
622 LOG(ERROR) << "Invalid RE2: " << *error_;
623 return false;
624 }
625
626 if (startpos > endpos || endpos > text.size()) {
627 if (options_.log_errors())
628 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
629 << "startpos: " << startpos << ", "
630 << "endpos: " << endpos << ", "
631 << "text size: " << text.size() << "]";
632 return false;
633 }
634
635 StringPiece subtext = text;
636 subtext.remove_prefix(startpos);
637 subtext.remove_suffix(text.size() - endpos);
638
639 // Use DFAs to find exact location of match, filter out non-matches.
640
641 // Don't ask for the location if we won't use it.
642 // SearchDFA can do extra optimizations in that case.
643 StringPiece match;
644 StringPiece* matchp = &match;
645 if (nsubmatch == 0)
646 matchp = NULL;
647
648 int ncap = 1 + NumberOfCapturingGroups();
649 if (ncap > nsubmatch)
650 ncap = nsubmatch;
651
652 // If the regexp is anchored explicitly, must not be in middle of text.
653 if (prog_->anchor_start() && startpos != 0)
654 return false;
655 if (prog_->anchor_end() && endpos != text.size())
656 return false;
657
658 // If the regexp is anchored explicitly, update re_anchor
659 // so that we can potentially fall into a faster case below.
660 if (prog_->anchor_start() && prog_->anchor_end())
661 re_anchor = ANCHOR_BOTH;
662 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
663 re_anchor = ANCHOR_START;
664
665 // Check for the required prefix, if any.
666 size_t prefixlen = 0;
667 if (!prefix_.empty()) {
668 if (startpos != 0)
669 return false;
670 prefixlen = prefix_.size();
671 if (prefixlen > subtext.size())
672 return false;
673 if (prefix_foldcase_) {
674 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
675 return false;
676 } else {
677 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
678 return false;
679 }
680 subtext.remove_prefix(prefixlen);
681 // If there is a required prefix, the anchor must be at least ANCHOR_START.
682 if (re_anchor != ANCHOR_BOTH)
683 re_anchor = ANCHOR_START;
684 }
685
686 Prog::Anchor anchor = Prog::kUnanchored;
687 Prog::MatchKind kind = Prog::kFirstMatch;
688 if (options_.longest_match())
689 kind = Prog::kLongestMatch;
690
691 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
692
693 // BitState allocates a bitmap of size prog_->list_count() * text.size().
694 // It also allocates a stack of 3-word structures which could potentially
695 // grow as large as prog_->list_count() * text.size(), but in practice is
696 // much smaller.
697 const int kMaxBitStateBitmapSize = 256*1024; // bitmap size <= max (bits)
698 bool can_bit_state = prog_->CanBitState();
699 size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
700
701 #ifdef RE2_HAVE_THREAD_LOCAL
702 hooks::context = this;
703 #endif
704 bool dfa_failed = false;
705 bool skipped_test = false;
706 switch (re_anchor) {
707 default:
708 LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor;
709 return false;
710
711 case UNANCHORED: {
712 if (prog_->anchor_end()) {
713 // This is a very special case: we don't need the forward DFA because
714 // we already know where the match must end! Instead, the reverse DFA
715 // can say whether there is a match and (optionally) where it starts.
716 Prog* prog = ReverseProg();
717 if (prog == NULL) {
718 // Fall back to NFA below.
719 skipped_test = true;
720 break;
721 }
722 if (!prog->SearchDFA(subtext, text, Prog::kAnchored,
723 Prog::kLongestMatch, matchp, &dfa_failed, NULL)) {
724 if (dfa_failed) {
725 if (options_.log_errors())
726 LOG(ERROR) << "DFA out of memory: "
727 << "pattern length " << pattern_.size() << ", "
728 << "program size " << prog->size() << ", "
729 << "list count " << prog->list_count() << ", "
730 << "bytemap range " << prog->bytemap_range();
731 // Fall back to NFA below.
732 skipped_test = true;
733 break;
734 }
735 return false;
736 }
737 if (matchp == NULL) // Matched. Don't care where.
738 return true;
739 break;
740 }
741
742 if (!prog_->SearchDFA(subtext, text, anchor, kind,
743 matchp, &dfa_failed, NULL)) {
744 if (dfa_failed) {
745 if (options_.log_errors())
746 LOG(ERROR) << "DFA out of memory: "
747 << "pattern length " << pattern_.size() << ", "
748 << "program size " << prog_->size() << ", "
749 << "list count " << prog_->list_count() << ", "
750 << "bytemap range " << prog_->bytemap_range();
751 // Fall back to NFA below.
752 skipped_test = true;
753 break;
754 }
755 return false;
756 }
757 if (matchp == NULL) // Matched. Don't care where.
758 return true;
759 // SearchDFA set match.end() but didn't know where the
760 // match started. Run the regexp backward from match.end()
761 // to find the longest possible match -- that's where it started.
762 Prog* prog = ReverseProg();
763 if (prog == NULL) {
764 // Fall back to NFA below.
765 skipped_test = true;
766 break;
767 }
768 if (!prog->SearchDFA(match, text, Prog::kAnchored,
769 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
770 if (dfa_failed) {
771 if (options_.log_errors())
772 LOG(ERROR) << "DFA out of memory: "
773 << "pattern length " << pattern_.size() << ", "
774 << "program size " << prog->size() << ", "
775 << "list count " << prog->list_count() << ", "
776 << "bytemap range " << prog->bytemap_range();
777 // Fall back to NFA below.
778 skipped_test = true;
779 break;
780 }
781 if (options_.log_errors())
782 LOG(ERROR) << "SearchDFA inconsistency";
783 return false;
784 }
785 break;
786 }
787
788 case ANCHOR_BOTH:
789 case ANCHOR_START:
790 if (re_anchor == ANCHOR_BOTH)
791 kind = Prog::kFullMatch;
792 anchor = Prog::kAnchored;
793
794 // If only a small amount of text and need submatch
795 // information anyway and we're going to use OnePass or BitState
796 // to get it, we might as well not even bother with the DFA:
797 // OnePass or BitState will be fast enough.
798 // On tiny texts, OnePass outruns even the DFA, and
799 // it doesn't have the shared state and occasional mutex that
800 // the DFA does.
801 if (can_one_pass && text.size() <= 4096 &&
802 (ncap > 1 || text.size() <= 8)) {
803 skipped_test = true;
804 break;
805 }
806 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
807 skipped_test = true;
808 break;
809 }
810 if (!prog_->SearchDFA(subtext, text, anchor, kind,
811 &match, &dfa_failed, NULL)) {
812 if (dfa_failed) {
813 if (options_.log_errors())
814 LOG(ERROR) << "DFA out of memory: "
815 << "pattern length " << pattern_.size() << ", "
816 << "program size " << prog_->size() << ", "
817 << "list count " << prog_->list_count() << ", "
818 << "bytemap range " << prog_->bytemap_range();
819 // Fall back to NFA below.
820 skipped_test = true;
821 break;
822 }
823 return false;
824 }
825 break;
826 }
827
828 if (!skipped_test && ncap <= 1) {
829 // We know exactly where it matches. That's enough.
830 if (ncap == 1)
831 submatch[0] = match;
832 } else {
833 StringPiece subtext1;
834 if (skipped_test) {
835 // DFA ran out of memory or was skipped:
836 // need to search in entire original text.
837 subtext1 = subtext;
838 } else {
839 // DFA found the exact match location:
840 // let NFA run an anchored, full match search
841 // to find submatch locations.
842 subtext1 = match;
843 anchor = Prog::kAnchored;
844 kind = Prog::kFullMatch;
845 }
846
847 if (can_one_pass && anchor != Prog::kUnanchored) {
848 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
849 if (!skipped_test && options_.log_errors())
850 LOG(ERROR) << "SearchOnePass inconsistency";
851 return false;
852 }
853 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
854 if (!prog_->SearchBitState(subtext1, text, anchor,
855 kind, submatch, ncap)) {
856 if (!skipped_test && options_.log_errors())
857 LOG(ERROR) << "SearchBitState inconsistency";
858 return false;
859 }
860 } else {
861 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
862 if (!skipped_test && options_.log_errors())
863 LOG(ERROR) << "SearchNFA inconsistency";
864 return false;
865 }
866 }
867 }
868
869 // Adjust overall match for required prefix that we stripped off.
870 if (prefixlen > 0 && nsubmatch > 0)
871 submatch[0] = StringPiece(submatch[0].data() - prefixlen,
872 submatch[0].size() + prefixlen);
873
874 // Zero submatches that don't exist in the regexp.
875 for (int i = ncap; i < nsubmatch; i++)
876 submatch[i] = StringPiece();
877 return true;
878 }
879
880 // Internal matcher - like Match() but takes Args not StringPieces.
881 bool RE2::DoMatch(const StringPiece& text,
882 Anchor re_anchor,
883 size_t* consumed,
884 const Arg* const* args,
885 int n) const {
886 if (!ok()) {
887 if (options_.log_errors())
888 LOG(ERROR) << "Invalid RE2: " << *error_;
889 return false;
890 }
891
892 if (NumberOfCapturingGroups() < n) {
893 // RE has fewer capturing groups than number of Arg pointers passed in.
894 return false;
895 }
896
897 // Count number of capture groups needed.
898 int nvec;
899 if (n == 0 && consumed == NULL)
900 nvec = 0;
901 else
902 nvec = n+1;
903
904 StringPiece* vec;
905 StringPiece stkvec[kVecSize];
906 StringPiece* heapvec = NULL;
907
908 if (nvec <= static_cast<int>(arraysize(stkvec))) {
909 vec = stkvec;
910 } else {
911 vec = new StringPiece[nvec];
912 heapvec = vec;
913 }
914
915 if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
916 delete[] heapvec;
917 return false;
918 }
919
920 if (consumed != NULL)
921 *consumed = static_cast<size_t>(vec[0].end() - text.begin());
922
923 if (n == 0 || args == NULL) {
924 // We are not interested in results
925 delete[] heapvec;
926 return true;
927 }
928
929 // If we got here, we must have matched the whole pattern.
930 for (int i = 0; i < n; i++) {
931 const StringPiece& s = vec[i+1];
932 if (!args[i]->Parse(s.data(), s.size())) {
933 // TODO: Should we indicate what the error was?
934 delete[] heapvec;
935 return false;
936 }
937 }
938
939 delete[] heapvec;
940 return true;
941 }
942
943 // Checks that the rewrite string is well-formed with respect to this
944 // regular expression.
945 bool RE2::CheckRewriteString(const StringPiece& rewrite,
946 std::string* error) const {
947 int max_token = -1;
948 for (const char *s = rewrite.data(), *end = s + rewrite.size();
949 s < end; s++) {
950 int c = *s;
951 if (c != '\\') {
952 continue;
953 }
954 if (++s == end) {
955 *error = "Rewrite schema error: '\\' not allowed at end.";
956 return false;
957 }
958 c = *s;
959 if (c == '\\') {
960 continue;
961 }
962 if (!isdigit(c)) {
963 *error = "Rewrite schema error: "
964 "'\\' must be followed by a digit or '\\'.";
965 return false;
966 }
967 int n = (c - '0');
968 if (max_token < n) {
969 max_token = n;
970 }
971 }
972
973 if (max_token > NumberOfCapturingGroups()) {
974 *error = StringPrintf(
975 "Rewrite schema requests %d matches, but the regexp only has %d "
976 "parenthesized subexpressions.",
977 max_token, NumberOfCapturingGroups());
978 return false;
979 }
980 return true;
981 }
982
983 // Returns the maximum submatch needed for the rewrite to be done by Replace().
984 // E.g. if rewrite == "foo \\2,\\1", returns 2.
985 int RE2::MaxSubmatch(const StringPiece& rewrite) {
986 int max = 0;
987 for (const char *s = rewrite.data(), *end = s + rewrite.size();
988 s < end; s++) {
989 if (*s == '\\') {
990 s++;
991 int c = (s < end) ? *s : -1;
992 if (isdigit(c)) {
993 int n = (c - '0');
994 if (n > max)
995 max = n;
996 }
997 }
998 }
999 return max;
1000 }
1001
1002 // Append the "rewrite" string, with backslash subsitutions from "vec",
1003 // to string "out".
1004 bool RE2::Rewrite(std::string* out,
1005 const StringPiece& rewrite,
1006 const StringPiece* vec,
1007 int veclen) const {
1008 for (const char *s = rewrite.data(), *end = s + rewrite.size();
1009 s < end; s++) {
1010 if (*s != '\\') {
1011 out->push_back(*s);
1012 continue;
1013 }
1014 s++;
1015 int c = (s < end) ? *s : -1;
1016 if (isdigit(c)) {
1017 int n = (c - '0');
1018 if (n >= veclen) {
1019 if (options_.log_errors()) {
1020 LOG(ERROR) << "invalid substitution \\" << n
1021 << " from " << veclen << " groups";
1022 }
1023 return false;
1024 }
1025 StringPiece snip = vec[n];
1026 if (!snip.empty())
1027 out->append(snip.data(), snip.size());
1028 } else if (c == '\\') {
1029 out->push_back('\\');
1030 } else {
1031 if (options_.log_errors())
1032 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
1033 return false;
1034 }
1035 }
1036 return true;
1037 }
1038
1039 /***** Parsers for various types *****/
1040
1041 bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
1042 // We fail if somebody asked us to store into a non-NULL void* pointer
1043 return (dest == NULL);
1044 }
1045
1046 bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
1047 if (dest == NULL) return true;
1048 reinterpret_cast<std::string*>(dest)->assign(str, n);
1049 return true;
1050 }
1051
1052 bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
1053 if (dest == NULL) return true;
1054 *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
1055 return true;
1056 }
1057
1058 bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
1059 if (n != 1) return false;
1060 if (dest == NULL) return true;
1061 *(reinterpret_cast<char*>(dest)) = str[0];
1062 return true;
1063 }
1064
1065 bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
1066 if (n != 1) return false;
1067 if (dest == NULL) return true;
1068 *(reinterpret_cast<signed char*>(dest)) = str[0];
1069 return true;
1070 }
1071
1072 bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
1073 if (n != 1) return false;
1074 if (dest == NULL) return true;
1075 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
1076 return true;
1077 }
1078
1079 // Largest number spec that we are willing to parse
1080 static const int kMaxNumberLength = 32;
1081
1082 // REQUIRES "buf" must have length at least nbuf.
1083 // Copies "str" into "buf" and null-terminates.
1084 // Overwrites *np with the new length.
1085 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
1086 size_t* np, bool accept_spaces) {
1087 size_t n = *np;
1088 if (n == 0) return "";
1089 if (n > 0 && isspace(*str)) {
1090 // We are less forgiving than the strtoxxx() routines and do not
1091 // allow leading spaces. We do allow leading spaces for floats.
1092 if (!accept_spaces) {
1093 return "";
1094 }
1095 while (n > 0 && isspace(*str)) {
1096 n--;
1097 str++;
1098 }
1099 }
1100
1101 // Although buf has a fixed maximum size, we can still handle
1102 // arbitrarily large integers correctly by omitting leading zeros.
1103 // (Numbers that are still too long will be out of range.)
1104 // Before deciding whether str is too long,
1105 // remove leading zeros with s/000+/00/.
1106 // Leaving the leading two zeros in place means that
1107 // we don't change 0000x123 (invalid) into 0x123 (valid).
1108 // Skip over leading - before replacing.
1109 bool neg = false;
1110 if (n >= 1 && str[0] == '-') {
1111 neg = true;
1112 n--;
1113 str++;
1114 }
1115
1116 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1117 while (n >= 3 && str[2] == '0') {
1118 n--;
1119 str++;
1120 }
1121 }
1122
1123 if (neg) { // make room in buf for -
1124 n++;
1125 str--;
1126 }
1127
1128 if (n > nbuf-1) return "";
1129
1130 memmove(buf, str, n);
1131 if (neg) {
1132 buf[0] = '-';
1133 }
1134 buf[n] = '\0';
1135 *np = n;
1136 return buf;
1137 }
1138
1139 bool RE2::Arg::parse_long_radix(const char* str,
1140 size_t n,
1141 void* dest,
1142 int radix) {
1143 if (n == 0) return false;
1144 char buf[kMaxNumberLength+1];
1145 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1146 char* end;
1147 errno = 0;
1148 long r = strtol(str, &end, radix);
1149 if (end != str + n) return false; // Leftover junk
1150 if (errno) return false;
1151 if (dest == NULL) return true;
1152 *(reinterpret_cast<long*>(dest)) = r;
1153 return true;
1154 }
1155
1156 bool RE2::Arg::parse_ulong_radix(const char* str,
1157 size_t n,
1158 void* dest,
1159 int radix) {
1160 if (n == 0) return false;
1161 char buf[kMaxNumberLength+1];
1162 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1163 if (str[0] == '-') {
1164 // strtoul() will silently accept negative numbers and parse
1165 // them. This module is more strict and treats them as errors.
1166 return false;
1167 }
1168
1169 char* end;
1170 errno = 0;
1171 unsigned long r = strtoul(str, &end, radix);
1172 if (end != str + n) return false; // Leftover junk
1173 if (errno) return false;
1174 if (dest == NULL) return true;
1175 *(reinterpret_cast<unsigned long*>(dest)) = r;
1176 return true;
1177 }
1178
1179 bool RE2::Arg::parse_short_radix(const char* str,
1180 size_t n,
1181 void* dest,
1182 int radix) {
1183 long r;
1184 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1185 if ((short)r != r) return false; // Out of range
1186 if (dest == NULL) return true;
1187 *(reinterpret_cast<short*>(dest)) = (short)r;
1188 return true;
1189 }
1190
1191 bool RE2::Arg::parse_ushort_radix(const char* str,
1192 size_t n,
1193 void* dest,
1194 int radix) {
1195 unsigned long r;
1196 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1197 if ((unsigned short)r != r) return false; // Out of range
1198 if (dest == NULL) return true;
1199 *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1200 return true;
1201 }
1202
1203 bool RE2::Arg::parse_int_radix(const char* str,
1204 size_t n,
1205 void* dest,
1206 int radix) {
1207 long r;
1208 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1209 if ((int)r != r) return false; // Out of range
1210 if (dest == NULL) return true;
1211 *(reinterpret_cast<int*>(dest)) = (int)r;
1212 return true;
1213 }
1214
1215 bool RE2::Arg::parse_uint_radix(const char* str,
1216 size_t n,
1217 void* dest,
1218 int radix) {
1219 unsigned long r;
1220 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1221 if ((unsigned int)r != r) return false; // Out of range
1222 if (dest == NULL) return true;
1223 *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1224 return true;
1225 }
1226
1227 bool RE2::Arg::parse_longlong_radix(const char* str,
1228 size_t n,
1229 void* dest,
1230 int radix) {
1231 if (n == 0) return false;
1232 char buf[kMaxNumberLength+1];
1233 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1234 char* end;
1235 errno = 0;
1236 long long r = strtoll(str, &end, radix);
1237 if (end != str + n) return false; // Leftover junk
1238 if (errno) return false;
1239 if (dest == NULL) return true;
1240 *(reinterpret_cast<long long*>(dest)) = r;
1241 return true;
1242 }
1243
1244 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1245 size_t n,
1246 void* dest,
1247 int radix) {
1248 if (n == 0) return false;
1249 char buf[kMaxNumberLength+1];
1250 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1251 if (str[0] == '-') {
1252 // strtoull() will silently accept negative numbers and parse
1253 // them. This module is more strict and treats them as errors.
1254 return false;
1255 }
1256 char* end;
1257 errno = 0;
1258 unsigned long long r = strtoull(str, &end, radix);
1259 if (end != str + n) return false; // Leftover junk
1260 if (errno) return false;
1261 if (dest == NULL) return true;
1262 *(reinterpret_cast<unsigned long long*>(dest)) = r;
1263 return true;
1264 }
1265
1266 static bool parse_double_float(const char* str, size_t n, bool isfloat,
1267 void* dest) {
1268 if (n == 0) return false;
1269 static const int kMaxLength = 200;
1270 char buf[kMaxLength+1];
1271 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1272 char* end;
1273 errno = 0;
1274 double r;
1275 if (isfloat) {
1276 r = strtof(str, &end);
1277 } else {
1278 r = strtod(str, &end);
1279 }
1280 if (end != str + n) return false; // Leftover junk
1281 if (errno) return false;
1282 if (dest == NULL) return true;
1283 if (isfloat) {
1284 *(reinterpret_cast<float*>(dest)) = (float)r;
1285 } else {
1286 *(reinterpret_cast<double*>(dest)) = r;
1287 }
1288 return true;
1289 }
1290
1291 bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1292 return parse_double_float(str, n, false, dest);
1293 }
1294
1295 bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1296 return parse_double_float(str, n, true, dest);
1297 }
1298
1299 #define DEFINE_INTEGER_PARSER(name) \
1300 bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \
1301 return parse_##name##_radix(str, n, dest, 10); \
1302 } \
1303 bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
1304 return parse_##name##_radix(str, n, dest, 16); \
1305 } \
1306 bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1307 return parse_##name##_radix(str, n, dest, 8); \
1308 } \
1309 bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \
1310 void* dest) { \
1311 return parse_##name##_radix(str, n, dest, 0); \
1312 }
1313
1314 DEFINE_INTEGER_PARSER(short)
1315 DEFINE_INTEGER_PARSER(ushort)
1316 DEFINE_INTEGER_PARSER(int)
1317 DEFINE_INTEGER_PARSER(uint)
1318 DEFINE_INTEGER_PARSER(long)
1319 DEFINE_INTEGER_PARSER(ulong)
1320 DEFINE_INTEGER_PARSER(longlong)
1321 DEFINE_INTEGER_PARSER(ulonglong)
1322
1323 #undef DEFINE_INTEGER_PARSER
1324
1325 namespace hooks {
1326
1327 #ifdef RE2_HAVE_THREAD_LOCAL
1328 thread_local const RE2* context = NULL;
1329 #endif
1330
1331 template <typename T>
1332 union Hook {
1333 void Store(T* cb) { cb_.store(cb, std::memory_order_release); }
1334 T* Load() const { return cb_.load(std::memory_order_acquire); }
1335
1336 #if !defined(__clang__) && defined(_MSC_VER)
1337 // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent,
1338 // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC.
1339 static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
1340 "std::atomic<T*> must be always lock-free");
1341 T* cb_for_constinit_;
1342 #endif
1343
1344 std::atomic<T*> cb_;
1345 };
1346
1347 template <typename T>
1348 static void DoNothing(const T&) {}
1349
1350 #define DEFINE_HOOK(type, name) \
1351 static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \
1352 void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \
1353 type##Callback* Get##type##Hook() { return name##_hook.Load(); }
1354
1355 DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset)
1356 DEFINE_HOOK(DFASearchFailure, dfa_search_failure)
1357
1358 #undef DEFINE_HOOK
1359
1360 } // namespace hooks
1361
1362 } // namespace re2
1363