1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <assert.h>
13 #include <ctype.h>
14 #include <errno.h>
15 #include <stdint.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <algorithm>
19 #include <iterator>
20 #include <mutex>
21 #include <string>
22 #include <utility>
23 #include <vector>
24
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "util/strutil.h"
28 #include "util/utf.h"
29 #include "re2/prog.h"
30 #include "re2/regexp.h"
31 #include "re2/sparse_array.h"
32
33 namespace re2 {
34
35 // Maximum number of args we can set
36 static const int kMaxArgs = 16;
37 static const int kVecSize = 1+kMaxArgs;
38
39 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
40
Options(RE2::CannedOptions opt)41 RE2::Options::Options(RE2::CannedOptions opt)
42 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
43 posix_syntax_(opt == RE2::POSIX),
44 longest_match_(opt == RE2::POSIX),
45 log_errors_(opt != RE2::Quiet),
46 max_mem_(kDefaultMaxMem),
47 literal_(false),
48 never_nl_(false),
49 dot_nl_(false),
50 never_capture_(false),
51 case_sensitive_(true),
52 perl_classes_(false),
53 word_boundary_(false),
54 one_line_(false) {
55 }
56
57 // static empty objects for use as const references.
58 // To avoid global constructors, allocated in RE2::Init().
59 static const std::string* empty_string;
60 static const std::map<std::string, int>* empty_named_groups;
61 static const std::map<int, std::string>* empty_group_names;
62
63 // Converts from Regexp error code to RE2 error code.
64 // Maybe some day they will diverge. In any event, this
65 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)66 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
67 switch (code) {
68 case re2::kRegexpSuccess:
69 return RE2::NoError;
70 case re2::kRegexpInternalError:
71 return RE2::ErrorInternal;
72 case re2::kRegexpBadEscape:
73 return RE2::ErrorBadEscape;
74 case re2::kRegexpBadCharClass:
75 return RE2::ErrorBadCharClass;
76 case re2::kRegexpBadCharRange:
77 return RE2::ErrorBadCharRange;
78 case re2::kRegexpMissingBracket:
79 return RE2::ErrorMissingBracket;
80 case re2::kRegexpMissingParen:
81 return RE2::ErrorMissingParen;
82 case re2::kRegexpTrailingBackslash:
83 return RE2::ErrorTrailingBackslash;
84 case re2::kRegexpRepeatArgument:
85 return RE2::ErrorRepeatArgument;
86 case re2::kRegexpRepeatSize:
87 return RE2::ErrorRepeatSize;
88 case re2::kRegexpRepeatOp:
89 return RE2::ErrorRepeatOp;
90 case re2::kRegexpBadPerlOp:
91 return RE2::ErrorBadPerlOp;
92 case re2::kRegexpBadUTF8:
93 return RE2::ErrorBadUTF8;
94 case re2::kRegexpBadNamedCapture:
95 return RE2::ErrorBadNamedCapture;
96 }
97 return RE2::ErrorInternal;
98 }
99
trunc(const StringPiece & pattern)100 static std::string trunc(const StringPiece& pattern) {
101 if (pattern.size() < 100)
102 return std::string(pattern);
103 return std::string(pattern.substr(0, 100)) + "...";
104 }
105
106
RE2(const char * pattern)107 RE2::RE2(const char* pattern) {
108 Init(pattern, DefaultOptions);
109 }
110
RE2(const std::string & pattern)111 RE2::RE2(const std::string& pattern) {
112 Init(pattern, DefaultOptions);
113 }
114
RE2(const StringPiece & pattern)115 RE2::RE2(const StringPiece& pattern) {
116 Init(pattern, DefaultOptions);
117 }
118
RE2(const StringPiece & pattern,const Options & options)119 RE2::RE2(const StringPiece& pattern, const Options& options) {
120 Init(pattern, options);
121 }
122
ParseFlags() const123 int RE2::Options::ParseFlags() const {
124 int flags = Regexp::ClassNL;
125 switch (encoding()) {
126 default:
127 if (log_errors())
128 LOG(ERROR) << "Unknown encoding " << encoding();
129 break;
130 case RE2::Options::EncodingUTF8:
131 break;
132 case RE2::Options::EncodingLatin1:
133 flags |= Regexp::Latin1;
134 break;
135 }
136
137 if (!posix_syntax())
138 flags |= Regexp::LikePerl;
139
140 if (literal())
141 flags |= Regexp::Literal;
142
143 if (never_nl())
144 flags |= Regexp::NeverNL;
145
146 if (dot_nl())
147 flags |= Regexp::DotNL;
148
149 if (never_capture())
150 flags |= Regexp::NeverCapture;
151
152 if (!case_sensitive())
153 flags |= Regexp::FoldCase;
154
155 if (perl_classes())
156 flags |= Regexp::PerlClasses;
157
158 if (word_boundary())
159 flags |= Regexp::PerlB;
160
161 if (one_line())
162 flags |= Regexp::OneLine;
163
164 return flags;
165 }
166
Init(const StringPiece & pattern,const Options & options)167 void RE2::Init(const StringPiece& pattern, const Options& options) {
168 static std::once_flag empty_once;
169 std::call_once(empty_once, []() {
170 empty_string = new std::string;
171 empty_named_groups = new std::map<std::string, int>;
172 empty_group_names = new std::map<int, std::string>;
173 });
174
175 pattern_ = std::string(pattern);
176 options_.Copy(options);
177 entire_regexp_ = NULL;
178 suffix_regexp_ = NULL;
179 prog_ = NULL;
180 num_captures_ = -1;
181 rprog_ = NULL;
182 error_ = empty_string;
183 error_code_ = NoError;
184 named_groups_ = NULL;
185 group_names_ = NULL;
186
187 RegexpStatus status;
188 entire_regexp_ = Regexp::Parse(
189 pattern_,
190 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191 &status);
192 if (entire_regexp_ == NULL) {
193 if (options_.log_errors()) {
194 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
195 << status.Text();
196 }
197 error_ = new std::string(status.Text());
198 error_code_ = RegexpErrorToRE2(status.code());
199 error_arg_ = std::string(status.error_arg());
200 return;
201 }
202
203 re2::Regexp* suffix;
204 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
205 suffix_regexp_ = suffix;
206 else
207 suffix_regexp_ = entire_regexp_->Incref();
208
209 // Two thirds of the memory goes to the forward Prog,
210 // one third to the reverse prog, because the forward
211 // Prog has two DFAs but the reverse prog has one.
212 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
213 if (prog_ == NULL) {
214 if (options_.log_errors())
215 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
216 error_ = new std::string("pattern too large - compile failed");
217 error_code_ = RE2::ErrorPatternTooLarge;
218 return;
219 }
220
221 // We used to compute this lazily, but it's used during the
222 // typical control flow for a match call, so we now compute
223 // it eagerly, which avoids the overhead of std::once_flag.
224 num_captures_ = suffix_regexp_->NumCaptures();
225
226 // Could delay this until the first match call that
227 // cares about submatch information, but the one-pass
228 // machine's memory gets cut from the DFA memory budget,
229 // and that is harder to do if the DFA has already
230 // been built.
231 is_one_pass_ = prog_->IsOnePass();
232 }
233
234 // Returns rprog_, computing it if needed.
ReverseProg() const235 re2::Prog* RE2::ReverseProg() const {
236 std::call_once(rprog_once_, [](const RE2* re) {
237 re->rprog_ =
238 re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3);
239 if (re->rprog_ == NULL) {
240 if (re->options_.log_errors())
241 LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'";
242 re->error_ =
243 new std::string("pattern too large - reverse compile failed");
244 re->error_code_ = RE2::ErrorPatternTooLarge;
245 }
246 }, this);
247 return rprog_;
248 }
249
~RE2()250 RE2::~RE2() {
251 if (suffix_regexp_)
252 suffix_regexp_->Decref();
253 if (entire_regexp_)
254 entire_regexp_->Decref();
255 delete prog_;
256 delete rprog_;
257 if (error_ != empty_string)
258 delete error_;
259 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
260 delete named_groups_;
261 if (group_names_ != NULL && group_names_ != empty_group_names)
262 delete group_names_;
263 }
264
ProgramSize() const265 int RE2::ProgramSize() const {
266 if (prog_ == NULL)
267 return -1;
268 return prog_->size();
269 }
270
ReverseProgramSize() const271 int RE2::ReverseProgramSize() const {
272 if (prog_ == NULL)
273 return -1;
274 Prog* prog = ReverseProg();
275 if (prog == NULL)
276 return -1;
277 return prog->size();
278 }
279
Fanout(Prog * prog,std::map<int,int> * histogram)280 static int Fanout(Prog* prog, std::map<int, int>* histogram) {
281 SparseArray<int> fanout(prog->size());
282 prog->Fanout(&fanout);
283 histogram->clear();
284 for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) {
285 // TODO(junyer): Optimise this?
286 int bucket = 0;
287 while (1 << bucket < i->value()) {
288 bucket++;
289 }
290 (*histogram)[bucket]++;
291 }
292 return histogram->rbegin()->first;
293 }
294
ProgramFanout(std::map<int,int> * histogram) const295 int RE2::ProgramFanout(std::map<int, int>* histogram) const {
296 if (prog_ == NULL)
297 return -1;
298 return Fanout(prog_, histogram);
299 }
300
ReverseProgramFanout(std::map<int,int> * histogram) const301 int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const {
302 if (prog_ == NULL)
303 return -1;
304 Prog* prog = ReverseProg();
305 if (prog == NULL)
306 return -1;
307 return Fanout(prog, histogram);
308 }
309
310 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const311 const std::map<std::string, int>& RE2::NamedCapturingGroups() const {
312 std::call_once(named_groups_once_, [](const RE2* re) {
313 if (re->suffix_regexp_ != NULL)
314 re->named_groups_ = re->suffix_regexp_->NamedCaptures();
315 if (re->named_groups_ == NULL)
316 re->named_groups_ = empty_named_groups;
317 }, this);
318 return *named_groups_;
319 }
320
321 // Returns group_names_, computing it if needed.
CapturingGroupNames() const322 const std::map<int, std::string>& RE2::CapturingGroupNames() const {
323 std::call_once(group_names_once_, [](const RE2* re) {
324 if (re->suffix_regexp_ != NULL)
325 re->group_names_ = re->suffix_regexp_->CaptureNames();
326 if (re->group_names_ == NULL)
327 re->group_names_ = empty_group_names;
328 }, this);
329 return *group_names_;
330 }
331
332 /***** Convenience interfaces *****/
333
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)334 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
335 const Arg* const args[], int n) {
336 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
337 }
338
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)339 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
340 const Arg* const args[], int n) {
341 return re.DoMatch(text, UNANCHORED, NULL, args, n);
342 }
343
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)344 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
345 const Arg* const args[], int n) {
346 size_t consumed;
347 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
348 input->remove_prefix(consumed);
349 return true;
350 } else {
351 return false;
352 }
353 }
354
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)355 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
356 const Arg* const args[], int n) {
357 size_t consumed;
358 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
359 input->remove_prefix(consumed);
360 return true;
361 } else {
362 return false;
363 }
364 }
365
Replace(std::string * str,const RE2 & re,const StringPiece & rewrite)366 bool RE2::Replace(std::string* str,
367 const RE2& re,
368 const StringPiece& rewrite) {
369 StringPiece vec[kVecSize];
370 int nvec = 1 + MaxSubmatch(rewrite);
371 if (nvec > static_cast<int>(arraysize(vec)))
372 return false;
373 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
374 return false;
375
376 std::string s;
377 if (!re.Rewrite(&s, rewrite, vec, nvec))
378 return false;
379
380 assert(vec[0].data() >= str->data());
381 assert(vec[0].data() + vec[0].size() <= str->data() + str->size());
382 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
383 return true;
384 }
385
GlobalReplace(std::string * str,const RE2 & re,const StringPiece & rewrite)386 int RE2::GlobalReplace(std::string* str,
387 const RE2& re,
388 const StringPiece& rewrite) {
389 StringPiece vec[kVecSize];
390 int nvec = 1 + MaxSubmatch(rewrite);
391 if (nvec > static_cast<int>(arraysize(vec)))
392 return false;
393
394 const char* p = str->data();
395 const char* ep = p + str->size();
396 const char* lastend = NULL;
397 std::string out;
398 int count = 0;
399 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
400 // Iterate just once when fuzzing. Otherwise, we easily get bogged down
401 // and coverage is unlikely to improve despite significant expense.
402 while (p == str->data()) {
403 #else
404 while (p <= ep) {
405 #endif
406 if (!re.Match(*str, static_cast<size_t>(p - str->data()),
407 str->size(), UNANCHORED, vec, nvec))
408 break;
409 if (p < vec[0].data())
410 out.append(p, vec[0].data() - p);
411 if (vec[0].data() == lastend && vec[0].empty()) {
412 // Disallow empty match at end of last match: skip ahead.
413 //
414 // fullrune() takes int, not ptrdiff_t. However, it just looks
415 // at the leading byte and treats any length >= 4 the same.
416 if (re.options().encoding() == RE2::Options::EncodingUTF8 &&
417 fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) {
418 // re is in UTF-8 mode and there is enough left of str
419 // to allow us to advance by up to UTFmax bytes.
420 Rune r;
421 int n = chartorune(&r, p);
422 // Some copies of chartorune have a bug that accepts
423 // encodings of values in (10FFFF, 1FFFFF] as valid.
424 if (r > Runemax) {
425 n = 1;
426 r = Runeerror;
427 }
428 if (!(n == 1 && r == Runeerror)) { // no decoding error
429 out.append(p, n);
430 p += n;
431 continue;
432 }
433 }
434 // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode,
435 // we fell through from above and the GIGO principle applies.
436 if (p < ep)
437 out.append(p, 1);
438 p++;
439 continue;
440 }
441 re.Rewrite(&out, rewrite, vec, nvec);
442 p = vec[0].data() + vec[0].size();
443 lastend = p;
444 count++;
445 }
446
447 if (count == 0)
448 return 0;
449
450 if (p < ep)
451 out.append(p, ep - p);
452 using std::swap;
453 swap(out, *str);
454 return count;
455 }
456
457 bool RE2::Extract(const StringPiece& text,
458 const RE2& re,
459 const StringPiece& rewrite,
460 std::string* out) {
461 StringPiece vec[kVecSize];
462 int nvec = 1 + MaxSubmatch(rewrite);
463 if (nvec > static_cast<int>(arraysize(vec)))
464 return false;
465
466 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
467 return false;
468
469 out->clear();
470 return re.Rewrite(out, rewrite, vec, nvec);
471 }
472
473 std::string RE2::QuoteMeta(const StringPiece& unquoted) {
474 std::string result;
475 result.reserve(unquoted.size() << 1);
476
477 // Escape any ascii character not in [A-Za-z_0-9].
478 //
479 // Note that it's legal to escape a character even if it has no
480 // special meaning in a regular expression -- so this function does
481 // that. (This also makes it identical to the perl function of the
482 // same name except for the null-character special case;
483 // see `perldoc -f quotemeta`.)
484 for (size_t ii = 0; ii < unquoted.size(); ++ii) {
485 // Note that using 'isalnum' here raises the benchmark time from
486 // 32ns to 58ns:
487 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
488 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
489 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
490 unquoted[ii] != '_' &&
491 // If this is the part of a UTF8 or Latin1 character, we need
492 // to copy this byte without escaping. Experimentally this is
493 // what works correctly with the regexp library.
494 !(unquoted[ii] & 128)) {
495 if (unquoted[ii] == '\0') { // Special handling for null chars.
496 // Note that this special handling is not strictly required for RE2,
497 // but this quoting is required for other regexp libraries such as
498 // PCRE.
499 // Can't use "\\0" since the next character might be a digit.
500 result += "\\x00";
501 continue;
502 }
503 result += '\\';
504 }
505 result += unquoted[ii];
506 }
507
508 return result;
509 }
510
511 bool RE2::PossibleMatchRange(std::string* min, std::string* max,
512 int maxlen) const {
513 if (prog_ == NULL)
514 return false;
515
516 int n = static_cast<int>(prefix_.size());
517 if (n > maxlen)
518 n = maxlen;
519
520 // Determine initial min max from prefix_ literal.
521 *min = prefix_.substr(0, n);
522 *max = prefix_.substr(0, n);
523 if (prefix_foldcase_) {
524 // prefix is ASCII lowercase; change *min to uppercase.
525 for (int i = 0; i < n; i++) {
526 char& c = (*min)[i];
527 if ('a' <= c && c <= 'z')
528 c += 'A' - 'a';
529 }
530 }
531
532 // Add to prefix min max using PossibleMatchRange on regexp.
533 std::string dmin, dmax;
534 maxlen -= n;
535 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
536 min->append(dmin);
537 max->append(dmax);
538 } else if (!max->empty()) {
539 // prog_->PossibleMatchRange has failed us,
540 // but we still have useful information from prefix_.
541 // Round up *max to allow any possible suffix.
542 PrefixSuccessor(max);
543 } else {
544 // Nothing useful.
545 *min = "";
546 *max = "";
547 return false;
548 }
549
550 return true;
551 }
552
553 // Avoid possible locale nonsense in standard strcasecmp.
554 // The string a is known to be all lowercase.
555 static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
556 const char* ae = a + len;
557
558 for (; a < ae; a++, b++) {
559 uint8_t x = *a;
560 uint8_t y = *b;
561 if ('A' <= y && y <= 'Z')
562 y += 'a' - 'A';
563 if (x != y)
564 return x - y;
565 }
566 return 0;
567 }
568
569
570 /***** Actual matching and rewriting code *****/
571
572 bool RE2::Match(const StringPiece& text,
573 size_t startpos,
574 size_t endpos,
575 Anchor re_anchor,
576 StringPiece* submatch,
577 int nsubmatch) const {
578 if (!ok()) {
579 if (options_.log_errors())
580 LOG(ERROR) << "Invalid RE2: " << *error_;
581 return false;
582 }
583
584 if (startpos > endpos || endpos > text.size()) {
585 if (options_.log_errors())
586 LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
587 << "startpos: " << startpos << ", "
588 << "endpos: " << endpos << ", "
589 << "text size: " << text.size() << "]";
590 return false;
591 }
592
593 StringPiece subtext = text;
594 subtext.remove_prefix(startpos);
595 subtext.remove_suffix(text.size() - endpos);
596
597 // Use DFAs to find exact location of match, filter out non-matches.
598
599 // Don't ask for the location if we won't use it.
600 // SearchDFA can do extra optimizations in that case.
601 StringPiece match;
602 StringPiece* matchp = &match;
603 if (nsubmatch == 0)
604 matchp = NULL;
605
606 int ncap = 1 + NumberOfCapturingGroups();
607 if (ncap > nsubmatch)
608 ncap = nsubmatch;
609
610 // If the regexp is anchored explicitly, must not be in middle of text.
611 if (prog_->anchor_start() && startpos != 0)
612 return false;
613
614 // If the regexp is anchored explicitly, update re_anchor
615 // so that we can potentially fall into a faster case below.
616 if (prog_->anchor_start() && prog_->anchor_end())
617 re_anchor = ANCHOR_BOTH;
618 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
619 re_anchor = ANCHOR_START;
620
621 // Check for the required prefix, if any.
622 size_t prefixlen = 0;
623 if (!prefix_.empty()) {
624 if (startpos != 0)
625 return false;
626 prefixlen = prefix_.size();
627 if (prefixlen > subtext.size())
628 return false;
629 if (prefix_foldcase_) {
630 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
631 return false;
632 } else {
633 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
634 return false;
635 }
636 subtext.remove_prefix(prefixlen);
637 // If there is a required prefix, the anchor must be at least ANCHOR_START.
638 if (re_anchor != ANCHOR_BOTH)
639 re_anchor = ANCHOR_START;
640 }
641
642 Prog::Anchor anchor = Prog::kUnanchored;
643 Prog::MatchKind kind = Prog::kFirstMatch;
644 if (options_.longest_match())
645 kind = Prog::kLongestMatch;
646 bool skipped_test = false;
647
648 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
649
650 // BitState allocates a bitmap of size prog_->list_count() * text.size().
651 // It also allocates a stack of 3-word structures which could potentially
652 // grow as large as prog_->list_count() * text.size(), but in practice is
653 // much smaller.
654 const int kMaxBitStateBitmapSize = 256*1024; // bitmap size <= max (bits)
655 bool can_bit_state = prog_->CanBitState();
656 size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();
657
658 bool dfa_failed = false;
659 switch (re_anchor) {
660 default:
661 case UNANCHORED: {
662 if (!prog_->SearchDFA(subtext, text, anchor, kind,
663 matchp, &dfa_failed, NULL)) {
664 if (dfa_failed) {
665 if (options_.log_errors())
666 LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
667 << "bytemap range " << prog_->bytemap_range() << ", "
668 << "list count " << prog_->list_count();
669 // Fall back to NFA below.
670 skipped_test = true;
671 break;
672 }
673 return false;
674 }
675 if (matchp == NULL) // Matched. Don't care where
676 return true;
677 // SearchDFA set match[0].end() but didn't know where the
678 // match started. Run the regexp backward from match[0].end()
679 // to find the longest possible match -- that's where it started.
680 Prog* prog = ReverseProg();
681 if (prog == NULL)
682 return false;
683 if (!prog->SearchDFA(match, text, Prog::kAnchored,
684 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
685 if (dfa_failed) {
686 if (options_.log_errors())
687 LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
688 << "bytemap range " << prog->bytemap_range() << ", "
689 << "list count " << prog->list_count();
690 // Fall back to NFA below.
691 skipped_test = true;
692 break;
693 }
694 if (options_.log_errors())
695 LOG(ERROR) << "SearchDFA inconsistency";
696 return false;
697 }
698 break;
699 }
700
701 case ANCHOR_BOTH:
702 case ANCHOR_START:
703 if (re_anchor == ANCHOR_BOTH)
704 kind = Prog::kFullMatch;
705 anchor = Prog::kAnchored;
706
707 // If only a small amount of text and need submatch
708 // information anyway and we're going to use OnePass or BitState
709 // to get it, we might as well not even bother with the DFA:
710 // OnePass or BitState will be fast enough.
711 // On tiny texts, OnePass outruns even the DFA, and
712 // it doesn't have the shared state and occasional mutex that
713 // the DFA does.
714 if (can_one_pass && text.size() <= 4096 &&
715 (ncap > 1 || text.size() <= 8)) {
716 skipped_test = true;
717 break;
718 }
719 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
720 skipped_test = true;
721 break;
722 }
723 if (!prog_->SearchDFA(subtext, text, anchor, kind,
724 &match, &dfa_failed, NULL)) {
725 if (dfa_failed) {
726 if (options_.log_errors())
727 LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
728 << "bytemap range " << prog_->bytemap_range() << ", "
729 << "list count " << prog_->list_count();
730 // Fall back to NFA below.
731 skipped_test = true;
732 break;
733 }
734 return false;
735 }
736 break;
737 }
738
739 if (!skipped_test && ncap <= 1) {
740 // We know exactly where it matches. That's enough.
741 if (ncap == 1)
742 submatch[0] = match;
743 } else {
744 StringPiece subtext1;
745 if (skipped_test) {
746 // DFA ran out of memory or was skipped:
747 // need to search in entire original text.
748 subtext1 = subtext;
749 } else {
750 // DFA found the exact match location:
751 // let NFA run an anchored, full match search
752 // to find submatch locations.
753 subtext1 = match;
754 anchor = Prog::kAnchored;
755 kind = Prog::kFullMatch;
756 }
757
758 if (can_one_pass && anchor != Prog::kUnanchored) {
759 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
760 if (!skipped_test && options_.log_errors())
761 LOG(ERROR) << "SearchOnePass inconsistency";
762 return false;
763 }
764 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
765 if (!prog_->SearchBitState(subtext1, text, anchor,
766 kind, submatch, ncap)) {
767 if (!skipped_test && options_.log_errors())
768 LOG(ERROR) << "SearchBitState inconsistency";
769 return false;
770 }
771 } else {
772 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
773 if (!skipped_test && options_.log_errors())
774 LOG(ERROR) << "SearchNFA inconsistency";
775 return false;
776 }
777 }
778 }
779
780 // Adjust overall match for required prefix that we stripped off.
781 if (prefixlen > 0 && nsubmatch > 0)
782 submatch[0] = StringPiece(submatch[0].data() - prefixlen,
783 submatch[0].size() + prefixlen);
784
785 // Zero submatches that don't exist in the regexp.
786 for (int i = ncap; i < nsubmatch; i++)
787 submatch[i] = StringPiece();
788 return true;
789 }
790
791 // Internal matcher - like Match() but takes Args not StringPieces.
792 bool RE2::DoMatch(const StringPiece& text,
793 Anchor re_anchor,
794 size_t* consumed,
795 const Arg* const* args,
796 int n) const {
797 if (!ok()) {
798 if (options_.log_errors())
799 LOG(ERROR) << "Invalid RE2: " << *error_;
800 return false;
801 }
802
803 if (NumberOfCapturingGroups() < n) {
804 // RE has fewer capturing groups than number of Arg pointers passed in.
805 return false;
806 }
807
808 // Count number of capture groups needed.
809 int nvec;
810 if (n == 0 && consumed == NULL)
811 nvec = 0;
812 else
813 nvec = n+1;
814
815 StringPiece* vec;
816 StringPiece stkvec[kVecSize];
817 StringPiece* heapvec = NULL;
818
819 if (nvec <= static_cast<int>(arraysize(stkvec))) {
820 vec = stkvec;
821 } else {
822 vec = new StringPiece[nvec];
823 heapvec = vec;
824 }
825
826 if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) {
827 delete[] heapvec;
828 return false;
829 }
830
831 if (consumed != NULL)
832 *consumed = static_cast<size_t>(vec[0].end() - text.begin());
833
834 if (n == 0 || args == NULL) {
835 // We are not interested in results
836 delete[] heapvec;
837 return true;
838 }
839
840 // If we got here, we must have matched the whole pattern.
841 for (int i = 0; i < n; i++) {
842 const StringPiece& s = vec[i+1];
843 if (!args[i]->Parse(s.data(), s.size())) {
844 // TODO: Should we indicate what the error was?
845 delete[] heapvec;
846 return false;
847 }
848 }
849
850 delete[] heapvec;
851 return true;
852 }
853
854 // Checks that the rewrite string is well-formed with respect to this
855 // regular expression.
856 bool RE2::CheckRewriteString(const StringPiece& rewrite,
857 std::string* error) const {
858 int max_token = -1;
859 for (const char *s = rewrite.data(), *end = s + rewrite.size();
860 s < end; s++) {
861 int c = *s;
862 if (c != '\\') {
863 continue;
864 }
865 if (++s == end) {
866 *error = "Rewrite schema error: '\\' not allowed at end.";
867 return false;
868 }
869 c = *s;
870 if (c == '\\') {
871 continue;
872 }
873 if (!isdigit(c)) {
874 *error = "Rewrite schema error: "
875 "'\\' must be followed by a digit or '\\'.";
876 return false;
877 }
878 int n = (c - '0');
879 if (max_token < n) {
880 max_token = n;
881 }
882 }
883
884 if (max_token > NumberOfCapturingGroups()) {
885 *error = StringPrintf(
886 "Rewrite schema requests %d matches, but the regexp only has %d "
887 "parenthesized subexpressions.",
888 max_token, NumberOfCapturingGroups());
889 return false;
890 }
891 return true;
892 }
893
894 // Returns the maximum submatch needed for the rewrite to be done by Replace().
895 // E.g. if rewrite == "foo \\2,\\1", returns 2.
896 int RE2::MaxSubmatch(const StringPiece& rewrite) {
897 int max = 0;
898 for (const char *s = rewrite.data(), *end = s + rewrite.size();
899 s < end; s++) {
900 if (*s == '\\') {
901 s++;
902 int c = (s < end) ? *s : -1;
903 if (isdigit(c)) {
904 int n = (c - '0');
905 if (n > max)
906 max = n;
907 }
908 }
909 }
910 return max;
911 }
912
913 // Append the "rewrite" string, with backslash subsitutions from "vec",
914 // to string "out".
915 bool RE2::Rewrite(std::string* out,
916 const StringPiece& rewrite,
917 const StringPiece* vec,
918 int veclen) const {
919 for (const char *s = rewrite.data(), *end = s + rewrite.size();
920 s < end; s++) {
921 if (*s != '\\') {
922 out->push_back(*s);
923 continue;
924 }
925 s++;
926 int c = (s < end) ? *s : -1;
927 if (isdigit(c)) {
928 int n = (c - '0');
929 if (n >= veclen) {
930 if (options_.log_errors()) {
931 LOG(ERROR) << "requested group " << n
932 << " in regexp " << rewrite.data();
933 }
934 return false;
935 }
936 StringPiece snip = vec[n];
937 if (!snip.empty())
938 out->append(snip.data(), snip.size());
939 } else if (c == '\\') {
940 out->push_back('\\');
941 } else {
942 if (options_.log_errors())
943 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
944 return false;
945 }
946 }
947 return true;
948 }
949
950 /***** Parsers for various types *****/
951
952 bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) {
953 // We fail if somebody asked us to store into a non-NULL void* pointer
954 return (dest == NULL);
955 }
956
957 bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) {
958 if (dest == NULL) return true;
959 reinterpret_cast<std::string*>(dest)->assign(str, n);
960 return true;
961 }
962
963 bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
964 if (dest == NULL) return true;
965 *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
966 return true;
967 }
968
969 bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) {
970 if (n != 1) return false;
971 if (dest == NULL) return true;
972 *(reinterpret_cast<char*>(dest)) = str[0];
973 return true;
974 }
975
976 bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) {
977 if (n != 1) return false;
978 if (dest == NULL) return true;
979 *(reinterpret_cast<signed char*>(dest)) = str[0];
980 return true;
981 }
982
983 bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) {
984 if (n != 1) return false;
985 if (dest == NULL) return true;
986 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
987 return true;
988 }
989
990 // Largest number spec that we are willing to parse
991 static const int kMaxNumberLength = 32;
992
993 // REQUIRES "buf" must have length at least nbuf.
994 // Copies "str" into "buf" and null-terminates.
995 // Overwrites *np with the new length.
996 static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
997 size_t* np, bool accept_spaces) {
998 size_t n = *np;
999 if (n == 0) return "";
1000 if (n > 0 && isspace(*str)) {
1001 // We are less forgiving than the strtoxxx() routines and do not
1002 // allow leading spaces. We do allow leading spaces for floats.
1003 if (!accept_spaces) {
1004 return "";
1005 }
1006 while (n > 0 && isspace(*str)) {
1007 n--;
1008 str++;
1009 }
1010 }
1011
1012 // Although buf has a fixed maximum size, we can still handle
1013 // arbitrarily large integers correctly by omitting leading zeros.
1014 // (Numbers that are still too long will be out of range.)
1015 // Before deciding whether str is too long,
1016 // remove leading zeros with s/000+/00/.
1017 // Leaving the leading two zeros in place means that
1018 // we don't change 0000x123 (invalid) into 0x123 (valid).
1019 // Skip over leading - before replacing.
1020 bool neg = false;
1021 if (n >= 1 && str[0] == '-') {
1022 neg = true;
1023 n--;
1024 str++;
1025 }
1026
1027 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1028 while (n >= 3 && str[2] == '0') {
1029 n--;
1030 str++;
1031 }
1032 }
1033
1034 if (neg) { // make room in buf for -
1035 n++;
1036 str--;
1037 }
1038
1039 if (n > nbuf-1) return "";
1040
1041 memmove(buf, str, n);
1042 if (neg) {
1043 buf[0] = '-';
1044 }
1045 buf[n] = '\0';
1046 *np = n;
1047 return buf;
1048 }
1049
1050 bool RE2::Arg::parse_long_radix(const char* str,
1051 size_t n,
1052 void* dest,
1053 int radix) {
1054 if (n == 0) return false;
1055 char buf[kMaxNumberLength+1];
1056 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1057 char* end;
1058 errno = 0;
1059 long r = strtol(str, &end, radix);
1060 if (end != str + n) return false; // Leftover junk
1061 if (errno) return false;
1062 if (dest == NULL) return true;
1063 *(reinterpret_cast<long*>(dest)) = r;
1064 return true;
1065 }
1066
1067 bool RE2::Arg::parse_ulong_radix(const char* str,
1068 size_t n,
1069 void* dest,
1070 int radix) {
1071 if (n == 0) return false;
1072 char buf[kMaxNumberLength+1];
1073 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1074 if (str[0] == '-') {
1075 // strtoul() will silently accept negative numbers and parse
1076 // them. This module is more strict and treats them as errors.
1077 return false;
1078 }
1079
1080 char* end;
1081 errno = 0;
1082 unsigned long r = strtoul(str, &end, radix);
1083 if (end != str + n) return false; // Leftover junk
1084 if (errno) return false;
1085 if (dest == NULL) return true;
1086 *(reinterpret_cast<unsigned long*>(dest)) = r;
1087 return true;
1088 }
1089
1090 bool RE2::Arg::parse_short_radix(const char* str,
1091 size_t n,
1092 void* dest,
1093 int radix) {
1094 long r;
1095 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1096 if ((short)r != r) return false; // Out of range
1097 if (dest == NULL) return true;
1098 *(reinterpret_cast<short*>(dest)) = (short)r;
1099 return true;
1100 }
1101
1102 bool RE2::Arg::parse_ushort_radix(const char* str,
1103 size_t n,
1104 void* dest,
1105 int radix) {
1106 unsigned long r;
1107 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1108 if ((unsigned short)r != r) return false; // Out of range
1109 if (dest == NULL) return true;
1110 *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
1111 return true;
1112 }
1113
1114 bool RE2::Arg::parse_int_radix(const char* str,
1115 size_t n,
1116 void* dest,
1117 int radix) {
1118 long r;
1119 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1120 if ((int)r != r) return false; // Out of range
1121 if (dest == NULL) return true;
1122 *(reinterpret_cast<int*>(dest)) = (int)r;
1123 return true;
1124 }
1125
1126 bool RE2::Arg::parse_uint_radix(const char* str,
1127 size_t n,
1128 void* dest,
1129 int radix) {
1130 unsigned long r;
1131 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1132 if ((unsigned int)r != r) return false; // Out of range
1133 if (dest == NULL) return true;
1134 *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
1135 return true;
1136 }
1137
1138 bool RE2::Arg::parse_longlong_radix(const char* str,
1139 size_t n,
1140 void* dest,
1141 int radix) {
1142 if (n == 0) return false;
1143 char buf[kMaxNumberLength+1];
1144 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1145 char* end;
1146 errno = 0;
1147 long long r = strtoll(str, &end, radix);
1148 if (end != str + n) return false; // Leftover junk
1149 if (errno) return false;
1150 if (dest == NULL) return true;
1151 *(reinterpret_cast<long long*>(dest)) = r;
1152 return true;
1153 }
1154
1155 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1156 size_t n,
1157 void* dest,
1158 int radix) {
1159 if (n == 0) return false;
1160 char buf[kMaxNumberLength+1];
1161 str = TerminateNumber(buf, sizeof buf, str, &n, false);
1162 if (str[0] == '-') {
1163 // strtoull() will silently accept negative numbers and parse
1164 // them. This module is more strict and treats them as errors.
1165 return false;
1166 }
1167 char* end;
1168 errno = 0;
1169 unsigned long long r = strtoull(str, &end, radix);
1170 if (end != str + n) return false; // Leftover junk
1171 if (errno) return false;
1172 if (dest == NULL) return true;
1173 *(reinterpret_cast<unsigned long long*>(dest)) = r;
1174 return true;
1175 }
1176
1177 static bool parse_double_float(const char* str, size_t n, bool isfloat,
1178 void* dest) {
1179 if (n == 0) return false;
1180 static const int kMaxLength = 200;
1181 char buf[kMaxLength+1];
1182 str = TerminateNumber(buf, sizeof buf, str, &n, true);
1183 char* end;
1184 errno = 0;
1185 double r;
1186 if (isfloat) {
1187 r = strtof(str, &end);
1188 } else {
1189 r = strtod(str, &end);
1190 }
1191 if (end != str + n) return false; // Leftover junk
1192 if (errno) return false;
1193 if (dest == NULL) return true;
1194 if (isfloat) {
1195 *(reinterpret_cast<float*>(dest)) = (float)r;
1196 } else {
1197 *(reinterpret_cast<double*>(dest)) = r;
1198 }
1199 return true;
1200 }
1201
1202 bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) {
1203 return parse_double_float(str, n, false, dest);
1204 }
1205
1206 bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) {
1207 return parse_double_float(str, n, true, dest);
1208 }
1209
1210 #define DEFINE_INTEGER_PARSER(name) \
1211 bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \
1212 return parse_##name##_radix(str, n, dest, 10); \
1213 } \
1214 bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
1215 return parse_##name##_radix(str, n, dest, 16); \
1216 } \
1217 bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \
1218 return parse_##name##_radix(str, n, dest, 8); \
1219 } \
1220 bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \
1221 void* dest) { \
1222 return parse_##name##_radix(str, n, dest, 0); \
1223 }
1224
1225 DEFINE_INTEGER_PARSER(short);
1226 DEFINE_INTEGER_PARSER(ushort);
1227 DEFINE_INTEGER_PARSER(int);
1228 DEFINE_INTEGER_PARSER(uint);
1229 DEFINE_INTEGER_PARSER(long);
1230 DEFINE_INTEGER_PARSER(ulong);
1231 DEFINE_INTEGER_PARSER(longlong);
1232 DEFINE_INTEGER_PARSER(ulonglong);
1233
1234 #undef DEFINE_INTEGER_PARSER
1235
1236 } // namespace re2
1237