• Home
  • Raw
  • Download

Lines Matching +full:escape +full:- +full:string +full:- +full:regexp

2 // Use of this source code is governed by a BSD-style
7 // The parser is a simple precedence-based parser with a
9 // of the ParseState class. The Regexp::Parse function is
15 // classes. It also allows the empty string as a regular expression
16 // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17 // See regexp.h for rationale.
22 #include <string.h>
25 #include <string>
33 #include "re2/regexp.h"
37 #include "re2/walker-inl.h"
56 // Regexp pointers called the stack. Left parenthesis and vertical
58 // non-standard opcodes.
67 // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
71 class Regexp::ParseState {
87 bool PushRegexp(Regexp* re);
92 // Pushes a regexp with the given op (and no args) onto the stack.
107 // Pushes a repeat operator regexp onto the stack.
112 // Pushes a repetition regexp onto the stack.
116 // Checks whether a particular regexp op is a marker.
130 // Processes the end of input, returning the final regexp.
131 Regexp* DoFinish();
133 // Finishes the regexp if necessary, preparing it for use
136 Regexp* FinishRegexp(Regexp*);
140 // ParseCharClass also manipulates the internals of Regexp
145 bool ParseCharClass(StringPiece* s, Regexp** out_re,
160 // Parse a Perl flag set or non-capturing group from s.
165 // collapsing it into a single regexp on the stack.
169 // collapsing it to a single regexp on the stack.
182 Regexp* stacktop_;
190 // Pseudo-operators - only on parse stack.
194 Regexp::ParseState::ParseState(ParseFlags flags, in ParseState()
206 Regexp::ParseState::~ParseState() { in ~ParseState()
207 Regexp* next; in ~ParseState()
208 for (Regexp* re = stacktop_; re != NULL; re = next) { in ~ParseState()
209 next = re->down_; in ~ParseState()
210 re->down_ = NULL; in ~ParseState()
211 if (re->op() == kLeftParen) in ~ParseState()
212 delete re->name_; in ~ParseState()
213 re->Decref(); in ~ParseState()
217 // Finishes the regexp if necessary, preparing it for use in
220 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { in FinishRegexp()
223 re->down_ = NULL; in FinishRegexp()
225 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { in FinishRegexp()
226 CharClassBuilder* ccb = re->ccb_; in FinishRegexp()
227 re->ccb_ = NULL; in FinishRegexp()
228 re->cc_ = ccb->GetCharClass(); in FinishRegexp()
237 bool Regexp::ParseState::PushRegexp(Regexp* re) { in PushRegexp()
238 MaybeConcatString(-1, NoParseFlags); in PushRegexp()
245 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { in PushRegexp()
246 re->ccb_->RemoveAbove(rune_max_); in PushRegexp()
247 if (re->ccb_->size() == 1) { in PushRegexp()
248 Rune r = re->ccb_->begin()->lo; in PushRegexp()
249 re->Decref(); in PushRegexp()
250 re = new Regexp(kRegexpLiteral, flags_); in PushRegexp()
251 re->rune_ = r; in PushRegexp()
252 } else if (re->ccb_->size() == 2) { in PushRegexp()
253 Rune r = re->ccb_->begin()->lo; in PushRegexp()
254 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { in PushRegexp()
255 re->Decref(); in PushRegexp()
256 re = new Regexp(kRegexpLiteral, flags_ | FoldCase); in PushRegexp()
257 re->rune_ = r + 'a' - 'A'; in PushRegexp()
262 if (!IsMarker(re->op())) in PushRegexp()
263 re->simple_ = re->ComputeSimple(); in PushRegexp()
264 re->down_ = stacktop_; in PushRegexp()
270 // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
284 n -= m+1; in LookupCaseFold()
301 switch (f->delta) { in ApplyFold()
303 return r + f->delta; in ApplyFold()
305 case EvenOddSkip: // even <-> odd but only applies to every other in ApplyFold()
306 if ((r - f->lo) % 2) in ApplyFold()
309 case EvenOdd: // even <-> odd in ApplyFold()
312 return r - 1; in ApplyFold()
314 case OddEvenSkip: // odd <-> even but only applies to every other in ApplyFold()
315 if ((r - f->lo) % 2) in ApplyFold()
318 case OddEven: // odd <-> even in ApplyFold()
321 return r - 1; in ApplyFold()
337 if (f == NULL || r < f->lo) in CycleFoldRune()
342 // Add lo-hi to the class, along with their fold-equivalent characters.
343 // If lo-hi is already in the class, assume that the fold-equivalent
349 // the cycles are not too long, and we double-check here using depth. in AddFoldedRange()
355 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done in AddFoldedRange()
362 if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo in AddFoldedRange()
363 lo = f->lo; in AddFoldedRange()
367 // Add in the result of folding the range lo - f->hi in AddFoldedRange()
370 Rune hi1 = std::min<Rune>(hi, f->hi); in AddFoldedRange()
371 switch (f->delta) { in AddFoldedRange()
373 lo1 += f->delta; in AddFoldedRange()
374 hi1 += f->delta; in AddFoldedRange()
378 lo1--; in AddFoldedRange()
384 lo1--; in AddFoldedRange()
392 lo = f->hi + 1; in AddFoldedRange()
397 bool Regexp::ParseState::PushLiteral(Rune r) { in PushLiteral()
400 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); in PushLiteral()
401 re->ccb_ = new CharClassBuilder; in PushLiteral()
405 re->ccb_->AddRange(r, r); in PushLiteral()
414 return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); in PushLiteral()
420 Regexp* re = new Regexp(kRegexpLiteral, flags_); in PushLiteral()
421 re->rune_ = r; in PushLiteral()
426 bool Regexp::ParseState::PushCarat() { in PushCarat()
434 bool Regexp::ParseState::PushWordBoundary(bool word) { in PushWordBoundary()
441 bool Regexp::ParseState::PushDollar() { in PushDollar()
445 Regexp::ParseFlags oflags = flags_; in PushDollar()
455 bool Regexp::ParseState::PushDot() { in PushDot()
459 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); in PushDot()
460 re->ccb_ = new CharClassBuilder; in PushDot()
461 re->ccb_->AddRange(0, '\n' - 1); in PushDot()
462 re->ccb_->AddRange('\n' + 1, rune_max_); in PushDot()
466 // Pushes a regexp with the given op (and no args) onto the stack.
467 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { in PushSimpleOp()
468 Regexp* re = new Regexp(op, flags_); in PushSimpleOp()
472 // Pushes a repeat operator regexp onto the stack.
475 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, in PushRepeatOp()
477 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { in PushRepeatOp()
478 status_->set_code(kRegexpRepeatArgument); in PushRepeatOp()
479 status_->set_error_arg(s); in PushRepeatOp()
482 Regexp::ParseFlags fl = flags_; in PushRepeatOp()
486 // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but in PushRepeatOp()
488 if (op == stacktop_->op() && fl == stacktop_->parse_flags()) in PushRepeatOp()
492 // op is a repeat, we just have to check that stacktop_->op() is too, in PushRepeatOp()
494 if ((stacktop_->op() == kRegexpStar || in PushRepeatOp()
495 stacktop_->op() == kRegexpPlus || in PushRepeatOp()
496 stacktop_->op() == kRegexpQuest) && in PushRepeatOp()
497 fl == stacktop_->parse_flags()) { in PushRepeatOp()
498 stacktop_->op_ = kRegexpStar; in PushRepeatOp()
502 Regexp* re = new Regexp(op, fl); in PushRepeatOp()
503 re->AllocSub(1); in PushRepeatOp()
504 re->down_ = stacktop_->down_; in PushRepeatOp()
505 re->sub()[0] = FinishRegexp(stacktop_); in PushRepeatOp()
506 re->simple_ = re->ComputeSimple(); in PushRepeatOp()
511 // RepetitionWalker reports whether the repetition regexp is valid.
512 // Valid means that the combination of the top-level repetition
515 // This rewalks the regexp tree and is called for every repetition,
520 class RepetitionWalker : public Regexp::Walker<int> {
523 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
524 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
526 virtual int ShortVisit(Regexp* re, int parent_arg);
533 int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { in PreVisit()
535 if (re->op() == kRegexpRepeat) { in PreVisit()
536 int m = re->max(); in PreVisit()
538 m = re->min(); in PreVisit()
547 int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, in PostVisit()
558 int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { in ShortVisit()
565 // Pushes a repetition regexp onto the stack.
567 bool Regexp::ParseState::PushRepetition(int min, int max, in PushRepetition()
570 if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) { in PushRepetition()
571 status_->set_code(kRegexpRepeatSize); in PushRepetition()
572 status_->set_error_arg(s); in PushRepetition()
575 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { in PushRepetition()
576 status_->set_code(kRegexpRepeatArgument); in PushRepetition()
577 status_->set_error_arg(s); in PushRepetition()
580 Regexp::ParseFlags fl = flags_; in PushRepetition()
583 Regexp* re = new Regexp(kRegexpRepeat, fl); in PushRepetition()
584 re->min_ = min; in PushRepetition()
585 re->max_ = max; in PushRepetition()
586 re->AllocSub(1); in PushRepetition()
587 re->down_ = stacktop_->down_; in PushRepetition()
588 re->sub()[0] = FinishRegexp(stacktop_); in PushRepetition()
589 re->simple_ = re->ComputeSimple(); in PushRepetition()
594 status_->set_code(kRegexpRepeatSize); in PushRepetition()
595 status_->set_error_arg(s); in PushRepetition()
602 // Checks whether a particular regexp op is a marker.
603 bool Regexp::ParseState::IsMarker(RegexpOp op) { in IsMarker()
609 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { in DoLeftParen()
610 Regexp* re = new Regexp(kLeftParen, flags_); in DoLeftParen()
611 re->cap_ = ++ncap_; in DoLeftParen()
613 re->name_ = new string(name); in DoLeftParen()
617 // Pushes a non-capturing marker onto the stack.
618 bool Regexp::ParseState::DoLeftParenNoCapture() { in DoLeftParenNoCapture()
619 Regexp* re = new Regexp(kLeftParen, flags_); in DoLeftParenNoCapture()
620 re->cap_ = -1; in DoLeftParenNoCapture()
625 bool Regexp::ParseState::DoVerticalBar() { in DoVerticalBar()
626 MaybeConcatString(-1, NoParseFlags); in DoVerticalBar()
634 Regexp* r1; in DoVerticalBar()
635 Regexp* r2; in DoVerticalBar()
637 (r2 = r1->down_) != NULL && in DoVerticalBar()
638 r2->op() == kVerticalBar) { in DoVerticalBar()
639 Regexp* r3; in DoVerticalBar()
640 if ((r3 = r2->down_) != NULL && in DoVerticalBar()
641 (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) { in DoVerticalBar()
644 if (r3->op() == kRegexpAnyChar && in DoVerticalBar()
645 (r1->op() == kRegexpLiteral || in DoVerticalBar()
646 r1->op() == kRegexpCharClass || in DoVerticalBar()
647 r1->op() == kRegexpAnyChar)) { in DoVerticalBar()
650 r1->Decref(); in DoVerticalBar()
653 if (r1->op() == kRegexpAnyChar && in DoVerticalBar()
654 (r3->op() == kRegexpLiteral || in DoVerticalBar()
655 r3->op() == kRegexpCharClass || in DoVerticalBar()
656 r3->op() == kRegexpAnyChar)) { in DoVerticalBar()
658 r1->down_ = r3->down_; in DoVerticalBar()
659 r2->down_ = r1; in DoVerticalBar()
661 r3->Decref(); in DoVerticalBar()
666 r1->down_ = r2->down_; in DoVerticalBar()
667 r2->down_ = r1; in DoVerticalBar()
675 bool Regexp::ParseState::DoRightParen() { in DoRightParen()
679 // The stack should be: LeftParen regexp in DoRightParen()
680 // Remove the LeftParen, leaving the regexp, in DoRightParen()
682 Regexp* r1; in DoRightParen()
683 Regexp* r2; in DoRightParen()
685 (r2 = r1->down_) == NULL || in DoRightParen()
686 r2->op() != kLeftParen) { in DoRightParen()
687 status_->set_code(kRegexpMissingParen); in DoRightParen()
688 status_->set_error_arg(whole_regexp_); in DoRightParen()
693 stacktop_ = r2->down_; in DoRightParen()
696 Regexp* re = r2; in DoRightParen()
697 flags_ = re->parse_flags(); in DoRightParen()
700 if (re->cap_ > 0) { in DoRightParen()
701 re->op_ = kRegexpCapture; in DoRightParen()
702 // re->cap_ is already set in DoRightParen()
703 re->AllocSub(1); in DoRightParen()
704 re->sub()[0] = FinishRegexp(r1); in DoRightParen()
705 re->simple_ = re->ComputeSimple(); in DoRightParen()
707 re->Decref(); in DoRightParen()
713 // Processes the end of input, returning the final regexp.
714 Regexp* Regexp::ParseState::DoFinish() { in DoFinish()
716 Regexp* re = stacktop_; in DoFinish()
717 if (re != NULL && re->down_ != NULL) { in DoFinish()
718 status_->set_code(kRegexpMissingParen); in DoFinish()
719 status_->set_error_arg(whole_regexp_); in DoFinish()
726 // Returns the leading regexp that re starts with.
727 // The returned Regexp* points into a piece of re,
728 // so it must not be used after the caller calls re->Decref().
729 Regexp* Regexp::LeadingRegexp(Regexp* re) { in LeadingRegexp()
730 if (re->op() == kRegexpEmptyMatch) in LeadingRegexp()
732 if (re->op() == kRegexpConcat && re->nsub() >= 2) { in LeadingRegexp()
733 Regexp** sub = re->sub(); in LeadingRegexp()
734 if (sub[0]->op() == kRegexpEmptyMatch) in LeadingRegexp()
745 Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { in RemoveLeadingRegexp()
746 if (re->op() == kRegexpEmptyMatch) in RemoveLeadingRegexp()
748 if (re->op() == kRegexpConcat && re->nsub() >= 2) { in RemoveLeadingRegexp()
749 Regexp** sub = re->sub(); in RemoveLeadingRegexp()
750 if (sub[0]->op() == kRegexpEmptyMatch) in RemoveLeadingRegexp()
752 sub[0]->Decref(); in RemoveLeadingRegexp()
754 if (re->nsub() == 2) { in RemoveLeadingRegexp()
755 // Collapse concatenation to single regexp. in RemoveLeadingRegexp()
756 Regexp* nre = sub[1]; in RemoveLeadingRegexp()
758 re->Decref(); in RemoveLeadingRegexp()
761 // 3 or more -> 2 or more. in RemoveLeadingRegexp()
762 re->nsub_--; in RemoveLeadingRegexp()
763 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); in RemoveLeadingRegexp()
766 Regexp::ParseFlags pf = re->parse_flags(); in RemoveLeadingRegexp()
767 re->Decref(); in RemoveLeadingRegexp()
768 return new Regexp(kRegexpEmptyMatch, pf); in RemoveLeadingRegexp()
771 // Returns the leading string that re starts with.
773 // so it must not be used after the caller calls re->Decref().
774 Rune* Regexp::LeadingString(Regexp* re, int *nrune, in LeadingString()
775 Regexp::ParseFlags *flags) { in LeadingString()
776 while (re->op() == kRegexpConcat && re->nsub() > 0) in LeadingString()
777 re = re->sub()[0]; in LeadingString()
779 *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); in LeadingString()
781 if (re->op() == kRegexpLiteral) { in LeadingString()
783 return &re->rune_; in LeadingString()
786 if (re->op() == kRegexpLiteralString) { in LeadingString()
787 *nrune = re->nrunes_; in LeadingString()
788 return re->runes_; in LeadingString()
797 void Regexp::RemoveLeadingString(Regexp* re, int n) { in RemoveLeadingString()
798 // Chase down concats to find first string. in RemoveLeadingString()
800 // flattened except when doing so would overflow the 16-bit in RemoveLeadingString()
803 Regexp* stk[4]; in RemoveLeadingString()
805 while (re->op() == kRegexpConcat) { in RemoveLeadingString()
808 re = re->sub()[0]; in RemoveLeadingString()
811 // Remove leading string from re. in RemoveLeadingString()
812 if (re->op() == kRegexpLiteral) { in RemoveLeadingString()
813 re->rune_ = 0; in RemoveLeadingString()
814 re->op_ = kRegexpEmptyMatch; in RemoveLeadingString()
815 } else if (re->op() == kRegexpLiteralString) { in RemoveLeadingString()
816 if (n >= re->nrunes_) { in RemoveLeadingString()
817 delete[] re->runes_; in RemoveLeadingString()
818 re->runes_ = NULL; in RemoveLeadingString()
819 re->nrunes_ = 0; in RemoveLeadingString()
820 re->op_ = kRegexpEmptyMatch; in RemoveLeadingString()
821 } else if (n == re->nrunes_ - 1) { in RemoveLeadingString()
822 Rune rune = re->runes_[re->nrunes_ - 1]; in RemoveLeadingString()
823 delete[] re->runes_; in RemoveLeadingString()
824 re->runes_ = NULL; in RemoveLeadingString()
825 re->nrunes_ = 0; in RemoveLeadingString()
826 re->rune_ = rune; in RemoveLeadingString()
827 re->op_ = kRegexpLiteral; in RemoveLeadingString()
829 re->nrunes_ -= n; in RemoveLeadingString()
830 memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); in RemoveLeadingString()
835 while (d-- > 0) { in RemoveLeadingString()
837 Regexp** sub = re->sub(); in RemoveLeadingString()
838 if (sub[0]->op() == kRegexpEmptyMatch) { in RemoveLeadingString()
839 sub[0]->Decref(); in RemoveLeadingString()
842 switch (re->nsub()) { in RemoveLeadingString()
846 LOG(DFATAL) << "Concat of " << re->nsub(); in RemoveLeadingString()
847 re->submany_ = NULL; in RemoveLeadingString()
848 re->op_ = kRegexpEmptyMatch; in RemoveLeadingString()
853 Regexp* old = sub[1]; in RemoveLeadingString()
855 re->Swap(old); in RemoveLeadingString()
856 old->Decref(); in RemoveLeadingString()
862 re->nsub_--; in RemoveLeadingString()
863 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); in RemoveLeadingString()
877 Splice(Regexp* prefix, Regexp** sub, int nsub) in Splice()
881 nsuffix(-1) {} in Splice()
883 Regexp* prefix;
884 Regexp** sub;
894 Frame(Regexp** sub, int nsub) in Frame()
899 Regexp** sub;
906 // Bundled into a class for friend access to Regexp without needing to declare
907 // (or define) Splice in regexp.h.
910 static void Round1(Regexp** sub, int nsub,
911 Regexp::ParseFlags flags,
913 static void Round2(Regexp** sub, int nsub,
914 Regexp::ParseFlags flags,
916 static void Round3(Regexp** sub, int nsub,
917 Regexp::ParseFlags flags,
932 int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { in FactorAlternation()
957 while (sub + i < iter->sub) in FactorAlternation()
963 Regexp* re[2]; in FactorAlternation()
964 re[0] = iter->prefix; in FactorAlternation()
965 re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags); in FactorAlternation()
966 sub[out++] = Regexp::Concat(re, 2, flags); in FactorAlternation()
967 i += iter->nsub; in FactorAlternation()
972 sub[out++] = iter->prefix; in FactorAlternation()
973 i += iter->nsub; in FactorAlternation()
1028 void FactorAlternationImpl::Round1(Regexp** sub, int nsub, in Round1()
1029 Regexp::ParseFlags flags, in Round1()
1035 Regexp::ParseFlags runeflags = Regexp::NoParseFlags; in Round1()
1041 Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; in Round1()
1043 rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i); in Round1()
1056 // Found end of a run with common leading literal string: in Round1()
1060 // Nothing to do - first iteration. in Round1()
1064 Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); in Round1()
1066 Regexp::RemoveLeadingString(sub[j], nrune); in Round1()
1067 splices->emplace_back(prefix, sub + start, i - start); in Round1()
1080 void FactorAlternationImpl::Round2(Regexp** sub, int nsub, in Round2()
1081 Regexp::ParseFlags flags, in Round2()
1092 Regexp* first = NULL; in Round2()
1096 Regexp* first_i = NULL; in Round2()
1098 first_i = Regexp::LeadingRegexp(sub[i]); in Round2()
1100 // first must be an empty-width op in Round2()
1103 (first->op() == kRegexpBeginLine || in Round2()
1104 first->op() == kRegexpEndLine || in Round2()
1105 first->op() == kRegexpWordBoundary || in Round2()
1106 first->op() == kRegexpNoWordBoundary || in Round2()
1107 first->op() == kRegexpBeginText || in Round2()
1108 first->op() == kRegexpEndText || in Round2()
1109 first->op() == kRegexpCharClass || in Round2()
1110 first->op() == kRegexpAnyChar || in Round2()
1111 first->op() == kRegexpAnyByte || in Round2()
1112 (first->op() == kRegexpRepeat && in Round2()
1113 first->min() == first->max() && in Round2()
1114 (first->sub()[0]->op() == kRegexpLiteral || in Round2()
1115 first->sub()[0]->op() == kRegexpCharClass || in Round2()
1116 first->sub()[0]->op() == kRegexpAnyChar || in Round2()
1117 first->sub()[0]->op() == kRegexpAnyByte))) && in Round2()
1118 Regexp::Equal(first, first_i)) in Round2()
1122 // Found end of a run with common leading regexp: in Round2()
1126 // Nothing to do - first iteration. in Round2()
1130 Regexp* prefix = first->Incref(); in Round2()
1132 sub[j] = Regexp::RemoveLeadingRegexp(sub[j]); in Round2()
1133 splices->emplace_back(prefix, sub + start, i - start); in Round2()
1144 void FactorAlternationImpl::Round3(Regexp** sub, int nsub, in Round3()
1145 Regexp::ParseFlags flags, in Round3()
1149 Regexp* first = NULL; in Round3()
1153 Regexp* first_i = NULL; in Round3()
1157 (first->op() == kRegexpLiteral || in Round3()
1158 first->op() == kRegexpCharClass) && in Round3()
1159 (first_i->op() == kRegexpLiteral || in Round3()
1160 first_i->op() == kRegexpCharClass)) in Round3()
1168 // Nothing to do - first iteration. in Round3()
1174 Regexp* re = sub[j]; in Round3()
1175 if (re->op() == kRegexpCharClass) { in Round3()
1176 CharClass* cc = re->cc(); in Round3()
1177 for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) in Round3()
1178 ccb.AddRange(it->lo, it->hi); in Round3()
1179 } else if (re->op() == kRegexpLiteral) { in Round3()
1180 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); in Round3()
1182 LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " in Round3()
1183 << re->ToString(); in Round3()
1185 re->Decref(); in Round3()
1187 Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); in Round3()
1188 splices->emplace_back(re, sub + start, i - start); in Round3()
1202 void Regexp::ParseState::DoCollapse(RegexpOp op) { in DoCollapse()
1205 Regexp* next = NULL; in DoCollapse()
1206 Regexp* sub; in DoCollapse()
1207 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { in DoCollapse()
1208 next = sub->down_; in DoCollapse()
1209 if (sub->op_ == op) in DoCollapse()
1210 n += sub->nsub_; in DoCollapse()
1217 if (stacktop_ != NULL && stacktop_->down_ == next) in DoCollapse()
1221 PODArray<Regexp*> subs(n); in DoCollapse()
1224 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { in DoCollapse()
1225 next = sub->down_; in DoCollapse()
1226 if (sub->op_ == op) { in DoCollapse()
1227 Regexp** sub_subs = sub->sub(); in DoCollapse()
1228 for (int k = sub->nsub_ - 1; k >= 0; k--) in DoCollapse()
1229 subs[--i] = sub_subs[k]->Incref(); in DoCollapse()
1230 sub->Decref(); in DoCollapse()
1232 subs[--i] = FinishRegexp(sub); in DoCollapse()
1236 Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true); in DoCollapse()
1237 re->simple_ = re->ComputeSimple(); in DoCollapse()
1238 re->down_ = next; in DoCollapse()
1243 // collapsing it into a single regexp on the stack.
1244 void Regexp::ParseState::DoConcatenation() { in DoConcatenation()
1245 Regexp* r1 = stacktop_; in DoConcatenation()
1246 if (r1 == NULL || IsMarker(r1->op())) { in DoConcatenation()
1248 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); in DoConcatenation()
1255 // collapsing it to a single regexp on the stack.
1256 void Regexp::ParseState::DoAlternation() { in DoAlternation()
1259 Regexp* r1 = stacktop_; in DoAlternation()
1260 stacktop_ = r1->down_; in DoAlternation()
1261 r1->Decref(); in DoAlternation()
1266 // If top two elements on stack are both literal or string,
1267 // collapse into single string.
1268 // Don't walk down the stack -- the parser calls this frequently
1270 // Only called when another regexp is about to be pushed
1275 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { in MaybeConcatString()
1276 Regexp* re1; in MaybeConcatString()
1277 Regexp* re2; in MaybeConcatString()
1278 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) in MaybeConcatString()
1281 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) in MaybeConcatString()
1283 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) in MaybeConcatString()
1285 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) in MaybeConcatString()
1288 if (re2->op_ == kRegexpLiteral) { in MaybeConcatString()
1289 // convert into string in MaybeConcatString()
1290 Rune rune = re2->rune_; in MaybeConcatString()
1291 re2->op_ = kRegexpLiteralString; in MaybeConcatString()
1292 re2->nrunes_ = 0; in MaybeConcatString()
1293 re2->runes_ = NULL; in MaybeConcatString()
1294 re2->AddRuneToString(rune); in MaybeConcatString()
1298 if (re1->op_ == kRegexpLiteral) { in MaybeConcatString()
1299 re2->AddRuneToString(re1->rune_); in MaybeConcatString()
1301 for (int i = 0; i < re1->nrunes_; i++) in MaybeConcatString()
1302 re2->AddRuneToString(re1->runes_[i]); in MaybeConcatString()
1303 re1->nrunes_ = 0; in MaybeConcatString()
1304 delete[] re1->runes_; in MaybeConcatString()
1305 re1->runes_ = NULL; in MaybeConcatString()
1310 re1->op_ = kRegexpLiteral; in MaybeConcatString()
1311 re1->rune_ = r; in MaybeConcatString()
1312 re1->parse_flags_ = static_cast<uint16_t>(flags); in MaybeConcatString()
1317 re1->Decref(); in MaybeConcatString()
1324 // Sets *s to span the remainder of the string.
1326 if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) in ParseInteger()
1329 if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) in ParseInteger()
1333 while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { in ParseInteger()
1337 n = n*10 + c - '0'; in ParseInteger()
1338 s->remove_prefix(1); // digit in ParseInteger()
1345 // Sets *s to span the remainder of the string on success.
1348 // sets *hi to -1 to signify this.
1350 // The Maybe in the name signifies that the regexp parse
1368 *hi = -1; in MaybeParseRepetition()
1393 if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) { in StringPieceToRune()
1394 int n = chartorune(r, sp->data()); in StringPieceToRune()
1404 sp->remove_prefix(n); in StringPieceToRune()
1409 status->set_code(kRegexpBadUTF8); in StringPieceToRune()
1410 status->set_error_arg(StringPiece()); in StringPieceToRune()
1411 return -1; in StringPieceToRune()
1414 // Return whether name is valid UTF-8.
1436 return c - '0'; in UnHex()
1438 return c - 'A' + 10; in UnHex()
1440 return c - 'a' + 10; in UnHex()
1445 // Parse an escape sequence (e.g., \n, \{).
1446 // Sets *s to span the remainder of the string.
1450 const char* begin = s->begin(); in ParseEscape()
1451 if (s->size() < 1 || (*s)[0] != '\\') { in ParseEscape()
1452 // Should not happen - caller always checks. in ParseEscape()
1453 status->set_code(kRegexpInternalError); in ParseEscape()
1454 status->set_error_arg(StringPiece()); in ParseEscape()
1457 if (s->size() < 2) { in ParseEscape()
1458 status->set_code(kRegexpTrailingBackslash); in ParseEscape()
1459 status->set_error_arg(StringPiece()); in ParseEscape()
1463 s->remove_prefix(1); // backslash in ParseEscape()
1470 // Escaped non-word characters are always themselves. in ParseEscape()
1487 // Single non-zero octal digit is a backreference; not supported. in ParseEscape()
1488 if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') in ParseEscape()
1493 code = c - '0'; in ParseEscape()
1494 if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { in ParseEscape()
1495 code = code * 8 + c - '0'; in ParseEscape()
1496 s->remove_prefix(1); // digit in ParseEscape()
1497 if (s->size() > 0) { in ParseEscape()
1500 code = code * 8 + c - '0'; in ParseEscape()
1501 s->remove_prefix(1); // digit in ParseEscape()
1512 if (s->size() == 0) in ParseEscape()
1518 // Update n as we consume the string, so that in ParseEscape()
1521 // after the first non-hex digit. We require only hex digits, in ParseEscape()
1532 if (s->size() == 0) in ParseEscape()
1543 if (s->size() == 0) in ParseEscape()
1575 // the Perl word-boundary \b as a backspace in ParseEscape()
1576 // when in POSIX regexp mode. Surprisingly, in ParseEscape()
1577 // in Perl, \b means word-boundary but [\b] in ParseEscape()
1590 // Unrecognized escape sequence. in ParseEscape()
1591 status->set_code(kRegexpBadEscape); in ParseEscape()
1592 status->set_error_arg( in ParseEscape()
1593 StringPiece(begin, static_cast<size_t>(s->begin() - begin))); in ParseEscape()
1600 Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { in AddRangeFlags()
1603 bool cutnl = !(parse_flags & Regexp::ClassNL) || in AddRangeFlags()
1604 (parse_flags & Regexp::NeverNL); in AddRangeFlags()
1607 AddRangeFlags(lo, '\n' - 1, parse_flags); in AddRangeFlags()
1613 // If folding case, add fold-equivalent characters too. in AddRangeFlags()
1614 if (parse_flags & Regexp::FoldCase) in AddRangeFlags()
1656 Regexp::ParseFlags parse_flags) { in AddUGroup()
1658 for (int i = 0; i < g->nr16; i++) { in AddUGroup()
1659 cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); in AddUGroup()
1661 for (int i = 0; i < g->nr32; i++) { in AddUGroup()
1662 cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); in AddUGroup()
1665 if (parse_flags & Regexp::FoldCase) { in AddUGroup()
1666 // Normally adding a case-folded group means in AddUGroup()
1667 // adding all the extra fold-equivalent runes too. in AddUGroup()
1669 // we have to exclude all the runes that are fold-equivalent in AddUGroup()
1675 bool cutnl = !(parse_flags & Regexp::ClassNL) || in AddUGroup()
1676 (parse_flags & Regexp::NeverNL); in AddUGroup()
1681 cc->AddCharClass(&ccb1); in AddUGroup()
1685 for (int i = 0; i < g->nr16; i++) { in AddUGroup()
1686 if (next < g->r16[i].lo) in AddUGroup()
1687 cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); in AddUGroup()
1688 next = g->r16[i].hi + 1; in AddUGroup()
1690 for (int i = 0; i < g->nr32; i++) { in AddUGroup()
1691 if (next < g->r32[i].lo) in AddUGroup()
1692 cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); in AddUGroup()
1693 next = g->r32[i].hi + 1; in AddUGroup()
1696 cc->AddRangeFlags(next, Runemax, parse_flags); in AddUGroup()
1700 // Maybe parse a Perl character class escape sequence.
1702 // not the Perl empty-string classes (\b \B \A \Z \z).
1703 // On success, sets *s to span the remainder of the string
1706 const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { in MaybeParsePerlCCEscape()
1707 if (!(parse_flags & Regexp::PerlClasses)) in MaybeParsePerlCCEscape()
1709 if (s->size() < 2 || (*s)[0] != '\\') in MaybeParsePerlCCEscape()
1712 // any non-ASCII Perl group names. in MaybeParsePerlCCEscape()
1713 StringPiece name(s->begin(), 2); in MaybeParsePerlCCEscape()
1717 s->remove_prefix(name.size()); in MaybeParsePerlCCEscape()
1729 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, in ParseUnicodeGroup()
1733 if (!(parse_flags & Regexp::UnicodeGroups)) in ParseUnicodeGroup()
1735 if (s->size() < 2 || (*s)[0] != '\\') in ParseUnicodeGroup()
1742 int sign = +1; // -1 = negated char class in ParseUnicodeGroup()
1744 sign = -sign; in ParseUnicodeGroup()
1747 s->remove_prefix(2); // '\\', 'p' in ParseUnicodeGroup()
1752 // Name is the bit of string we just skipped over for c. in ParseUnicodeGroup()
1754 name = StringPiece(p, static_cast<size_t>(s->begin() - p)); in ParseUnicodeGroup()
1757 size_t end = s->find('}', 0); in ParseUnicodeGroup()
1761 status->set_code(kRegexpBadCharRange); in ParseUnicodeGroup()
1762 status->set_error_arg(seq); in ParseUnicodeGroup()
1765 name = StringPiece(s->begin(), end); // without '}' in ParseUnicodeGroup()
1766 s->remove_prefix(end + 1); // with '}' in ParseUnicodeGroup()
1772 seq = StringPiece(seq.begin(), static_cast<size_t>(s->begin() - seq.begin())); in ParseUnicodeGroup()
1775 sign = -sign; in ParseUnicodeGroup()
1783 status->set_code(kRegexpBadCharRange); in ParseUnicodeGroup()
1784 status->set_error_arg(seq); in ParseUnicodeGroup()
1793 string("\\p{") + string(name) + string("}")); in ParseUnicodeGroup()
1797 status->set_code(kRegexpBadCharRange); in ParseUnicodeGroup()
1798 status->set_error_arg(seq); in ParseUnicodeGroup()
1818 // Sets *s to span the remainder of the string.
1820 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, in ParseCCName()
1824 const char* p = s->data(); in ParseCCName()
1825 const char* ep = s->data() + s->size(); in ParseCCName()
1826 if (ep - p < 2 || p[0] != '[' || p[1] != ':') in ParseCCName()
1831 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) in ParseCCName()
1835 if (q > ep-2) in ParseCCName()
1840 StringPiece name(p, static_cast<size_t>(q - p)); in ParseCCName()
1844 status->set_code(kRegexpBadCharRange); in ParseCCName()
1845 status->set_error_arg(name); in ParseCCName()
1849 s->remove_prefix(name.size()); in ParseCCName()
1850 AddUGroup(cc, g, g->sign, parse_flags); in ParseCCName()
1855 // There are fewer special characters here than in the rest of the regexp.
1856 // Sets *s to span the remainder of the string.
1858 bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, in ParseCCCharacter()
1861 if (s->size() == 0) { in ParseCCCharacter()
1862 status->set_code(kRegexpMissingBracket); in ParseCCCharacter()
1863 status->set_error_arg(whole_class); in ParseCCCharacter()
1867 // Allow regular escape sequences even though in ParseCCCharacter()
1869 if (s->size() >= 1 && (*s)[0] == '\\') in ParseCCCharacter()
1878 // For single characters, rr->lo == rr->hi.
1879 // Sets *s to span the remainder of the string.
1881 bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, in ParseCCRange()
1885 if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) in ParseCCRange()
1887 // [a-] means (a|-), so check for final ]. in ParseCCRange()
1888 if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { in ParseCCRange()
1889 s->remove_prefix(1); // '-' in ParseCCRange()
1890 if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) in ParseCCRange()
1892 if (rr->hi < rr->lo) { in ParseCCRange()
1893 status->set_code(kRegexpBadCharRange); in ParseCCRange()
1894 status->set_error_arg( in ParseCCRange()
1895 StringPiece(os.data(), static_cast<size_t>(s->data() - os.data()))); in ParseCCRange()
1899 rr->hi = rr->lo; in ParseCCRange()
1904 // Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1905 // Sets *s to span the remainder of the string.
1906 // Sets *out_re to the regexp for the class.
1907 bool Regexp::ParseState::ParseCharClass(StringPiece* s, in ParseCharClass()
1908 Regexp** out_re, in ParseCharClass()
1911 if (s->size() == 0 || (*s)[0] != '[') { in ParseCharClass()
1913 status->set_code(kRegexpInternalError); in ParseCharClass()
1914 status->set_error_arg(StringPiece()); in ParseCharClass()
1918 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); in ParseCharClass()
1919 re->ccb_ = new CharClassBuilder; in ParseCharClass()
1920 s->remove_prefix(1); // '[' in ParseCharClass()
1921 if (s->size() > 0 && (*s)[0] == '^') { in ParseCharClass()
1922 s->remove_prefix(1); // '^' in ParseCharClass()
1927 re->ccb_->AddRange('\n', '\n'); in ParseCharClass()
1931 while (s->size() > 0 && ((*s)[0] != ']' || first)) { in ParseCharClass()
1932 // - is only okay unescaped as first or last in class. in ParseCharClass()
1933 // Except that Perl allows - anywhere. in ParseCharClass()
1934 if ((*s)[0] == '-' && !first && !(flags_&PerlX) && in ParseCharClass()
1935 (s->size() == 1 || (*s)[1] != ']')) { in ParseCharClass()
1937 t.remove_prefix(1); // '-' in ParseCharClass()
1941 re->Decref(); in ParseCharClass()
1944 status->set_code(kRegexpBadCharRange); in ParseCharClass()
1945 status->set_error_arg(StringPiece(s->data(), 1+n)); in ParseCharClass()
1946 re->Decref(); in ParseCharClass()
1952 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { in ParseCharClass()
1953 switch (ParseCCName(s, flags_, re->ccb_, status)) { in ParseCharClass()
1957 re->Decref(); in ParseCharClass()
1965 if (s->size() > 2 && in ParseCharClass()
1968 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { in ParseCharClass()
1972 re->Decref(); in ParseCharClass()
1982 AddUGroup(re->ccb_, g, g->sign, flags_); in ParseCharClass()
1989 re->Decref(); in ParseCharClass()
1994 // Regexp::ClassNL is set. In an explicit range or singleton in ParseCharClass()
1997 re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); in ParseCharClass()
1999 if (s->size() == 0) { in ParseCharClass()
2000 status->set_code(kRegexpMissingBracket); in ParseCharClass()
2001 status->set_error_arg(whole_class); in ParseCharClass()
2002 re->Decref(); in ParseCharClass()
2005 s->remove_prefix(1); // ']' in ParseCharClass()
2008 re->ccb_->Negate(); in ParseCharClass()
2014 // Is this a valid capture name? [A-Za-z0-9_]+
2033 // Parses a Perl flag setting or non-capturing group or both,
2037 // well-formed or not supported, sets status_ and returns false.
2038 bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { in ParsePerlFlags()
2044 status_->set_code(kRegexpInternalError); in ParsePerlFlags()
2050 // Check for named captures, first introduced in Python's regexp library. in ParsePerlFlags()
2071 status_->set_code(kRegexpBadNamedCapture); in ParsePerlFlags()
2072 status_->set_error_arg(*s); in ParsePerlFlags()
2077 StringPiece capture(t.begin()-2, end+3); // "(?P<name>" in ParsePerlFlags()
2078 StringPiece name(t.begin()+2, end-2); // "name" in ParsePerlFlags()
2082 status_->set_code(kRegexpBadNamedCapture); in ParsePerlFlags()
2083 status_->set_error_arg(capture); in ParsePerlFlags()
2092 s->remove_prefix(static_cast<size_t>(capture.end() - s->begin())); in ParsePerlFlags()
2143 case '-': in ParsePerlFlags()
2169 flags_ = static_cast<Regexp::ParseFlags>(nflags); in ParsePerlFlags()
2174 status_->set_code(kRegexpBadPerlOp); in ParsePerlFlags()
2175 status_->set_error_arg( in ParsePerlFlags()
2176 StringPiece(s->begin(), static_cast<size_t>(t.begin() - s->begin()))); in ParsePerlFlags()
2181 // into UTF8 encoding in string.
2183 // deprecated and because it rejects code points 0x80-0x9F.
2184 void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { in ConvertLatin1ToUTF8()
2187 utf->clear(); in ConvertLatin1ToUTF8()
2191 utf->append(buf, n); in ConvertLatin1ToUTF8()
2196 // returning the corresponding Regexp tree.
2199 Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, in Parse()
2201 // Make status non-NULL (easier on everyone else). in Parse()
2209 // Convert regexp to UTF-8 (easier on the rest of the parser). in Parse()
2211 string* tmp = new string; in Parse()
2213 status->set_tmp(tmp); in Parse()
2218 // Special parse loop for literal string. in Parse()
2243 // "(?" introduces Perl escape. in Parse()
2245 // Flag changes and non-capturing groups. in Parse()
2291 Regexp* re; in Parse()
2320 // a** is a syntax error, not a double-star. in Parse()
2322 status->set_code(kRegexpRepeatOp); in Parse()
2323 status->set_error_arg(StringPiece( in Parse()
2325 static_cast<size_t>(t.begin() - lastunary.begin()))); in Parse()
2330 static_cast<size_t>(t.data() - opstr.data())); in Parse()
2355 status->set_code(kRegexpRepeatOp); in Parse()
2356 status->set_error_arg(StringPiece( in Parse()
2358 static_cast<size_t>(t.begin() - lastunary.begin()))); in Parse()
2363 static_cast<size_t>(t.data() - opstr.data())); in Parse()
2372 if ((ps.flags() & Regexp::PerlB) && in Parse()
2380 if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { in Parse()
2395 // (This library treats "(?-m)$" as \z, even though in Parse()
2423 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); in Parse()
2424 re->ccb_ = new CharClassBuilder; in Parse()
2425 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { in Parse()
2431 re->Decref(); in Parse()
2434 re->Decref(); in Parse()
2441 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); in Parse()
2442 re->ccb_ = new CharClassBuilder; in Parse()
2443 AddUGroup(re->ccb_, g, g->sign, ps.flags()); in Parse()