parse.cc - OpenGrok cross reference for /external/regex-re2/re2/parse.cc

Lines Matching +full:escape +full:- +full:string +full:- +full:regexp
2 // Use of this source code is governed by a BSD-style
7 // The parser is a simple precedence-based parser with a
9 // of the ParseState class.  The Regexp::Parse function is
15 // classes.  It also allows the empty string as a regular expression
16 // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17 // See regexp.h for rationale.
22 #include <string.h>
25 #include <string>
33 #include "re2/regexp.h"
37 #include "re2/walker-inl.h"
56 // Regexp pointers called the stack.  Left parenthesis and vertical
58 // non-standard opcodes.
67 // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
71 class Regexp::ParseState {
87   bool PushRegexp(Regexp* re);
92   // Pushes a regexp with the given op (and no args) onto the stack.
107   // Pushes a repeat operator regexp onto the stack.
112   // Pushes a repetition regexp onto the stack.
116   // Checks whether a particular regexp op is a marker.
130   // Processes the end of input, returning the final regexp.
131   Regexp* DoFinish();
133   // Finishes the regexp if necessary, preparing it for use
136   Regexp* FinishRegexp(Regexp*);
140   // ParseCharClass also manipulates the internals of Regexp
145   bool ParseCharClass(StringPiece* s, Regexp** out_re,
160   // Parse a Perl flag set or non-capturing group from s.
165   // collapsing it into a single regexp on the stack.
169   // collapsing it to a single regexp on the stack.
182   Regexp* stacktop_;
190 // Pseudo-operators - only on parse stack.
194 Regexp::ParseState::ParseState(ParseFlags flags,  in ParseState()
206 Regexp::ParseState::~ParseState() {  in ~ParseState()
207   Regexp* next;  in ~ParseState()
208   for (Regexp* re = stacktop_; re != NULL; re = next) {  in ~ParseState()
209     next = re->down_;  in ~ParseState()
210     re->down_ = NULL;  in ~ParseState()
211     if (re->op() == kLeftParen)  in ~ParseState()
212       delete re->name_;  in ~ParseState()
213     re->Decref();  in ~ParseState()
217 // Finishes the regexp if necessary, preparing it for use in
220 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {  in FinishRegexp()
223   re->down_ = NULL;  in FinishRegexp()
225   if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {  in FinishRegexp()
226     CharClassBuilder* ccb = re->ccb_;  in FinishRegexp()
227     re->ccb_ = NULL;  in FinishRegexp()
228     re->cc_ = ccb->GetCharClass();  in FinishRegexp()
237 bool Regexp::ParseState::PushRegexp(Regexp* re) {  in PushRegexp()
238   MaybeConcatString(-1, NoParseFlags);  in PushRegexp()
245   if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {  in PushRegexp()
246     re->ccb_->RemoveAbove(rune_max_);  in PushRegexp()
247     if (re->ccb_->size() == 1) {  in PushRegexp()
248       Rune r = re->ccb_->begin()->lo;  in PushRegexp()
249       re->Decref();  in PushRegexp()
250       re = new Regexp(kRegexpLiteral, flags_);  in PushRegexp()
251       re->rune_ = r;  in PushRegexp()
252     } else if (re->ccb_->size() == 2) {  in PushRegexp()
253       Rune r = re->ccb_->begin()->lo;  in PushRegexp()
254       if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {  in PushRegexp()
255         re->Decref();  in PushRegexp()
256         re = new Regexp(kRegexpLiteral, flags_ | FoldCase);  in PushRegexp()
257         re->rune_ = r + 'a' - 'A';  in PushRegexp()
262   if (!IsMarker(re->op()))  in PushRegexp()
263     re->simple_ = re->ComputeSimple();  in PushRegexp()
264   re->down_ = stacktop_;  in PushRegexp()
270 // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
284       n -= m+1;  in LookupCaseFold()
301   switch (f->delta) {  in ApplyFold()
303       return r + f->delta;  in ApplyFold()
305     case EvenOddSkip:  // even <-> odd but only applies to every other  in ApplyFold()
306       if ((r - f->lo) % 2)  in ApplyFold()
309     case EvenOdd:  // even <-> odd  in ApplyFold()
312       return r - 1;  in ApplyFold()
314     case OddEvenSkip:  // odd <-> even but only applies to every other  in ApplyFold()
315       if ((r - f->lo) % 2)  in ApplyFold()
318     case OddEven:  // odd <-> even  in ApplyFold()
321       return r - 1;  in ApplyFold()
337   if (f == NULL || r < f->lo)  in CycleFoldRune()
342 // Add lo-hi to the class, along with their fold-equivalent characters.
343 // If lo-hi is already in the class, assume that the fold-equivalent
349   // the cycles are not too long, and we double-check here using depth.  in AddFoldedRange()
355   if (!cc->AddRange(lo, hi))  // lo-hi was already there? we're done  in AddFoldedRange()
362     if (lo < f->lo) {  // lo has no fold; next rune with a fold is f->lo  in AddFoldedRange()
363       lo = f->lo;  in AddFoldedRange()
367     // Add in the result of folding the range lo - f->hi  in AddFoldedRange()
370     Rune hi1 = std::min<Rune>(hi, f->hi);  in AddFoldedRange()
371     switch (f->delta) {  in AddFoldedRange()
373         lo1 += f->delta;  in AddFoldedRange()
374         hi1 += f->delta;  in AddFoldedRange()
378           lo1--;  in AddFoldedRange()
384           lo1--;  in AddFoldedRange()
392     lo = f->hi + 1;  in AddFoldedRange()
397 bool Regexp::ParseState::PushLiteral(Rune r) {  in PushLiteral()
400     Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);  in PushLiteral()
401     re->ccb_ = new CharClassBuilder;  in PushLiteral()
405         re->ccb_->AddRange(r, r);  in PushLiteral()
414     return PushRegexp(new Regexp(kRegexpNoMatch, flags_));  in PushLiteral()
420   Regexp* re = new Regexp(kRegexpLiteral, flags_);  in PushLiteral()
421   re->rune_ = r;  in PushLiteral()
426 bool Regexp::ParseState::PushCarat() {  in PushCarat()
434 bool Regexp::ParseState::PushWordBoundary(bool word) {  in PushWordBoundary()
441 bool Regexp::ParseState::PushDollar() {  in PushDollar()
445     Regexp::ParseFlags oflags = flags_;  in PushDollar()
455 bool Regexp::ParseState::PushDot() {  in PushDot()
459   Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);  in PushDot()
460   re->ccb_ = new CharClassBuilder;  in PushDot()
461   re->ccb_->AddRange(0, '\n' - 1);  in PushDot()
462   re->ccb_->AddRange('\n' + 1, rune_max_);  in PushDot()
466 // Pushes a regexp with the given op (and no args) onto the stack.
467 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {  in PushSimpleOp()
468   Regexp* re = new Regexp(op, flags_);  in PushSimpleOp()
472 // Pushes a repeat operator regexp onto the stack.
475 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,  in PushRepeatOp()
477   if (stacktop_ == NULL || IsMarker(stacktop_->op())) {  in PushRepeatOp()
478     status_->set_code(kRegexpRepeatArgument);  in PushRepeatOp()
479     status_->set_error_arg(s);  in PushRepeatOp()
482   Regexp::ParseFlags fl = flags_;  in PushRepeatOp()
486   // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but  in PushRepeatOp()
488   if (op == stacktop_->op() && fl == stacktop_->parse_flags())  in PushRepeatOp()
492   // op is a repeat, we just have to check that stacktop_->op() is too,  in PushRepeatOp()
494   if ((stacktop_->op() == kRegexpStar ||  in PushRepeatOp()
495        stacktop_->op() == kRegexpPlus ||  in PushRepeatOp()
496        stacktop_->op() == kRegexpQuest) &&  in PushRepeatOp()
497       fl == stacktop_->parse_flags()) {  in PushRepeatOp()
498     stacktop_->op_ = kRegexpStar;  in PushRepeatOp()
502   Regexp* re = new Regexp(op, fl);  in PushRepeatOp()
503   re->AllocSub(1);  in PushRepeatOp()
504   re->down_ = stacktop_->down_;  in PushRepeatOp()
505   re->sub()[0] = FinishRegexp(stacktop_);  in PushRepeatOp()
506   re->simple_ = re->ComputeSimple();  in PushRepeatOp()
511 // RepetitionWalker reports whether the repetition regexp is valid.
512 // Valid means that the combination of the top-level repetition
515 // This rewalks the regexp tree and is called for every repetition,
520 class RepetitionWalker : public Regexp::Walker<int> {
523   virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
524   virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
526   virtual int ShortVisit(Regexp* re, int parent_arg);
533 int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {  in PreVisit()
535   if (re->op() == kRegexpRepeat) {  in PreVisit()
536     int m = re->max();  in PreVisit()
538       m = re->min();  in PreVisit()
547 int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,  in PostVisit()
558 int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {  in ShortVisit()
565 // Pushes a repetition regexp onto the stack.
567 bool Regexp::ParseState::PushRepetition(int min, int max,  in PushRepetition()
570   if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) {  in PushRepetition()
571     status_->set_code(kRegexpRepeatSize);  in PushRepetition()
572     status_->set_error_arg(s);  in PushRepetition()
575   if (stacktop_ == NULL || IsMarker(stacktop_->op())) {  in PushRepetition()
576     status_->set_code(kRegexpRepeatArgument);  in PushRepetition()
577     status_->set_error_arg(s);  in PushRepetition()
580   Regexp::ParseFlags fl = flags_;  in PushRepetition()
583   Regexp* re = new Regexp(kRegexpRepeat, fl);  in PushRepetition()
584   re->min_ = min;  in PushRepetition()
585   re->max_ = max;  in PushRepetition()
586   re->AllocSub(1);  in PushRepetition()
587   re->down_ = stacktop_->down_;  in PushRepetition()
588   re->sub()[0] = FinishRegexp(stacktop_);  in PushRepetition()
589   re->simple_ = re->ComputeSimple();  in PushRepetition()
594       status_->set_code(kRegexpRepeatSize);  in PushRepetition()
595       status_->set_error_arg(s);  in PushRepetition()
602 // Checks whether a particular regexp op is a marker.
603 bool Regexp::ParseState::IsMarker(RegexpOp op) {  in IsMarker()
609 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {  in DoLeftParen()
610   Regexp* re = new Regexp(kLeftParen, flags_);  in DoLeftParen()
611   re->cap_ = ++ncap_;  in DoLeftParen()
613     re->name_ = new string(name);  in DoLeftParen()
617 // Pushes a non-capturing marker onto the stack.
618 bool Regexp::ParseState::DoLeftParenNoCapture() {  in DoLeftParenNoCapture()
619   Regexp* re = new Regexp(kLeftParen, flags_);  in DoLeftParenNoCapture()
620   re->cap_ = -1;  in DoLeftParenNoCapture()
625 bool Regexp::ParseState::DoVerticalBar() {  in DoVerticalBar()
626   MaybeConcatString(-1, NoParseFlags);  in DoVerticalBar()
634   Regexp* r1;  in DoVerticalBar()
635   Regexp* r2;  in DoVerticalBar()
637       (r2 = r1->down_) != NULL &&  in DoVerticalBar()
638       r2->op() == kVerticalBar) {  in DoVerticalBar()
639     Regexp* r3;  in DoVerticalBar()
640     if ((r3 = r2->down_) != NULL &&  in DoVerticalBar()
641         (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {  in DoVerticalBar()
644       if (r3->op() == kRegexpAnyChar &&  in DoVerticalBar()
645           (r1->op() == kRegexpLiteral ||  in DoVerticalBar()
646            r1->op() == kRegexpCharClass ||  in DoVerticalBar()
647            r1->op() == kRegexpAnyChar)) {  in DoVerticalBar()
650         r1->Decref();  in DoVerticalBar()
653       if (r1->op() == kRegexpAnyChar &&  in DoVerticalBar()
654           (r3->op() == kRegexpLiteral ||  in DoVerticalBar()
655            r3->op() == kRegexpCharClass ||  in DoVerticalBar()
656            r3->op() == kRegexpAnyChar)) {  in DoVerticalBar()
658         r1->down_ = r3->down_;  in DoVerticalBar()
659         r2->down_ = r1;  in DoVerticalBar()
661         r3->Decref();  in DoVerticalBar()
666     r1->down_ = r2->down_;  in DoVerticalBar()
667     r2->down_ = r1;  in DoVerticalBar()
675 bool Regexp::ParseState::DoRightParen() {  in DoRightParen()
679   // The stack should be: LeftParen regexp  in DoRightParen()
680   // Remove the LeftParen, leaving the regexp,  in DoRightParen()
682   Regexp* r1;  in DoRightParen()
683   Regexp* r2;  in DoRightParen()
685       (r2 = r1->down_) == NULL ||  in DoRightParen()
686       r2->op() != kLeftParen) {  in DoRightParen()
687     status_->set_code(kRegexpMissingParen);  in DoRightParen()
688     status_->set_error_arg(whole_regexp_);  in DoRightParen()
693   stacktop_ = r2->down_;  in DoRightParen()
696   Regexp* re = r2;  in DoRightParen()
697   flags_ = re->parse_flags();  in DoRightParen()
700   if (re->cap_ > 0) {  in DoRightParen()
701     re->op_ = kRegexpCapture;  in DoRightParen()
702     // re->cap_ is already set  in DoRightParen()
703     re->AllocSub(1);  in DoRightParen()
704     re->sub()[0] = FinishRegexp(r1);  in DoRightParen()
705     re->simple_ = re->ComputeSimple();  in DoRightParen()
707     re->Decref();  in DoRightParen()
713 // Processes the end of input, returning the final regexp.
714 Regexp* Regexp::ParseState::DoFinish() {  in DoFinish()
716   Regexp* re = stacktop_;  in DoFinish()
717   if (re != NULL && re->down_ != NULL) {  in DoFinish()
718     status_->set_code(kRegexpMissingParen);  in DoFinish()
719     status_->set_error_arg(whole_regexp_);  in DoFinish()
726 // Returns the leading regexp that re starts with.
727 // The returned Regexp* points into a piece of re,
728 // so it must not be used after the caller calls re->Decref().
729 Regexp* Regexp::LeadingRegexp(Regexp* re) {  in LeadingRegexp()
730   if (re->op() == kRegexpEmptyMatch)  in LeadingRegexp()
732   if (re->op() == kRegexpConcat && re->nsub() >= 2) {  in LeadingRegexp()
733     Regexp** sub = re->sub();  in LeadingRegexp()
734     if (sub[0]->op() == kRegexpEmptyMatch)  in LeadingRegexp()
745 Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {  in RemoveLeadingRegexp()
746   if (re->op() == kRegexpEmptyMatch)  in RemoveLeadingRegexp()
748   if (re->op() == kRegexpConcat && re->nsub() >= 2) {  in RemoveLeadingRegexp()
749     Regexp** sub = re->sub();  in RemoveLeadingRegexp()
750     if (sub[0]->op() == kRegexpEmptyMatch)  in RemoveLeadingRegexp()
752     sub[0]->Decref();  in RemoveLeadingRegexp()
754     if (re->nsub() == 2) {  in RemoveLeadingRegexp()
755       // Collapse concatenation to single regexp.  in RemoveLeadingRegexp()
756       Regexp* nre = sub[1];  in RemoveLeadingRegexp()
758       re->Decref();  in RemoveLeadingRegexp()
761     // 3 or more -> 2 or more.  in RemoveLeadingRegexp()
762     re->nsub_--;  in RemoveLeadingRegexp()
763     memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);  in RemoveLeadingRegexp()
766   Regexp::ParseFlags pf = re->parse_flags();  in RemoveLeadingRegexp()
767   re->Decref();  in RemoveLeadingRegexp()
768   return new Regexp(kRegexpEmptyMatch, pf);  in RemoveLeadingRegexp()
771 // Returns the leading string that re starts with.
773 // so it must not be used after the caller calls re->Decref().
774 Rune* Regexp::LeadingString(Regexp* re, int *nrune,  in LeadingString()
775                             Regexp::ParseFlags *flags) {  in LeadingString()
776   while (re->op() == kRegexpConcat && re->nsub() > 0)  in LeadingString()
777     re = re->sub()[0];  in LeadingString()
779   *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);  in LeadingString()
781   if (re->op() == kRegexpLiteral) {  in LeadingString()
783     return &re->rune_;  in LeadingString()
786   if (re->op() == kRegexpLiteralString) {  in LeadingString()
787     *nrune = re->nrunes_;  in LeadingString()
788     return re->runes_;  in LeadingString()
797 void Regexp::RemoveLeadingString(Regexp* re, int n) {  in RemoveLeadingString()
798   // Chase down concats to find first string.  in RemoveLeadingString()
800   // flattened except when doing so would overflow the 16-bit  in RemoveLeadingString()
803   Regexp* stk[4];  in RemoveLeadingString()
805   while (re->op() == kRegexpConcat) {  in RemoveLeadingString()
808     re = re->sub()[0];  in RemoveLeadingString()
811   // Remove leading string from re.  in RemoveLeadingString()
812   if (re->op() == kRegexpLiteral) {  in RemoveLeadingString()
813     re->rune_ = 0;  in RemoveLeadingString()
814     re->op_ = kRegexpEmptyMatch;  in RemoveLeadingString()
815   } else if (re->op() == kRegexpLiteralString) {  in RemoveLeadingString()
816     if (n >= re->nrunes_) {  in RemoveLeadingString()
817       delete[] re->runes_;  in RemoveLeadingString()
818       re->runes_ = NULL;  in RemoveLeadingString()
819       re->nrunes_ = 0;  in RemoveLeadingString()
820       re->op_ = kRegexpEmptyMatch;  in RemoveLeadingString()
821     } else if (n == re->nrunes_ - 1) {  in RemoveLeadingString()
822       Rune rune = re->runes_[re->nrunes_ - 1];  in RemoveLeadingString()
823       delete[] re->runes_;  in RemoveLeadingString()
824       re->runes_ = NULL;  in RemoveLeadingString()
825       re->nrunes_ = 0;  in RemoveLeadingString()
826       re->rune_ = rune;  in RemoveLeadingString()
827       re->op_ = kRegexpLiteral;  in RemoveLeadingString()
829       re->nrunes_ -= n;  in RemoveLeadingString()
830       memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);  in RemoveLeadingString()
835   while (d-- > 0) {  in RemoveLeadingString()
837     Regexp** sub = re->sub();  in RemoveLeadingString()
838     if (sub[0]->op() == kRegexpEmptyMatch) {  in RemoveLeadingString()
839       sub[0]->Decref();  in RemoveLeadingString()
842       switch (re->nsub()) {  in RemoveLeadingString()
846           LOG(DFATAL) << "Concat of " << re->nsub();  in RemoveLeadingString()
847           re->submany_ = NULL;  in RemoveLeadingString()
848           re->op_ = kRegexpEmptyMatch;  in RemoveLeadingString()
853           Regexp* old = sub[1];  in RemoveLeadingString()
855           re->Swap(old);  in RemoveLeadingString()
856           old->Decref();  in RemoveLeadingString()
862           re->nsub_--;  in RemoveLeadingString()
863           memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);  in RemoveLeadingString()
877   Splice(Regexp* prefix, Regexp** sub, int nsub)  in Splice()
881         nsuffix(-1) {}  in Splice()
883   Regexp* prefix;
884   Regexp** sub;
894   Frame(Regexp** sub, int nsub)  in Frame()
899   Regexp** sub;
906 // Bundled into a class for friend access to Regexp without needing to declare
907 // (or define) Splice in regexp.h.
910   static void Round1(Regexp** sub, int nsub,
911                      Regexp::ParseFlags flags,
913   static void Round2(Regexp** sub, int nsub,
914                      Regexp::ParseFlags flags,
916   static void Round3(Regexp** sub, int nsub,
917                      Regexp::ParseFlags flags,
932 int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {  in FactorAlternation()
957         while (sub + i < iter->sub)  in FactorAlternation()
963             Regexp* re[2];  in FactorAlternation()
964             re[0] = iter->prefix;  in FactorAlternation()
965             re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags);  in FactorAlternation()
966             sub[out++] = Regexp::Concat(re, 2, flags);  in FactorAlternation()
967             i += iter->nsub;  in FactorAlternation()
972             sub[out++] = iter->prefix;  in FactorAlternation()
973             i += iter->nsub;  in FactorAlternation()
1028 void FactorAlternationImpl::Round1(Regexp** sub, int nsub,  in Round1()
1029                                    Regexp::ParseFlags flags,  in Round1()
1035   Regexp::ParseFlags runeflags = Regexp::NoParseFlags;  in Round1()
1041     Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;  in Round1()
1043       rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);  in Round1()
1056     // Found end of a run with common leading literal string:  in Round1()
1060       // Nothing to do - first iteration.  in Round1()
1064       Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);  in Round1()
1066         Regexp::RemoveLeadingString(sub[j], nrune);  in Round1()
1067       splices->emplace_back(prefix, sub + start, i - start);  in Round1()
1080 void FactorAlternationImpl::Round2(Regexp** sub, int nsub,  in Round2()
1081                                    Regexp::ParseFlags flags,  in Round2()
1092   Regexp* first = NULL;  in Round2()
1096     Regexp* first_i = NULL;  in Round2()
1098       first_i = Regexp::LeadingRegexp(sub[i]);  in Round2()
1100           // first must be an empty-width op  in Round2()
1103           (first->op() == kRegexpBeginLine ||  in Round2()
1104            first->op() == kRegexpEndLine ||  in Round2()
1105            first->op() == kRegexpWordBoundary ||  in Round2()
1106            first->op() == kRegexpNoWordBoundary ||  in Round2()
1107            first->op() == kRegexpBeginText ||  in Round2()
1108            first->op() == kRegexpEndText ||  in Round2()
1109            first->op() == kRegexpCharClass ||  in Round2()
1110            first->op() == kRegexpAnyChar ||  in Round2()
1111            first->op() == kRegexpAnyByte ||  in Round2()
1112            (first->op() == kRegexpRepeat &&  in Round2()
1113             first->min() == first->max() &&  in Round2()
1114             (first->sub()[0]->op() == kRegexpLiteral ||  in Round2()
1115              first->sub()[0]->op() == kRegexpCharClass ||  in Round2()
1116              first->sub()[0]->op() == kRegexpAnyChar ||  in Round2()
1117              first->sub()[0]->op() == kRegexpAnyByte))) &&  in Round2()
1118           Regexp::Equal(first, first_i))  in Round2()
1122     // Found end of a run with common leading regexp:  in Round2()
1126       // Nothing to do - first iteration.  in Round2()
1130       Regexp* prefix = first->Incref();  in Round2()
1132         sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);  in Round2()
1133       splices->emplace_back(prefix, sub + start, i - start);  in Round2()
1144 void FactorAlternationImpl::Round3(Regexp** sub, int nsub,  in Round3()
1145                                    Regexp::ParseFlags flags,  in Round3()
1149   Regexp* first = NULL;  in Round3()
1153     Regexp* first_i = NULL;  in Round3()
1157           (first->op() == kRegexpLiteral ||  in Round3()
1158            first->op() == kRegexpCharClass) &&  in Round3()
1159           (first_i->op() == kRegexpLiteral ||  in Round3()
1160            first_i->op() == kRegexpCharClass))  in Round3()
1168       // Nothing to do - first iteration.  in Round3()
1174         Regexp* re = sub[j];  in Round3()
1175         if (re->op() == kRegexpCharClass) {  in Round3()
1176           CharClass* cc = re->cc();  in Round3()
1177           for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)  in Round3()
1178             ccb.AddRange(it->lo, it->hi);  in Round3()
1179         } else if (re->op() == kRegexpLiteral) {  in Round3()
1180           ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());  in Round3()
1182           LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "  in Round3()
1183                       << re->ToString();  in Round3()
1185         re->Decref();  in Round3()
1187       Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);  in Round3()
1188       splices->emplace_back(re, sub + start, i - start);  in Round3()
1202 void Regexp::ParseState::DoCollapse(RegexpOp op) {  in DoCollapse()
1205   Regexp* next = NULL;  in DoCollapse()
1206   Regexp* sub;  in DoCollapse()
1207   for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {  in DoCollapse()
1208     next = sub->down_;  in DoCollapse()
1209     if (sub->op_ == op)  in DoCollapse()
1210       n += sub->nsub_;  in DoCollapse()
1217   if (stacktop_ != NULL && stacktop_->down_ == next)  in DoCollapse()
1221   PODArray<Regexp*> subs(n);  in DoCollapse()
1224   for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {  in DoCollapse()
1225     next = sub->down_;  in DoCollapse()
1226     if (sub->op_ == op) {  in DoCollapse()
1227       Regexp** sub_subs = sub->sub();  in DoCollapse()
1228       for (int k = sub->nsub_ - 1; k >= 0; k--)  in DoCollapse()
1229         subs[--i] = sub_subs[k]->Incref();  in DoCollapse()
1230       sub->Decref();  in DoCollapse()
1232       subs[--i] = FinishRegexp(sub);  in DoCollapse()
1236   Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);  in DoCollapse()
1237   re->simple_ = re->ComputeSimple();  in DoCollapse()
1238   re->down_ = next;  in DoCollapse()
1243 // collapsing it into a single regexp on the stack.
1244 void Regexp::ParseState::DoConcatenation() {  in DoConcatenation()
1245   Regexp* r1 = stacktop_;  in DoConcatenation()
1246   if (r1 == NULL || IsMarker(r1->op())) {  in DoConcatenation()
1248     Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);  in DoConcatenation()
1255 // collapsing it to a single regexp on the stack.
1256 void Regexp::ParseState::DoAlternation() {  in DoAlternation()
1259   Regexp* r1 = stacktop_;  in DoAlternation()
1260   stacktop_ = r1->down_;  in DoAlternation()
1261   r1->Decref();  in DoAlternation()
1266 // If top two elements on stack are both literal or string,
1267 // collapse into single string.
1268 // Don't walk down the stack -- the parser calls this frequently
1270 // Only called when another regexp is about to be pushed
1275 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {  in MaybeConcatString()
1276   Regexp* re1;  in MaybeConcatString()
1277   Regexp* re2;  in MaybeConcatString()
1278   if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)  in MaybeConcatString()
1281   if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)  in MaybeConcatString()
1283   if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)  in MaybeConcatString()
1285   if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))  in MaybeConcatString()
1288   if (re2->op_ == kRegexpLiteral) {  in MaybeConcatString()
1289     // convert into string  in MaybeConcatString()
1290     Rune rune = re2->rune_;  in MaybeConcatString()
1291     re2->op_ = kRegexpLiteralString;  in MaybeConcatString()
1292     re2->nrunes_ = 0;  in MaybeConcatString()
1293     re2->runes_ = NULL;  in MaybeConcatString()
1294     re2->AddRuneToString(rune);  in MaybeConcatString()
1298   if (re1->op_ == kRegexpLiteral) {  in MaybeConcatString()
1299     re2->AddRuneToString(re1->rune_);  in MaybeConcatString()
1301     for (int i = 0; i < re1->nrunes_; i++)  in MaybeConcatString()
1302       re2->AddRuneToString(re1->runes_[i]);  in MaybeConcatString()
1303     re1->nrunes_ = 0;  in MaybeConcatString()
1304     delete[] re1->runes_;  in MaybeConcatString()
1305     re1->runes_ = NULL;  in MaybeConcatString()
1310     re1->op_ = kRegexpLiteral;  in MaybeConcatString()
1311     re1->rune_ = r;  in MaybeConcatString()
1312     re1->parse_flags_ = static_cast<uint16_t>(flags);  in MaybeConcatString()
1317   re1->Decref();  in MaybeConcatString()
1324 // Sets *s to span the remainder of the string.
1326   if (s->size() == 0 || !isdigit((*s)[0] & 0xFF))  in ParseInteger()
1329   if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))  in ParseInteger()
1333   while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) {  in ParseInteger()
1337     n = n*10 + c - '0';  in ParseInteger()
1338     s->remove_prefix(1);  // digit  in ParseInteger()
1345 // Sets *s to span the remainder of the string on success.
1348 // sets *hi to -1 to signify this.
1350 // The Maybe in the name signifies that the regexp parse
1368       *hi = -1;  in MaybeParseRepetition()
1393   if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {  in StringPieceToRune()
1394     int n = chartorune(r, sp->data());  in StringPieceToRune()
1404       sp->remove_prefix(n);  in StringPieceToRune()
1409   status->set_code(kRegexpBadUTF8);  in StringPieceToRune()
1410   status->set_error_arg(StringPiece());  in StringPieceToRune()
1411   return -1;  in StringPieceToRune()
1414 // Return whether name is valid UTF-8.
1436     return c - '0';  in UnHex()
1438     return c - 'A' + 10;  in UnHex()
1440     return c - 'a' + 10;  in UnHex()
1445 // Parse an escape sequence (e.g., \n, \{).
1446 // Sets *s to span the remainder of the string.
1450   const char* begin = s->begin();  in ParseEscape()
1451   if (s->size() < 1 || (*s)[0] != '\\') {  in ParseEscape()
1452     // Should not happen - caller always checks.  in ParseEscape()
1453     status->set_code(kRegexpInternalError);  in ParseEscape()
1454     status->set_error_arg(StringPiece());  in ParseEscape()
1457   if (s->size() < 2) {  in ParseEscape()
1458     status->set_code(kRegexpTrailingBackslash);  in ParseEscape()
1459     status->set_error_arg(StringPiece());  in ParseEscape()
1463   s->remove_prefix(1);  // backslash  in ParseEscape()
1470         // Escaped non-word characters are always themselves.  in ParseEscape()
1487       // Single non-zero octal digit is a backreference; not supported.  in ParseEscape()
1488       if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7')  in ParseEscape()
1493       code = c - '0';  in ParseEscape()
1494       if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {  in ParseEscape()
1495         code = code * 8 + c - '0';  in ParseEscape()
1496         s->remove_prefix(1);  // digit  in ParseEscape()
1497         if (s->size() > 0) {  in ParseEscape()
1500             code = code * 8 + c - '0';  in ParseEscape()
1501             s->remove_prefix(1);  // digit  in ParseEscape()
1512       if (s->size() == 0)  in ParseEscape()
1518         // Update n as we consume the string, so that  in ParseEscape()
1521         // after the first non-hex digit.  We require only hex digits,  in ParseEscape()
1532           if (s->size() == 0)  in ParseEscape()
1543       if (s->size() == 0)  in ParseEscape()
1575     // the Perl word-boundary \b as a backspace  in ParseEscape()
1576     // when in POSIX regexp mode.  Surprisingly,  in ParseEscape()
1577     // in Perl, \b means word-boundary but [\b]  in ParseEscape()
1590   // Unrecognized escape sequence.  in ParseEscape()
1591   status->set_code(kRegexpBadEscape);  in ParseEscape()
1592   status->set_error_arg(  in ParseEscape()
1593       StringPiece(begin, static_cast<size_t>(s->begin() - begin)));  in ParseEscape()
1600     Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {  in AddRangeFlags()
1603   bool cutnl = !(parse_flags & Regexp::ClassNL) ||  in AddRangeFlags()
1604                (parse_flags & Regexp::NeverNL);  in AddRangeFlags()
1607       AddRangeFlags(lo, '\n' - 1, parse_flags);  in AddRangeFlags()
1613   // If folding case, add fold-equivalent characters too.  in AddRangeFlags()
1614   if (parse_flags & Regexp::FoldCase)  in AddRangeFlags()
1656                       Regexp::ParseFlags parse_flags) {  in AddUGroup()
1658     for (int i = 0; i < g->nr16; i++) {  in AddUGroup()
1659       cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);  in AddUGroup()
1661     for (int i = 0; i < g->nr32; i++) {  in AddUGroup()
1662       cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);  in AddUGroup()
1665     if (parse_flags & Regexp::FoldCase) {  in AddUGroup()
1666       // Normally adding a case-folded group means  in AddUGroup()
1667       // adding all the extra fold-equivalent runes too.  in AddUGroup()
1669       // we have to exclude all the runes that are fold-equivalent  in AddUGroup()
1675       bool cutnl = !(parse_flags & Regexp::ClassNL) ||  in AddUGroup()
1676                    (parse_flags & Regexp::NeverNL);  in AddUGroup()
1681       cc->AddCharClass(&ccb1);  in AddUGroup()
1685     for (int i = 0; i < g->nr16; i++) {  in AddUGroup()
1686       if (next < g->r16[i].lo)  in AddUGroup()
1687         cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);  in AddUGroup()
1688       next = g->r16[i].hi + 1;  in AddUGroup()
1690     for (int i = 0; i < g->nr32; i++) {  in AddUGroup()
1691       if (next < g->r32[i].lo)  in AddUGroup()
1692         cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);  in AddUGroup()
1693       next = g->r32[i].hi + 1;  in AddUGroup()
1696       cc->AddRangeFlags(next, Runemax, parse_flags);  in AddUGroup()
1700 // Maybe parse a Perl character class escape sequence.
1702 // not the Perl empty-string classes (\b \B \A \Z \z).
1703 // On success, sets *s to span the remainder of the string
1706 const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {  in MaybeParsePerlCCEscape()
1707   if (!(parse_flags & Regexp::PerlClasses))  in MaybeParsePerlCCEscape()
1709   if (s->size() < 2 || (*s)[0] != '\\')  in MaybeParsePerlCCEscape()
1712   // any non-ASCII Perl group names.  in MaybeParsePerlCCEscape()
1713   StringPiece name(s->begin(), 2);  in MaybeParsePerlCCEscape()
1717   s->remove_prefix(name.size());  in MaybeParsePerlCCEscape()
1729 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,  in ParseUnicodeGroup()
1733   if (!(parse_flags & Regexp::UnicodeGroups))  in ParseUnicodeGroup()
1735   if (s->size() < 2 || (*s)[0] != '\\')  in ParseUnicodeGroup()
1742   int sign = +1;  // -1 = negated char class  in ParseUnicodeGroup()
1744     sign = -sign;  in ParseUnicodeGroup()
1747   s->remove_prefix(2);  // '\\', 'p'  in ParseUnicodeGroup()
1752     // Name is the bit of string we just skipped over for c.  in ParseUnicodeGroup()
1754     name = StringPiece(p, static_cast<size_t>(s->begin() - p));  in ParseUnicodeGroup()
1757     size_t end = s->find('}', 0);  in ParseUnicodeGroup()
1761       status->set_code(kRegexpBadCharRange);  in ParseUnicodeGroup()
1762       status->set_error_arg(seq);  in ParseUnicodeGroup()
1765     name = StringPiece(s->begin(), end);  // without '}'  in ParseUnicodeGroup()
1766     s->remove_prefix(end + 1);  // with '}'  in ParseUnicodeGroup()
1772   seq = StringPiece(seq.begin(), static_cast<size_t>(s->begin() - seq.begin()));  in ParseUnicodeGroup()
1775     sign = -sign;  in ParseUnicodeGroup()
1783     status->set_code(kRegexpBadCharRange);  in ParseUnicodeGroup()
1784     status->set_error_arg(seq);  in ParseUnicodeGroup()
1793       string("\\p{") + string(name) + string("}"));  in ParseUnicodeGroup()
1797     status->set_code(kRegexpBadCharRange);  in ParseUnicodeGroup()
1798     status->set_error_arg(seq);  in ParseUnicodeGroup()
1818 // Sets *s to span the remainder of the string.
1820 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,  in ParseCCName()
1824   const char* p = s->data();  in ParseCCName()
1825   const char* ep = s->data() + s->size();  in ParseCCName()
1826   if (ep - p < 2 || p[0] != '[' || p[1] != ':')  in ParseCCName()
1831   for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)  in ParseCCName()
1835   if (q > ep-2)  in ParseCCName()
1840   StringPiece name(p, static_cast<size_t>(q - p));  in ParseCCName()
1844     status->set_code(kRegexpBadCharRange);  in ParseCCName()
1845     status->set_error_arg(name);  in ParseCCName()
1849   s->remove_prefix(name.size());  in ParseCCName()
1850   AddUGroup(cc, g, g->sign, parse_flags);  in ParseCCName()
1855 // There are fewer special characters here than in the rest of the regexp.
1856 // Sets *s to span the remainder of the string.
1858 bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,  in ParseCCCharacter()
1861   if (s->size() == 0) {  in ParseCCCharacter()
1862     status->set_code(kRegexpMissingBracket);  in ParseCCCharacter()
1863     status->set_error_arg(whole_class);  in ParseCCCharacter()
1867   // Allow regular escape sequences even though  in ParseCCCharacter()
1869   if (s->size() >= 1 && (*s)[0] == '\\')  in ParseCCCharacter()
1878 // For single characters, rr->lo == rr->hi.
1879 // Sets *s to span the remainder of the string.
1881 bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,  in ParseCCRange()
1885   if (!ParseCCCharacter(s, &rr->lo, whole_class, status))  in ParseCCRange()
1887   // [a-] means (a|-), so check for final ].  in ParseCCRange()
1888   if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {  in ParseCCRange()
1889     s->remove_prefix(1);  // '-'  in ParseCCRange()
1890     if (!ParseCCCharacter(s, &rr->hi, whole_class, status))  in ParseCCRange()
1892     if (rr->hi < rr->lo) {  in ParseCCRange()
1893       status->set_code(kRegexpBadCharRange);  in ParseCCRange()
1894       status->set_error_arg(  in ParseCCRange()
1895           StringPiece(os.data(), static_cast<size_t>(s->data() - os.data())));  in ParseCCRange()
1899     rr->hi = rr->lo;  in ParseCCRange()
1904 // Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1905 // Sets *s to span the remainder of the string.
1906 // Sets *out_re to the regexp for the class.
1907 bool Regexp::ParseState::ParseCharClass(StringPiece* s,  in ParseCharClass()
1908                                         Regexp** out_re,  in ParseCharClass()
1911   if (s->size() == 0 || (*s)[0] != '[') {  in ParseCharClass()
1913     status->set_code(kRegexpInternalError);  in ParseCharClass()
1914     status->set_error_arg(StringPiece());  in ParseCharClass()
1918   Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);  in ParseCharClass()
1919   re->ccb_ = new CharClassBuilder;  in ParseCharClass()
1920   s->remove_prefix(1);  // '['  in ParseCharClass()
1921   if (s->size() > 0 && (*s)[0] == '^') {  in ParseCharClass()
1922     s->remove_prefix(1);  // '^'  in ParseCharClass()
1927       re->ccb_->AddRange('\n', '\n');  in ParseCharClass()
1931   while (s->size() > 0 && ((*s)[0] != ']' || first)) {  in ParseCharClass()
1932     // - is only okay unescaped as first or last in class.  in ParseCharClass()
1933     // Except that Perl allows - anywhere.  in ParseCharClass()
1934     if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&  in ParseCharClass()
1935         (s->size() == 1 || (*s)[1] != ']')) {  in ParseCharClass()
1937       t.remove_prefix(1);  // '-'  in ParseCharClass()
1941         re->Decref();  in ParseCharClass()
1944       status->set_code(kRegexpBadCharRange);  in ParseCharClass()
1945       status->set_error_arg(StringPiece(s->data(), 1+n));  in ParseCharClass()
1946       re->Decref();  in ParseCharClass()
1952     if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {  in ParseCharClass()
1953       switch (ParseCCName(s, flags_, re->ccb_, status)) {  in ParseCharClass()
1957           re->Decref();  in ParseCharClass()
1965     if (s->size() > 2 &&  in ParseCharClass()
1968       switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {  in ParseCharClass()
1972           re->Decref();  in ParseCharClass()
1982       AddUGroup(re->ccb_, g, g->sign, flags_);  in ParseCharClass()
1989       re->Decref();  in ParseCharClass()
1994     // Regexp::ClassNL is set.  In an explicit range or singleton  in ParseCharClass()
1997     re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);  in ParseCharClass()
1999   if (s->size() == 0) {  in ParseCharClass()
2000     status->set_code(kRegexpMissingBracket);  in ParseCharClass()
2001     status->set_error_arg(whole_class);  in ParseCharClass()
2002     re->Decref();  in ParseCharClass()
2005   s->remove_prefix(1);  // ']'  in ParseCharClass()
2008     re->ccb_->Negate();  in ParseCharClass()
2014 // Is this a valid capture name?  [A-Za-z0-9_]+
2033 // Parses a Perl flag setting or non-capturing group or both,
2037 // well-formed or not supported, sets status_ and returns false.
2038 bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {  in ParsePerlFlags()
2044     status_->set_code(kRegexpInternalError);  in ParsePerlFlags()
2050   // Check for named captures, first introduced in Python's regexp library.  in ParsePerlFlags()
2071       status_->set_code(kRegexpBadNamedCapture);  in ParsePerlFlags()
2072       status_->set_error_arg(*s);  in ParsePerlFlags()
2077     StringPiece capture(t.begin()-2, end+3);  // "(?P<name>"  in ParsePerlFlags()
2078     StringPiece name(t.begin()+2, end-2);     // "name"  in ParsePerlFlags()
2082       status_->set_code(kRegexpBadNamedCapture);  in ParsePerlFlags()
2083       status_->set_error_arg(capture);  in ParsePerlFlags()
2092     s->remove_prefix(static_cast<size_t>(capture.end() - s->begin()));  in ParsePerlFlags()
2143       case '-':  in ParsePerlFlags()
2169   flags_ = static_cast<Regexp::ParseFlags>(nflags);  in ParsePerlFlags()
2174   status_->set_code(kRegexpBadPerlOp);  in ParsePerlFlags()
2175   status_->set_error_arg(  in ParsePerlFlags()
2176       StringPiece(s->begin(), static_cast<size_t>(t.begin() - s->begin())));  in ParsePerlFlags()
2181 // into UTF8 encoding in string.
2183 // deprecated and because it rejects code points 0x80-0x9F.
2184 void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) {  in ConvertLatin1ToUTF8()
2187   utf->clear();  in ConvertLatin1ToUTF8()
2191     utf->append(buf, n);  in ConvertLatin1ToUTF8()
2196 // returning the corresponding Regexp tree.
2199 Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,  in Parse()
2201   // Make status non-NULL (easier on everyone else).  in Parse()
2209   // Convert regexp to UTF-8 (easier on the rest of the parser).  in Parse()
2211     string* tmp = new string;  in Parse()
2213     status->set_tmp(tmp);  in Parse()
2218     // Special parse loop for literal string.  in Parse()
2243         // "(?" introduces Perl escape.  in Parse()
2245           // Flag changes and non-capturing groups.  in Parse()
2291         Regexp* re;  in Parse()
2320             //   a** is a syntax error, not a double-star.  in Parse()
2322             status->set_code(kRegexpRepeatOp);  in Parse()
2323             status->set_error_arg(StringPiece(  in Parse()
2325                 static_cast<size_t>(t.begin() - lastunary.begin())));  in Parse()
2330                             static_cast<size_t>(t.data() - opstr.data()));  in Parse()
2355             status->set_code(kRegexpRepeatOp);  in Parse()
2356             status->set_error_arg(StringPiece(  in Parse()
2358                 static_cast<size_t>(t.begin() - lastunary.begin())));  in Parse()
2363                             static_cast<size_t>(t.data() - opstr.data()));  in Parse()
2372         if ((ps.flags() & Regexp::PerlB) &&  in Parse()
2380         if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {  in Parse()
2395           // (This library treats "(?-m)$" as \z, even though  in Parse()
2423           Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);  in Parse()
2424           re->ccb_ = new CharClassBuilder;  in Parse()
2425           switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {  in Parse()
2431               re->Decref();  in Parse()
2434               re->Decref();  in Parse()
2441           Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);  in Parse()
2442           re->ccb_ = new CharClassBuilder;  in Parse()
2443           AddUGroup(re->ccb_, g, g->sign, ps.flags());  in Parse()