• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression representation.
6 // Tested by parse_test.cc
7 
8 #include "re2/regexp.h"
9 
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <mutex>
16 #include <string>
17 #include <vector>
18 
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/mutex.h"
22 #include "util/utf.h"
23 #include "re2/stringpiece.h"
24 #include "re2/walker-inl.h"
25 
26 namespace re2 {
27 
28 // Constructor.  Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)29 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
30   : op_(static_cast<uint8_t>(op)),
31     simple_(false),
32     parse_flags_(static_cast<uint16_t>(parse_flags)),
33     ref_(1),
34     nsub_(0),
35     down_(NULL) {
36   subone_ = NULL;
37   memset(the_union_, 0, sizeof the_union_);
38 }
39 
40 // Destructor.  Assumes already cleaned up children.
41 // Private: use Decref() instead of delete to destroy Regexps.
42 // Can't call Decref on the sub-Regexps here because
43 // that could cause arbitrarily deep recursion, so
44 // required Decref() to have handled them for us.
~Regexp()45 Regexp::~Regexp() {
46   if (nsub_ > 0)
47     LOG(DFATAL) << "Regexp not destroyed.";
48 
49   switch (op_) {
50     default:
51       break;
52     case kRegexpCapture:
53       delete name_;
54       break;
55     case kRegexpLiteralString:
56       delete[] runes_;
57       break;
58     case kRegexpCharClass:
59       if (cc_)
60         cc_->Delete();
61       delete ccb_;
62       break;
63   }
64 }
65 
66 // If it's possible to destroy this regexp without recurring,
67 // do so and return true.  Else return false.
QuickDestroy()68 bool Regexp::QuickDestroy() {
69   if (nsub_ == 0) {
70     delete this;
71     return true;
72   }
73   return false;
74 }
75 
76 // Lazily allocated.
77 static Mutex* ref_mutex;
78 static std::map<Regexp*, int>* ref_map;
79 
Ref()80 int Regexp::Ref() {
81   if (ref_ < kMaxRef)
82     return ref_;
83 
84   MutexLock l(ref_mutex);
85   return (*ref_map)[this];
86 }
87 
88 // Increments reference count, returns object as convenience.
Incref()89 Regexp* Regexp::Incref() {
90   if (ref_ >= kMaxRef-1) {
91     static std::once_flag ref_once;
92     std::call_once(ref_once, []() {
93       ref_mutex = new Mutex;
94       ref_map = new std::map<Regexp*, int>;
95     });
96 
97     // Store ref count in overflow map.
98     MutexLock l(ref_mutex);
99     if (ref_ == kMaxRef) {
100       // already overflowed
101       (*ref_map)[this]++;
102     } else {
103       // overflowing now
104       (*ref_map)[this] = kMaxRef;
105       ref_ = kMaxRef;
106     }
107     return this;
108   }
109 
110   ref_++;
111   return this;
112 }
113 
114 // Decrements reference count and deletes this object if count reaches 0.
Decref()115 void Regexp::Decref() {
116   if (ref_ == kMaxRef) {
117     // Ref count is stored in overflow map.
118     MutexLock l(ref_mutex);
119     int r = (*ref_map)[this] - 1;
120     if (r < kMaxRef) {
121       ref_ = static_cast<uint16_t>(r);
122       ref_map->erase(this);
123     } else {
124       (*ref_map)[this] = r;
125     }
126     return;
127   }
128   ref_--;
129   if (ref_ == 0)
130     Destroy();
131 }
132 
133 // Deletes this object; ref count has count reached 0.
Destroy()134 void Regexp::Destroy() {
135   if (QuickDestroy())
136     return;
137 
138   // Handle recursive Destroy with explicit stack
139   // to avoid arbitrarily deep recursion on process stack [sigh].
140   down_ = NULL;
141   Regexp* stack = this;
142   while (stack != NULL) {
143     Regexp* re = stack;
144     stack = re->down_;
145     if (re->ref_ != 0)
146       LOG(DFATAL) << "Bad reference count " << re->ref_;
147     if (re->nsub_ > 0) {
148       Regexp** subs = re->sub();
149       for (int i = 0; i < re->nsub_; i++) {
150         Regexp* sub = subs[i];
151         if (sub == NULL)
152           continue;
153         if (sub->ref_ == kMaxRef)
154           sub->Decref();
155         else
156           --sub->ref_;
157         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
158           sub->down_ = stack;
159           stack = sub;
160         }
161       }
162       if (re->nsub_ > 1)
163         delete[] subs;
164       re->nsub_ = 0;
165     }
166     delete re;
167   }
168 }
169 
AddRuneToString(Rune r)170 void Regexp::AddRuneToString(Rune r) {
171   DCHECK(op_ == kRegexpLiteralString);
172   if (nrunes_ == 0) {
173     // start with 8
174     runes_ = new Rune[8];
175   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
176     // double on powers of two
177     Rune *old = runes_;
178     runes_ = new Rune[nrunes_ * 2];
179     for (int i = 0; i < nrunes_; i++)
180       runes_[i] = old[i];
181     delete[] old;
182   }
183 
184   runes_[nrunes_++] = r;
185 }
186 
HaveMatch(int match_id,ParseFlags flags)187 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
189   re->match_id_ = match_id;
190   return re;
191 }
192 
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)193 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194   // Squash **, ++ and ??.
195   if (op == sub->op() && flags == sub->parse_flags())
196     return sub;
197 
198   // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
199   // op is Star/Plus/Quest, we just have to check that sub->op() is too.
200   if ((sub->op() == kRegexpStar ||
201        sub->op() == kRegexpPlus ||
202        sub->op() == kRegexpQuest) &&
203       flags == sub->parse_flags()) {
204     // If sub is Star, no need to rewrite it.
205     if (sub->op() == kRegexpStar)
206       return sub;
207 
208     // Rewrite sub to Star.
209     Regexp* re = new Regexp(kRegexpStar, flags);
210     re->AllocSub(1);
211     re->sub()[0] = sub->sub()[0]->Incref();
212     sub->Decref();  // We didn't consume the reference after all.
213     return re;
214   }
215 
216   Regexp* re = new Regexp(op, flags);
217   re->AllocSub(1);
218   re->sub()[0] = sub;
219   return re;
220 }
221 
Plus(Regexp * sub,ParseFlags flags)222 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223   return StarPlusOrQuest(kRegexpPlus, sub, flags);
224 }
225 
Star(Regexp * sub,ParseFlags flags)226 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
227   return StarPlusOrQuest(kRegexpStar, sub, flags);
228 }
229 
Quest(Regexp * sub,ParseFlags flags)230 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
231   return StarPlusOrQuest(kRegexpQuest, sub, flags);
232 }
233 
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)234 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235                                   ParseFlags flags, bool can_factor) {
236   if (nsub == 1)
237     return sub[0];
238 
239   if (nsub == 0) {
240     if (op == kRegexpAlternate)
241       return new Regexp(kRegexpNoMatch, flags);
242     else
243       return new Regexp(kRegexpEmptyMatch, flags);
244   }
245 
246   Regexp** subcopy = NULL;
247   if (op == kRegexpAlternate && can_factor) {
248     // Going to edit sub; make a copy so we don't step on caller.
249     subcopy = new Regexp*[nsub];
250     memmove(subcopy, sub, nsub * sizeof sub[0]);
251     sub = subcopy;
252     nsub = FactorAlternation(sub, nsub, flags);
253     if (nsub == 1) {
254       Regexp* re = sub[0];
255       delete[] subcopy;
256       return re;
257     }
258   }
259 
260   if (nsub > kMaxNsub) {
261     // Too many subexpressions to fit in a single Regexp.
262     // Make a two-level tree.  Two levels gets us to 65535^2.
263     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264     Regexp* re = new Regexp(op, flags);
265     re->AllocSub(nbigsub);
266     Regexp** subs = re->sub();
267     for (int i = 0; i < nbigsub - 1; i++)
268       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270                                           nsub - (nbigsub-1)*kMaxNsub, flags,
271                                           false);
272     delete[] subcopy;
273     return re;
274   }
275 
276   Regexp* re = new Regexp(op, flags);
277   re->AllocSub(nsub);
278   Regexp** subs = re->sub();
279   for (int i = 0; i < nsub; i++)
280     subs[i] = sub[i];
281 
282   delete[] subcopy;
283   return re;
284 }
285 
Concat(Regexp ** sub,int nsub,ParseFlags flags)286 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
287   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
288 }
289 
Alternate(Regexp ** sub,int nsub,ParseFlags flags)290 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
291   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
292 }
293 
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)294 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
295   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
296 }
297 
Capture(Regexp * sub,ParseFlags flags,int cap)298 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
299   Regexp* re = new Regexp(kRegexpCapture, flags);
300   re->AllocSub(1);
301   re->sub()[0] = sub;
302   re->cap_ = cap;
303   return re;
304 }
305 
Repeat(Regexp * sub,ParseFlags flags,int min,int max)306 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
307   Regexp* re = new Regexp(kRegexpRepeat, flags);
308   re->AllocSub(1);
309   re->sub()[0] = sub;
310   re->min_ = min;
311   re->max_ = max;
312   return re;
313 }
314 
NewLiteral(Rune rune,ParseFlags flags)315 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
316   Regexp* re = new Regexp(kRegexpLiteral, flags);
317   re->rune_ = rune;
318   return re;
319 }
320 
LiteralString(Rune * runes,int nrunes,ParseFlags flags)321 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
322   if (nrunes <= 0)
323     return new Regexp(kRegexpEmptyMatch, flags);
324   if (nrunes == 1)
325     return NewLiteral(runes[0], flags);
326   Regexp* re = new Regexp(kRegexpLiteralString, flags);
327   for (int i = 0; i < nrunes; i++)
328     re->AddRuneToString(runes[i]);
329   return re;
330 }
331 
NewCharClass(CharClass * cc,ParseFlags flags)332 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
333   Regexp* re = new Regexp(kRegexpCharClass, flags);
334   re->cc_ = cc;
335   return re;
336 }
337 
Swap(Regexp * that)338 void Regexp::Swap(Regexp* that) {
339   // Regexp is not trivially copyable, so we cannot freely copy it with
340   // memmove(3), but swapping objects like so is safe for our purposes.
341   char tmp[sizeof *this];
342   void* vthis = reinterpret_cast<void*>(this);
343   void* vthat = reinterpret_cast<void*>(that);
344   memmove(tmp, vthis, sizeof *this);
345   memmove(vthis, vthat, sizeof *this);
346   memmove(vthat, tmp, sizeof *this);
347 }
348 
349 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)350 static bool TopEqual(Regexp* a, Regexp* b) {
351   if (a->op() != b->op())
352     return false;
353 
354   switch (a->op()) {
355     case kRegexpNoMatch:
356     case kRegexpEmptyMatch:
357     case kRegexpAnyChar:
358     case kRegexpAnyByte:
359     case kRegexpBeginLine:
360     case kRegexpEndLine:
361     case kRegexpWordBoundary:
362     case kRegexpNoWordBoundary:
363     case kRegexpBeginText:
364       return true;
365 
366     case kRegexpEndText:
367       // The parse flags remember whether it's \z or (?-m:$),
368       // which matters when testing against PCRE.
369       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
370 
371     case kRegexpLiteral:
372       return a->rune() == b->rune() &&
373              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
374 
375     case kRegexpLiteralString:
376       return a->nrunes() == b->nrunes() &&
377              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
378              memcmp(a->runes(), b->runes(),
379                     a->nrunes() * sizeof a->runes()[0]) == 0;
380 
381     case kRegexpAlternate:
382     case kRegexpConcat:
383       return a->nsub() == b->nsub();
384 
385     case kRegexpStar:
386     case kRegexpPlus:
387     case kRegexpQuest:
388       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
389 
390     case kRegexpRepeat:
391       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
392              a->min() == b->min() &&
393              a->max() == b->max();
394 
395     case kRegexpCapture:
396       return a->cap() == b->cap() && a->name() == b->name();
397 
398     case kRegexpHaveMatch:
399       return a->match_id() == b->match_id();
400 
401     case kRegexpCharClass: {
402       CharClass* acc = a->cc();
403       CharClass* bcc = b->cc();
404       return acc->size() == bcc->size() &&
405              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
406              memcmp(acc->begin(), bcc->begin(),
407                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
408     }
409   }
410 
411   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
412   return 0;
413 }
414 
Equal(Regexp * a,Regexp * b)415 bool Regexp::Equal(Regexp* a, Regexp* b) {
416   if (a == NULL || b == NULL)
417     return a == b;
418 
419   if (!TopEqual(a, b))
420     return false;
421 
422   // Fast path:
423   // return without allocating vector if there are no subregexps.
424   switch (a->op()) {
425     case kRegexpAlternate:
426     case kRegexpConcat:
427     case kRegexpStar:
428     case kRegexpPlus:
429     case kRegexpQuest:
430     case kRegexpRepeat:
431     case kRegexpCapture:
432       break;
433 
434     default:
435       return true;
436   }
437 
438   // Committed to doing real work.
439   // The stack (vector) has pairs of regexps waiting to
440   // be compared.  The regexps are only equal if
441   // all the pairs end up being equal.
442   std::vector<Regexp*> stk;
443 
444   for (;;) {
445     // Invariant: TopEqual(a, b) == true.
446     Regexp* a2;
447     Regexp* b2;
448     switch (a->op()) {
449       default:
450         break;
451       case kRegexpAlternate:
452       case kRegexpConcat:
453         for (int i = 0; i < a->nsub(); i++) {
454           a2 = a->sub()[i];
455           b2 = b->sub()[i];
456           if (!TopEqual(a2, b2))
457             return false;
458           stk.push_back(a2);
459           stk.push_back(b2);
460         }
461         break;
462 
463       case kRegexpStar:
464       case kRegexpPlus:
465       case kRegexpQuest:
466       case kRegexpRepeat:
467       case kRegexpCapture:
468         a2 = a->sub()[0];
469         b2 = b->sub()[0];
470         if (!TopEqual(a2, b2))
471           return false;
472         // Really:
473         //   stk.push_back(a2);
474         //   stk.push_back(b2);
475         //   break;
476         // but faster to assign directly and loop.
477         a = a2;
478         b = b2;
479         continue;
480     }
481 
482     size_t n = stk.size();
483     if (n == 0)
484       break;
485 
486     DCHECK_GE(n, 2);
487     a = stk[n-2];
488     b = stk[n-1];
489     stk.resize(n-2);
490   }
491 
492   return true;
493 }
494 
495 // Keep in sync with enum RegexpStatusCode in regexp.h
496 static const char *kErrorStrings[] = {
497   "no error",
498   "unexpected error",
499   "invalid escape sequence",
500   "invalid character class",
501   "invalid character class range",
502   "missing ]",
503   "missing )",
504   "trailing \\",
505   "no argument for repetition operator",
506   "invalid repetition size",
507   "bad repetition operator",
508   "invalid perl operator",
509   "invalid UTF-8",
510   "invalid named capture group",
511 };
512 
CodeText(enum RegexpStatusCode code)513 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
514   if (code < 0 || code >= arraysize(kErrorStrings))
515     code = kRegexpInternalError;
516   return kErrorStrings[code];
517 }
518 
Text() const519 std::string RegexpStatus::Text() const {
520   if (error_arg_.empty())
521     return CodeText(code_);
522   std::string s;
523   s.append(CodeText(code_));
524   s.append(": ");
525   s.append(error_arg_.data(), error_arg_.size());
526   return s;
527 }
528 
Copy(const RegexpStatus & status)529 void RegexpStatus::Copy(const RegexpStatus& status) {
530   code_ = status.code_;
531   error_arg_ = status.error_arg_;
532 }
533 
534 typedef int Ignored;  // Walker<void> doesn't exist
535 
536 // Walker subclass to count capturing parens in regexp.
537 class NumCapturesWalker : public Regexp::Walker<Ignored> {
538  public:
NumCapturesWalker()539   NumCapturesWalker() : ncapture_(0) {}
ncapture()540   int ncapture() { return ncapture_; }
541 
PreVisit(Regexp * re,Ignored ignored,bool * stop)542   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
543     if (re->op() == kRegexpCapture)
544       ncapture_++;
545     return ignored;
546   }
ShortVisit(Regexp * re,Ignored ignored)547   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
548     // Should never be called: we use Walk not WalkExponential.
549     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550     return ignored;
551   }
552 
553  private:
554   int ncapture_;
555 
556   NumCapturesWalker(const NumCapturesWalker&) = delete;
557   NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558 };
559 
NumCaptures()560 int Regexp::NumCaptures() {
561   NumCapturesWalker w;
562   w.Walk(this, 0);
563   return w.ncapture();
564 }
565 
566 // Walker class to build map of named capture groups and their indices.
567 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568  public:
NamedCapturesWalker()569   NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()570   ~NamedCapturesWalker() { delete map_; }
571 
TakeMap()572   std::map<std::string, int>* TakeMap() {
573     std::map<std::string, int>* m = map_;
574     map_ = NULL;
575     return m;
576   }
577 
PreVisit(Regexp * re,Ignored ignored,bool * stop)578   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579     if (re->op() == kRegexpCapture && re->name() != NULL) {
580       // Allocate map once we find a name.
581       if (map_ == NULL)
582         map_ = new std::map<std::string, int>;
583 
584       // Record first occurrence of each name.
585       // (The rule is that if you have the same name
586       // multiple times, only the leftmost one counts.)
587       if (map_->find(*re->name()) == map_->end())
588         (*map_)[*re->name()] = re->cap();
589     }
590     return ignored;
591   }
592 
ShortVisit(Regexp * re,Ignored ignored)593   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594     // Should never be called: we use Walk not WalkExponential.
595     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596     return ignored;
597   }
598 
599  private:
600   std::map<std::string, int>* map_;
601 
602   NamedCapturesWalker(const NamedCapturesWalker&) = delete;
603   NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
604 };
605 
NamedCaptures()606 std::map<std::string, int>* Regexp::NamedCaptures() {
607   NamedCapturesWalker w;
608   w.Walk(this, 0);
609   return w.TakeMap();
610 }
611 
612 // Walker class to build map from capture group indices to their names.
613 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
614  public:
CaptureNamesWalker()615   CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()616   ~CaptureNamesWalker() { delete map_; }
617 
TakeMap()618   std::map<int, std::string>* TakeMap() {
619     std::map<int, std::string>* m = map_;
620     map_ = NULL;
621     return m;
622   }
623 
PreVisit(Regexp * re,Ignored ignored,bool * stop)624   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
625     if (re->op() == kRegexpCapture && re->name() != NULL) {
626       // Allocate map once we find a name.
627       if (map_ == NULL)
628         map_ = new std::map<int, std::string>;
629 
630       (*map_)[re->cap()] = *re->name();
631     }
632     return ignored;
633   }
634 
ShortVisit(Regexp * re,Ignored ignored)635   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
636     // Should never be called: we use Walk not WalkExponential.
637     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
638     return ignored;
639   }
640 
641  private:
642   std::map<int, std::string>* map_;
643 
644   CaptureNamesWalker(const CaptureNamesWalker&) = delete;
645   CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
646 };
647 
CaptureNames()648 std::map<int, std::string>* Regexp::CaptureNames() {
649   CaptureNamesWalker w;
650   w.Walk(this, 0);
651   return w.TakeMap();
652 }
653 
654 // Determines whether regexp matches must be anchored
655 // with a fixed string prefix.  If so, returns the prefix and
656 // the regexp that remains after the prefix.  The prefix might
657 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)658 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
659                             Regexp** suffix) {
660   // No need for a walker: the regexp must be of the form
661   // 1. some number of ^ anchors
662   // 2. a literal char or string
663   // 3. the rest
664   prefix->clear();
665   *foldcase = false;
666   *suffix = NULL;
667   if (op_ != kRegexpConcat)
668     return false;
669 
670   // Some number of anchors, then a literal or concatenation.
671   int i = 0;
672   Regexp** sub = this->sub();
673   while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
674     i++;
675   if (i == 0 || i >= nsub_)
676     return false;
677 
678   Regexp* re = sub[i];
679   switch (re->op_) {
680     default:
681       return false;
682 
683     case kRegexpLiteralString:
684       // Convert to string in proper encoding.
685       if (re->parse_flags() & Latin1) {
686         prefix->resize(re->nrunes_);
687         for (int j = 0; j < re->nrunes_; j++)
688           (*prefix)[j] = static_cast<char>(re->runes_[j]);
689       } else {
690         // Convert to UTF-8 in place.
691         // Assume worst-case space and then trim.
692         prefix->resize(re->nrunes_ * UTFmax);
693         char *p = &(*prefix)[0];
694         for (int j = 0; j < re->nrunes_; j++) {
695           Rune r = re->runes_[j];
696           if (r < Runeself)
697             *p++ = static_cast<char>(r);
698           else
699             p += runetochar(p, &r);
700         }
701         prefix->resize(p - &(*prefix)[0]);
702       }
703       break;
704 
705     case kRegexpLiteral:
706       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
707         prefix->append(1, static_cast<char>(re->rune_));
708       } else {
709         char buf[UTFmax];
710         prefix->append(buf, runetochar(buf, &re->rune_));
711       }
712       break;
713   }
714   *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
715   i++;
716 
717   // The rest.
718   if (i < nsub_) {
719     for (int j = i; j < nsub_; j++)
720       sub[j]->Incref();
721     re = Concat(sub + i, nsub_ - i, parse_flags());
722   } else {
723     re = new Regexp(kRegexpEmptyMatch, parse_flags());
724   }
725   *suffix = re;
726   return true;
727 }
728 
729 // Character class builder is a balanced binary tree (STL set)
730 // containing non-overlapping, non-abutting RuneRanges.
731 // The less-than operator used in the tree treats two
732 // ranges as equal if they overlap at all, so that
733 // lookups for a particular Rune are possible.
734 
CharClassBuilder()735 CharClassBuilder::CharClassBuilder() {
736   nrunes_ = 0;
737   upper_ = 0;
738   lower_ = 0;
739 }
740 
741 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)742 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
743   if (hi < lo)
744     return false;
745 
746   if (lo <= 'z' && hi >= 'A') {
747     // Overlaps some alpha, maybe not all.
748     // Update bitmaps telling which ASCII letters are in the set.
749     Rune lo1 = std::max<Rune>(lo, 'A');
750     Rune hi1 = std::min<Rune>(hi, 'Z');
751     if (lo1 <= hi1)
752       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
753 
754     lo1 = std::max<Rune>(lo, 'a');
755     hi1 = std::min<Rune>(hi, 'z');
756     if (lo1 <= hi1)
757       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
758   }
759 
760   {  // Check whether lo, hi is already in the class.
761     iterator it = ranges_.find(RuneRange(lo, lo));
762     if (it != end() && it->lo <= lo && hi <= it->hi)
763       return false;
764   }
765 
766   // Look for a range abutting lo on the left.
767   // If it exists, take it out and increase our range.
768   if (lo > 0) {
769     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
770     if (it != end()) {
771       lo = it->lo;
772       if (it->hi > hi)
773         hi = it->hi;
774       nrunes_ -= it->hi - it->lo + 1;
775       ranges_.erase(it);
776     }
777   }
778 
779   // Look for a range abutting hi on the right.
780   // If it exists, take it out and increase our range.
781   if (hi < Runemax) {
782     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
783     if (it != end()) {
784       hi = it->hi;
785       nrunes_ -= it->hi - it->lo + 1;
786       ranges_.erase(it);
787     }
788   }
789 
790   // Look for ranges between lo and hi.  Take them out.
791   // This is only safe because the set has no overlapping ranges.
792   // We've already removed any ranges abutting lo and hi, so
793   // any that overlap [lo, hi] must be contained within it.
794   for (;;) {
795     iterator it = ranges_.find(RuneRange(lo, hi));
796     if (it == end())
797       break;
798     nrunes_ -= it->hi - it->lo + 1;
799     ranges_.erase(it);
800   }
801 
802   // Finally, add [lo, hi].
803   nrunes_ += hi - lo + 1;
804   ranges_.insert(RuneRange(lo, hi));
805   return true;
806 }
807 
AddCharClass(CharClassBuilder * cc)808 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
809   for (iterator it = cc->begin(); it != cc->end(); ++it)
810     AddRange(it->lo, it->hi);
811 }
812 
Contains(Rune r)813 bool CharClassBuilder::Contains(Rune r) {
814   return ranges_.find(RuneRange(r, r)) != end();
815 }
816 
817 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()818 bool CharClassBuilder::FoldsASCII() {
819   return ((upper_ ^ lower_) & AlphaMask) == 0;
820 }
821 
Copy()822 CharClassBuilder* CharClassBuilder::Copy() {
823   CharClassBuilder* cc = new CharClassBuilder;
824   for (iterator it = begin(); it != end(); ++it)
825     cc->ranges_.insert(RuneRange(it->lo, it->hi));
826   cc->upper_ = upper_;
827   cc->lower_ = lower_;
828   cc->nrunes_ = nrunes_;
829   return cc;
830 }
831 
832 
833 
RemoveAbove(Rune r)834 void CharClassBuilder::RemoveAbove(Rune r) {
835   if (r >= Runemax)
836     return;
837 
838   if (r < 'z') {
839     if (r < 'a')
840       lower_ = 0;
841     else
842       lower_ &= AlphaMask >> ('z' - r);
843   }
844 
845   if (r < 'Z') {
846     if (r < 'A')
847       upper_ = 0;
848     else
849       upper_ &= AlphaMask >> ('Z' - r);
850   }
851 
852   for (;;) {
853 
854     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
855     if (it == end())
856       break;
857     RuneRange rr = *it;
858     ranges_.erase(it);
859     nrunes_ -= rr.hi - rr.lo + 1;
860     if (rr.lo <= r) {
861       rr.hi = r;
862       ranges_.insert(rr);
863       nrunes_ += rr.hi - rr.lo + 1;
864     }
865   }
866 }
867 
Negate()868 void CharClassBuilder::Negate() {
869   // Build up negation and then copy in.
870   // Could edit ranges in place, but C++ won't let me.
871   std::vector<RuneRange> v;
872   v.reserve(ranges_.size() + 1);
873 
874   // In negation, first range begins at 0, unless
875   // the current class begins at 0.
876   iterator it = begin();
877   if (it == end()) {
878     v.push_back(RuneRange(0, Runemax));
879   } else {
880     int nextlo = 0;
881     if (it->lo == 0) {
882       nextlo = it->hi + 1;
883       ++it;
884     }
885     for (; it != end(); ++it) {
886       v.push_back(RuneRange(nextlo, it->lo - 1));
887       nextlo = it->hi + 1;
888     }
889     if (nextlo <= Runemax)
890       v.push_back(RuneRange(nextlo, Runemax));
891   }
892 
893   ranges_.clear();
894   for (size_t i = 0; i < v.size(); i++)
895     ranges_.insert(v[i]);
896 
897   upper_ = AlphaMask & ~upper_;
898   lower_ = AlphaMask & ~lower_;
899   nrunes_ = Runemax+1 - nrunes_;
900 }
901 
902 // Character class is a sorted list of ranges.
903 // The ranges are allocated in the same block as the header,
904 // necessitating a special allocator and Delete method.
905 
New(int maxranges)906 CharClass* CharClass::New(int maxranges) {
907   CharClass* cc;
908   uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
909   cc = reinterpret_cast<CharClass*>(data);
910   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
911   cc->nranges_ = 0;
912   cc->folds_ascii_ = false;
913   cc->nrunes_ = 0;
914   return cc;
915 }
916 
Delete()917 void CharClass::Delete() {
918   uint8_t* data = reinterpret_cast<uint8_t*>(this);
919   delete[] data;
920 }
921 
Negate()922 CharClass* CharClass::Negate() {
923   CharClass* cc = CharClass::New(nranges_+1);
924   cc->folds_ascii_ = folds_ascii_;
925   cc->nrunes_ = Runemax + 1 - nrunes_;
926   int n = 0;
927   int nextlo = 0;
928   for (CharClass::iterator it = begin(); it != end(); ++it) {
929     if (it->lo == nextlo) {
930       nextlo = it->hi + 1;
931     } else {
932       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
933       nextlo = it->hi + 1;
934     }
935   }
936   if (nextlo <= Runemax)
937     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
938   cc->nranges_ = n;
939   return cc;
940 }
941 
Contains(Rune r)942 bool CharClass::Contains(Rune r) {
943   RuneRange* rr = ranges_;
944   int n = nranges_;
945   while (n > 0) {
946     int m = n/2;
947     if (rr[m].hi < r) {
948       rr += m+1;
949       n -= m+1;
950     } else if (r < rr[m].lo) {
951       n = m;
952     } else {  // rr[m].lo <= r && r <= rr[m].hi
953       return true;
954     }
955   }
956   return false;
957 }
958 
GetCharClass()959 CharClass* CharClassBuilder::GetCharClass() {
960   CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
961   int n = 0;
962   for (iterator it = begin(); it != end(); ++it)
963     cc->ranges_[n++] = *it;
964   cc->nranges_ = n;
965   DCHECK_LE(n, static_cast<int>(ranges_.size()));
966   cc->nrunes_ = nrunes_;
967   cc->folds_ascii_ = FoldsASCII();
968   return cc;
969 }
970 
971 }  // namespace re2
972