• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression representation.
6 // Tested by parse_test.cc
7 
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/stringpiece.h"
11 #include "re2/walker-inl.h"
12 
13 namespace re2 {
14 
15 // Constructor.  Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17   : op_(op),
18     simple_(false),
19     parse_flags_(static_cast<uint16>(parse_flags)),
20     ref_(1),
21     nsub_(0),
22     down_(NULL) {
23   subone_ = NULL;
24   memset(the_union_, 0, sizeof the_union_);
25 }
26 
27 // Destructor.  Assumes already cleaned up children.
28 // Private: use Decref() instead of delete to destroy Regexps.
29 // Can't call Decref on the sub-Regexps here because
30 // that could cause arbitrarily deep recursion, so
31 // required Decref() to have handled them for us.
~Regexp()32 Regexp::~Regexp() {
33   if (nsub_ > 0)
34     LOG(DFATAL) << "Regexp not destroyed.";
35 
36   switch (op_) {
37     default:
38       break;
39     case kRegexpCapture:
40       delete name_;
41       break;
42     case kRegexpLiteralString:
43       delete[] runes_;
44       break;
45     case kRegexpCharClass:
46       cc_->Delete();
47       delete ccb_;
48       break;
49   }
50 }
51 
52 // If it's possible to destroy this regexp without recurring,
53 // do so and return true.  Else return false.
QuickDestroy()54 bool Regexp::QuickDestroy() {
55   if (nsub_ == 0) {
56     delete this;
57     return true;
58   }
59   return false;
60 }
61 
62 static map<Regexp*, int> ref_map;
63 static Mutex ref_mutex;
64 
Ref()65 int Regexp::Ref() {
66   if (ref_ < kMaxRef)
67     return ref_;
68 
69   MutexLock l(&ref_mutex);
70   return ref_map[this];
71 }
72 
73 // Increments reference count, returns object as convenience.
Incref()74 Regexp* Regexp::Incref() {
75   if (ref_ >= kMaxRef-1) {
76     // Store ref count in overflow map.
77     MutexLock l(&ref_mutex);
78     if (ref_ == kMaxRef) {  // already overflowed
79       ref_map[this]++;
80       return this;
81     }
82     // overflowing now
83     ref_map[this] = kMaxRef;
84     ref_ = kMaxRef;
85     return this;
86   }
87 
88   ref_++;
89   return this;
90 }
91 
92 // Decrements reference count and deletes this object if count reaches 0.
Decref()93 void Regexp::Decref() {
94   if (ref_ == kMaxRef) {
95     // Ref count is stored in overflow map.
96     MutexLock l(&ref_mutex);
97     int r = ref_map[this] - 1;
98     if (r < kMaxRef) {
99       ref_ = r;
100       ref_map.erase(this);
101     } else {
102       ref_map[this] = r;
103     }
104     return;
105   }
106   ref_--;
107   if (ref_ == 0)
108     Destroy();
109 }
110 
111 // Deletes this object; ref count has count reached 0.
Destroy()112 void Regexp::Destroy() {
113   if (QuickDestroy())
114     return;
115 
116   // Handle recursive Destroy with explicit stack
117   // to avoid arbitrarily deep recursion on process stack [sigh].
118   down_ = NULL;
119   Regexp* stack = this;
120   while (stack != NULL) {
121     Regexp* re = stack;
122     stack = re->down_;
123     if (re->ref_ != 0)
124       LOG(DFATAL) << "Bad reference count " << re->ref_;
125     if (re->nsub_ > 0) {
126       Regexp** subs = re->sub();
127       for (int i = 0; i < re->nsub_; i++) {
128         Regexp* sub = subs[i];
129         if (sub == NULL)
130           continue;
131         if (sub->ref_ == kMaxRef)
132           sub->Decref();
133         else
134           --sub->ref_;
135         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
136           sub->down_ = stack;
137           stack = sub;
138         }
139       }
140       if (re->nsub_ > 1)
141         delete[] subs;
142       re->nsub_ = 0;
143     }
144     delete re;
145   }
146 }
147 
AddRuneToString(Rune r)148 void Regexp::AddRuneToString(Rune r) {
149   DCHECK(op_ == kRegexpLiteralString);
150   if (nrunes_ == 0) {
151     // start with 8
152     runes_ = new Rune[8];
153   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
154     // double on powers of two
155     Rune *old = runes_;
156     runes_ = new Rune[nrunes_ * 2];
157     for (int i = 0; i < nrunes_; i++)
158       runes_[i] = old[i];
159     delete[] old;
160   }
161 
162   runes_[nrunes_++] = r;
163 }
164 
HaveMatch(int match_id,ParseFlags flags)165 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
166   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
167   re->match_id_ = match_id;
168   return re;
169 }
170 
Plus(Regexp * sub,ParseFlags flags)171 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
172   if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
173     return sub;
174   Regexp* re = new Regexp(kRegexpPlus, flags);
175   re->AllocSub(1);
176   re->sub()[0] = sub;
177   return re;
178 }
179 
Star(Regexp * sub,ParseFlags flags)180 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
181   if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
182     return sub;
183   Regexp* re = new Regexp(kRegexpStar, flags);
184   re->AllocSub(1);
185   re->sub()[0] = sub;
186   return re;
187 }
188 
Quest(Regexp * sub,ParseFlags flags)189 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
190   if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
191     return sub;
192   Regexp* re = new Regexp(kRegexpQuest, flags);
193   re->AllocSub(1);
194   re->sub()[0] = sub;
195   return re;
196 }
197 
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)198 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
199                                   ParseFlags flags, bool can_factor) {
200   if (nsub == 1)
201     return sub[0];
202 
203   Regexp** subcopy = NULL;
204   if (op == kRegexpAlternate && can_factor) {
205     // Going to edit sub; make a copy so we don't step on caller.
206     subcopy = new Regexp*[nsub];
207     memmove(subcopy, sub, nsub * sizeof sub[0]);
208     sub = subcopy;
209     nsub = FactorAlternation(sub, nsub, flags);
210     if (nsub == 1) {
211       Regexp* re = sub[0];
212       delete[] subcopy;
213       return re;
214     }
215   }
216 
217   if (nsub > kMaxNsub) {
218     // Too many subexpressions to fit in a single Regexp.
219     // Make a two-level tree.  Two levels gets us to 65535^2.
220     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
221     Regexp* re = new Regexp(op, flags);
222     re->AllocSub(nbigsub);
223     Regexp** subs = re->sub();
224     for (int i = 0; i < nbigsub - 1; i++)
225       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
226     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
227                                           nsub - (nbigsub-1)*kMaxNsub, flags,
228                                           false);
229     delete[] subcopy;
230     return re;
231   }
232 
233   Regexp* re = new Regexp(op, flags);
234   re->AllocSub(nsub);
235   Regexp** subs = re->sub();
236   for (int i = 0; i < nsub; i++)
237     subs[i] = sub[i];
238 
239   delete[] subcopy;
240   return re;
241 }
242 
Concat(Regexp ** sub,int nsub,ParseFlags flags)243 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
244   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
245 }
246 
Alternate(Regexp ** sub,int nsub,ParseFlags flags)247 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
248   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
249 }
250 
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)251 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
252   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
253 }
254 
Capture(Regexp * sub,ParseFlags flags,int cap)255 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
256   Regexp* re = new Regexp(kRegexpCapture, flags);
257   re->AllocSub(1);
258   re->sub()[0] = sub;
259   re->cap_ = cap;
260   return re;
261 }
262 
Repeat(Regexp * sub,ParseFlags flags,int min,int max)263 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
264   Regexp* re = new Regexp(kRegexpRepeat, flags);
265   re->AllocSub(1);
266   re->sub()[0] = sub;
267   re->min_ = min;
268   re->max_ = max;
269   return re;
270 }
271 
NewLiteral(Rune rune,ParseFlags flags)272 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
273   Regexp* re = new Regexp(kRegexpLiteral, flags);
274   re->rune_ = rune;
275   return re;
276 }
277 
LiteralString(Rune * runes,int nrunes,ParseFlags flags)278 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
279   if (nrunes <= 0)
280     return new Regexp(kRegexpEmptyMatch, flags);
281   if (nrunes == 1)
282     return NewLiteral(runes[0], flags);
283   Regexp* re = new Regexp(kRegexpLiteralString, flags);
284   for (int i = 0; i < nrunes; i++)
285     re->AddRuneToString(runes[i]);
286   return re;
287 }
288 
NewCharClass(CharClass * cc,ParseFlags flags)289 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
290   Regexp* re = new Regexp(kRegexpCharClass, flags);
291   re->cc_ = cc;
292   return re;
293 }
294 
295 // Swaps this and that in place.
Swap(Regexp * that)296 void Regexp::Swap(Regexp* that) {
297   // Can use memmove because Regexp is just a struct (no vtable).
298   char tmp[sizeof *this];
299   memmove(tmp, this, sizeof tmp);
300   memmove(this, that, sizeof tmp);
301   memmove(that, tmp, sizeof tmp);
302 }
303 
304 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)305 static bool TopEqual(Regexp* a, Regexp* b) {
306   if (a->op() != b->op())
307     return false;
308 
309   switch (a->op()) {
310     case kRegexpNoMatch:
311     case kRegexpEmptyMatch:
312     case kRegexpAnyChar:
313     case kRegexpAnyByte:
314     case kRegexpBeginLine:
315     case kRegexpEndLine:
316     case kRegexpWordBoundary:
317     case kRegexpNoWordBoundary:
318     case kRegexpBeginText:
319       return true;
320 
321     case kRegexpEndText:
322       // The parse flags remember whether it's \z or (?-m:$),
323       // which matters when testing against PCRE.
324       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
325 
326     case kRegexpLiteral:
327       return a->rune() == b->rune() &&
328              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
329 
330     case kRegexpLiteralString:
331       return a->nrunes() == b->nrunes() &&
332              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
333              memcmp(a->runes(), b->runes(),
334                     a->nrunes() * sizeof a->runes()[0]) == 0;
335 
336     case kRegexpAlternate:
337     case kRegexpConcat:
338       return a->nsub() == b->nsub();
339 
340     case kRegexpStar:
341     case kRegexpPlus:
342     case kRegexpQuest:
343       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
344 
345     case kRegexpRepeat:
346       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
347              a->min() == b->min() &&
348              a->max() == b->max();
349 
350     case kRegexpCapture:
351       return a->cap() == b->cap() && a->name() == b->name();
352 
353     case kRegexpHaveMatch:
354       return a->match_id() == b->match_id();
355 
356     case kRegexpCharClass: {
357       CharClass* acc = a->cc();
358       CharClass* bcc = b->cc();
359       return acc->size() == bcc->size() &&
360              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
361              memcmp(acc->begin(), bcc->begin(),
362                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
363     }
364   }
365 
366   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
367   return 0;
368 }
369 
Equal(Regexp * a,Regexp * b)370 bool Regexp::Equal(Regexp* a, Regexp* b) {
371   if (a == NULL || b == NULL)
372     return a == b;
373 
374   if (!TopEqual(a, b))
375     return false;
376 
377   // Fast path:
378   // return without allocating vector if there are no subregexps.
379   switch (a->op()) {
380     case kRegexpAlternate:
381     case kRegexpConcat:
382     case kRegexpStar:
383     case kRegexpPlus:
384     case kRegexpQuest:
385     case kRegexpRepeat:
386     case kRegexpCapture:
387       break;
388 
389     default:
390       return true;
391   }
392 
393   // Committed to doing real work.
394   // The stack (vector) has pairs of regexps waiting to
395   // be compared.  The regexps are only equal if
396   // all the pairs end up being equal.
397   vector<Regexp*> stk;
398 
399   for (;;) {
400     // Invariant: TopEqual(a, b) == true.
401     Regexp* a2;
402     Regexp* b2;
403     switch (a->op()) {
404       default:
405         break;
406       case kRegexpAlternate:
407       case kRegexpConcat:
408         for (int i = 0; i < a->nsub(); i++) {
409           a2 = a->sub()[i];
410           b2 = b->sub()[i];
411           if (!TopEqual(a2, b2))
412             return false;
413           stk.push_back(a2);
414           stk.push_back(b2);
415         }
416         break;
417 
418       case kRegexpStar:
419       case kRegexpPlus:
420       case kRegexpQuest:
421       case kRegexpRepeat:
422       case kRegexpCapture:
423         a2 = a->sub()[0];
424         b2 = b->sub()[0];
425         if (!TopEqual(a2, b2))
426           return false;
427         // Really:
428         //   stk.push_back(a2);
429         //   stk.push_back(b2);
430         //   break;
431         // but faster to assign directly and loop.
432         a = a2;
433         b = b2;
434         continue;
435     }
436 
437     int n = stk.size();
438     if (n == 0)
439       break;
440 
441     a = stk[n-2];
442     b = stk[n-1];
443     stk.resize(n-2);
444   }
445 
446   return true;
447 }
448 
449 // Keep in sync with enum RegexpStatusCode in regexp.h
450 static const string kErrorStrings[] = {
451   "no error",
452   "unexpected error",
453   "invalid escape sequence",
454   "invalid character class",
455   "invalid character class range",
456   "missing ]",
457   "missing )",
458   "trailing \\",
459   "no argument for repetition operator",
460   "invalid repetition size",
461   "bad repetition operator",
462   "invalid perl operator",
463   "invalid UTF-8",
464   "invalid named capture group",
465 };
466 
CodeText(enum RegexpStatusCode code)467 const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
468   if (code < 0 || code >= arraysize(kErrorStrings))
469     code = kRegexpInternalError;
470   return kErrorStrings[code];
471 }
472 
Text() const473 string RegexpStatus::Text() const {
474   if (error_arg_.empty())
475     return CodeText(code_);
476   string s;
477   s.append(CodeText(code_));
478   s.append(": ");
479   s.append(error_arg_.data(), error_arg_.size());
480   return s;
481 }
482 
Copy(const RegexpStatus & status)483 void RegexpStatus::Copy(const RegexpStatus& status) {
484   code_ = status.code_;
485   error_arg_ = status.error_arg_;
486 }
487 
488 typedef int Ignored;  // Walker<void> doesn't exist
489 
490 // Walker subclass to count capturing parens in regexp.
491 class NumCapturesWalker : public Regexp::Walker<Ignored> {
492  public:
NumCapturesWalker()493   NumCapturesWalker() : ncapture_(0) {}
ncapture()494   int ncapture() { return ncapture_; }
495 
PreVisit(Regexp * re,Ignored ignored,bool * stop)496   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
497     if (re->op() == kRegexpCapture)
498       ncapture_++;
499     return ignored;
500   }
ShortVisit(Regexp * re,Ignored ignored)501   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
502     // Should never be called: we use Walk not WalkExponential.
503     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
504     return ignored;
505   }
506 
507  private:
508   int ncapture_;
509   DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
510 };
511 
NumCaptures()512 int Regexp::NumCaptures() {
513   NumCapturesWalker w;
514   w.Walk(this, 0);
515   return w.ncapture();
516 }
517 
518 // Walker class to build map of named capture groups and their indices.
519 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
520  public:
NamedCapturesWalker()521   NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()522   ~NamedCapturesWalker() { delete map_; }
523 
TakeMap()524   map<string, int>* TakeMap() {
525     map<string, int>* m = map_;
526     map_ = NULL;
527     return m;
528   }
529 
PreVisit(Regexp * re,Ignored ignored,bool * stop)530   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
531     if (re->op() == kRegexpCapture && re->name() != NULL) {
532       // Allocate map once we find a name.
533       if (map_ == NULL)
534         map_ = new map<string, int>;
535 
536       // Record first occurrence of each name.
537       // (The rule is that if you have the same name
538       // multiple times, only the leftmost one counts.)
539       if (map_->find(*re->name()) == map_->end())
540         (*map_)[*re->name()] = re->cap();
541     }
542     return ignored;
543   }
544 
ShortVisit(Regexp * re,Ignored ignored)545   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546     // Should never be called: we use Walk not WalkExponential.
547     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
548     return ignored;
549   }
550 
551  private:
552   map<string, int>* map_;
553   DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
554 };
555 
NamedCaptures()556 map<string, int>* Regexp::NamedCaptures() {
557   NamedCapturesWalker w;
558   w.Walk(this, 0);
559   return w.TakeMap();
560 }
561 
562 // Walker class to build map from capture group indices to their names.
563 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
564  public:
CaptureNamesWalker()565   CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()566   ~CaptureNamesWalker() { delete map_; }
567 
TakeMap()568   map<int, string>* TakeMap() {
569     map<int, string>* m = map_;
570     map_ = NULL;
571     return m;
572   }
573 
PreVisit(Regexp * re,Ignored ignored,bool * stop)574   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
575     if (re->op() == kRegexpCapture && re->name() != NULL) {
576       // Allocate map once we find a name.
577       if (map_ == NULL)
578         map_ = new map<int, string>;
579 
580       (*map_)[re->cap()] = *re->name();
581     }
582     return ignored;
583   }
584 
ShortVisit(Regexp * re,Ignored ignored)585   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
586     // Should never be called: we use Walk not WalkExponential.
587     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
588     return ignored;
589   }
590 
591  private:
592   map<int, string>* map_;
593   DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
594 };
595 
CaptureNames()596 map<int, string>* Regexp::CaptureNames() {
597   CaptureNamesWalker w;
598   w.Walk(this, 0);
599   return w.TakeMap();
600 }
601 
602 // Determines whether regexp matches must be anchored
603 // with a fixed string prefix.  If so, returns the prefix and
604 // the regexp that remains after the prefix.  The prefix might
605 // be ASCII case-insensitive.
RequiredPrefix(string * prefix,bool * foldcase,Regexp ** suffix)606 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
607   // No need for a walker: the regexp must be of the form
608   // 1. some number of ^ anchors
609   // 2. a literal char or string
610   // 3. the rest
611   prefix->clear();
612   *foldcase = false;
613   *suffix = NULL;
614   if (op_ != kRegexpConcat)
615     return false;
616 
617   // Some number of anchors, then a literal or concatenation.
618   int i = 0;
619   Regexp** sub = this->sub();
620   while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
621     i++;
622   if (i == 0 || i >= nsub_)
623     return false;
624 
625   Regexp* re = sub[i];
626   switch (re->op_) {
627     default:
628       return false;
629 
630     case kRegexpLiteralString:
631       // Convert to string in proper encoding.
632       if (re->parse_flags() & Latin1) {
633         prefix->resize(re->nrunes_);
634         for (int j = 0; j < re->nrunes_; j++)
635           (*prefix)[j] = re->runes_[j];
636       } else {
637         // Convert to UTF-8 in place.
638         // Assume worst-case space and then trim.
639         prefix->resize(re->nrunes_ * UTFmax);
640         char *p = &(*prefix)[0];
641         for (int j = 0; j < re->nrunes_; j++) {
642           Rune r = re->runes_[j];
643           if (r < Runeself)
644             *p++ = r;
645           else
646             p += runetochar(p, &r);
647         }
648         prefix->resize(p - &(*prefix)[0]);
649       }
650       break;
651 
652     case kRegexpLiteral:
653       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
654         prefix->append(1, re->rune_);
655       } else {
656         char buf[UTFmax];
657         prefix->append(buf, runetochar(buf, &re->rune_));
658       }
659       break;
660   }
661   *foldcase = (sub[i]->parse_flags() & FoldCase);
662   i++;
663 
664   // The rest.
665   if (i < nsub_) {
666     for (int j = i; j < nsub_; j++)
667       sub[j]->Incref();
668     re = Concat(sub + i, nsub_ - i, parse_flags());
669   } else {
670     re = new Regexp(kRegexpEmptyMatch, parse_flags());
671   }
672   *suffix = re;
673   return true;
674 }
675 
676 // Character class builder is a balanced binary tree (STL set)
677 // containing non-overlapping, non-abutting RuneRanges.
678 // The less-than operator used in the tree treats two
679 // ranges as equal if they overlap at all, so that
680 // lookups for a particular Rune are possible.
681 
CharClassBuilder()682 CharClassBuilder::CharClassBuilder() {
683   nrunes_ = 0;
684   upper_ = 0;
685   lower_ = 0;
686 }
687 
688 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)689 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
690   if (hi < lo)
691     return false;
692 
693   if (lo <= 'z' && hi >= 'A') {
694     // Overlaps some alpha, maybe not all.
695     // Update bitmaps telling which ASCII letters are in the set.
696     Rune lo1 = max<Rune>(lo, 'A');
697     Rune hi1 = min<Rune>(hi, 'Z');
698     if (lo1 <= hi1)
699       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
700 
701     lo1 = max<Rune>(lo, 'a');
702     hi1 = min<Rune>(hi, 'z');
703     if (lo1 <= hi1)
704       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
705   }
706 
707   {  // Check whether lo, hi is already in the class.
708     iterator it = ranges_.find(RuneRange(lo, lo));
709     if (it != end() && it->lo <= lo && hi <= it->hi)
710       return false;
711   }
712 
713   // Look for a range abutting lo on the left.
714   // If it exists, take it out and increase our range.
715   if (lo > 0) {
716     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
717     if (it != end()) {
718       lo = it->lo;
719       if (it->hi > hi)
720         hi = it->hi;
721       nrunes_ -= it->hi - it->lo + 1;
722       ranges_.erase(it);
723     }
724   }
725 
726   // Look for a range abutting hi on the right.
727   // If it exists, take it out and increase our range.
728   if (hi < Runemax) {
729     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
730     if (it != end()) {
731       hi = it->hi;
732       nrunes_ -= it->hi - it->lo + 1;
733       ranges_.erase(it);
734     }
735   }
736 
737   // Look for ranges between lo and hi.  Take them out.
738   // This is only safe because the set has no overlapping ranges.
739   // We've already removed any ranges abutting lo and hi, so
740   // any that overlap [lo, hi] must be contained within it.
741   for (;;) {
742     iterator it = ranges_.find(RuneRange(lo, hi));
743     if (it == end())
744       break;
745     nrunes_ -= it->hi - it->lo + 1;
746     ranges_.erase(it);
747   }
748 
749   // Finally, add [lo, hi].
750   nrunes_ += hi - lo + 1;
751   ranges_.insert(RuneRange(lo, hi));
752   return true;
753 }
754 
AddCharClass(CharClassBuilder * cc)755 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
756   for (iterator it = cc->begin(); it != cc->end(); ++it)
757     AddRange(it->lo, it->hi);
758 }
759 
Contains(Rune r)760 bool CharClassBuilder::Contains(Rune r) {
761   return ranges_.find(RuneRange(r, r)) != end();
762 }
763 
764 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()765 bool CharClassBuilder::FoldsASCII() {
766   return ((upper_ ^ lower_) & AlphaMask) == 0;
767 }
768 
Copy()769 CharClassBuilder* CharClassBuilder::Copy() {
770   CharClassBuilder* cc = new CharClassBuilder;
771   for (iterator it = begin(); it != end(); ++it)
772     cc->ranges_.insert(RuneRange(it->lo, it->hi));
773   cc->upper_ = upper_;
774   cc->lower_ = lower_;
775   cc->nrunes_ = nrunes_;
776   return cc;
777 }
778 
779 
780 
RemoveAbove(Rune r)781 void CharClassBuilder::RemoveAbove(Rune r) {
782   if (r >= Runemax)
783     return;
784 
785   if (r < 'z') {
786     if (r < 'a')
787       lower_ = 0;
788     else
789       lower_ &= AlphaMask >> ('z' - r);
790   }
791 
792   if (r < 'Z') {
793     if (r < 'A')
794       upper_ = 0;
795     else
796       upper_ &= AlphaMask >> ('Z' - r);
797   }
798 
799   for (;;) {
800 
801     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
802     if (it == end())
803       break;
804     RuneRange rr = *it;
805     ranges_.erase(it);
806     nrunes_ -= rr.hi - rr.lo + 1;
807     if (rr.lo <= r) {
808       rr.hi = r;
809       ranges_.insert(rr);
810       nrunes_ += rr.hi - rr.lo + 1;
811     }
812   }
813 }
814 
Negate()815 void CharClassBuilder::Negate() {
816   // Build up negation and then copy in.
817   // Could edit ranges in place, but C++ won't let me.
818   vector<RuneRange> v;
819   v.reserve(ranges_.size() + 1);
820 
821   // In negation, first range begins at 0, unless
822   // the current class begins at 0.
823   iterator it = begin();
824   if (it == end()) {
825     v.push_back(RuneRange(0, Runemax));
826   } else {
827     int nextlo = 0;
828     if (it->lo == 0) {
829       nextlo = it->hi + 1;
830       ++it;
831     }
832     for (; it != end(); ++it) {
833       v.push_back(RuneRange(nextlo, it->lo - 1));
834       nextlo = it->hi + 1;
835     }
836     if (nextlo <= Runemax)
837       v.push_back(RuneRange(nextlo, Runemax));
838   }
839 
840   ranges_.clear();
841   for (int i = 0; i < v.size(); i++)
842     ranges_.insert(v[i]);
843 
844   upper_ = AlphaMask & ~upper_;
845   lower_ = AlphaMask & ~lower_;
846   nrunes_ = Runemax+1 - nrunes_;
847 }
848 
849 // Character class is a sorted list of ranges.
850 // The ranges are allocated in the same block as the header,
851 // necessitating a special allocator and Delete method.
852 
New(int maxranges)853 CharClass* CharClass::New(int maxranges) {
854   CharClass* cc;
855   uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
856   cc = reinterpret_cast<CharClass*>(data);
857   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
858   cc->nranges_ = 0;
859   cc->folds_ascii_ = false;
860   cc->nrunes_ = 0;
861   return cc;
862 }
863 
Delete()864 void CharClass::Delete() {
865   if (this == NULL)
866     return;
867   uint8 *data = reinterpret_cast<uint8*>(this);
868   delete[] data;
869 }
870 
Negate()871 CharClass* CharClass::Negate() {
872   CharClass* cc = CharClass::New(nranges_+1);
873   cc->folds_ascii_ = folds_ascii_;
874   cc->nrunes_ = Runemax + 1 - nrunes_;
875   int n = 0;
876   int nextlo = 0;
877   for (CharClass::iterator it = begin(); it != end(); ++it) {
878     if (it->lo == nextlo) {
879       nextlo = it->hi + 1;
880     } else {
881       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
882       nextlo = it->hi + 1;
883     }
884   }
885   if (nextlo <= Runemax)
886     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
887   cc->nranges_ = n;
888   return cc;
889 }
890 
Contains(Rune r)891 bool CharClass::Contains(Rune r) {
892   RuneRange* rr = ranges_;
893   int n = nranges_;
894   while (n > 0) {
895     int m = n/2;
896     if (rr[m].hi < r) {
897       rr += m+1;
898       n -= m+1;
899     } else if (r < rr[m].lo) {
900       n = m;
901     } else {  // rr[m].lo <= r && r <= rr[m].hi
902       return true;
903     }
904   }
905   return false;
906 }
907 
GetCharClass()908 CharClass* CharClassBuilder::GetCharClass() {
909   CharClass* cc = CharClass::New(ranges_.size());
910   int n = 0;
911   for (iterator it = begin(); it != end(); ++it)
912     cc->ranges_[n++] = *it;
913   cc->nranges_ = n;
914   DCHECK_LE(n, ranges_.size());
915   cc->nrunes_ = nrunes_;
916   cc->folds_ascii_ = FoldsASCII();
917   return cc;
918 }
919 
920 }  // namespace re2
921