1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/stringpiece.h"
11 #include "re2/walker-inl.h"
12
13 namespace re2 {
14
15 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17 : op_(op),
18 simple_(false),
19 parse_flags_(static_cast<uint16>(parse_flags)),
20 ref_(1),
21 nsub_(0),
22 down_(NULL) {
23 subone_ = NULL;
24 memset(the_union_, 0, sizeof the_union_);
25 }
26
27 // Destructor. Assumes already cleaned up children.
28 // Private: use Decref() instead of delete to destroy Regexps.
29 // Can't call Decref on the sub-Regexps here because
30 // that could cause arbitrarily deep recursion, so
31 // required Decref() to have handled them for us.
~Regexp()32 Regexp::~Regexp() {
33 if (nsub_ > 0)
34 LOG(DFATAL) << "Regexp not destroyed.";
35
36 switch (op_) {
37 default:
38 break;
39 case kRegexpCapture:
40 delete name_;
41 break;
42 case kRegexpLiteralString:
43 delete[] runes_;
44 break;
45 case kRegexpCharClass:
46 cc_->Delete();
47 delete ccb_;
48 break;
49 }
50 }
51
52 // If it's possible to destroy this regexp without recurring,
53 // do so and return true. Else return false.
QuickDestroy()54 bool Regexp::QuickDestroy() {
55 if (nsub_ == 0) {
56 delete this;
57 return true;
58 }
59 return false;
60 }
61
62 static map<Regexp*, int> ref_map;
63 static Mutex ref_mutex;
64
Ref()65 int Regexp::Ref() {
66 if (ref_ < kMaxRef)
67 return ref_;
68
69 MutexLock l(&ref_mutex);
70 return ref_map[this];
71 }
72
73 // Increments reference count, returns object as convenience.
Incref()74 Regexp* Regexp::Incref() {
75 if (ref_ >= kMaxRef-1) {
76 // Store ref count in overflow map.
77 MutexLock l(&ref_mutex);
78 if (ref_ == kMaxRef) { // already overflowed
79 ref_map[this]++;
80 return this;
81 }
82 // overflowing now
83 ref_map[this] = kMaxRef;
84 ref_ = kMaxRef;
85 return this;
86 }
87
88 ref_++;
89 return this;
90 }
91
92 // Decrements reference count and deletes this object if count reaches 0.
Decref()93 void Regexp::Decref() {
94 if (ref_ == kMaxRef) {
95 // Ref count is stored in overflow map.
96 MutexLock l(&ref_mutex);
97 int r = ref_map[this] - 1;
98 if (r < kMaxRef) {
99 ref_ = r;
100 ref_map.erase(this);
101 } else {
102 ref_map[this] = r;
103 }
104 return;
105 }
106 ref_--;
107 if (ref_ == 0)
108 Destroy();
109 }
110
111 // Deletes this object; ref count has count reached 0.
Destroy()112 void Regexp::Destroy() {
113 if (QuickDestroy())
114 return;
115
116 // Handle recursive Destroy with explicit stack
117 // to avoid arbitrarily deep recursion on process stack [sigh].
118 down_ = NULL;
119 Regexp* stack = this;
120 while (stack != NULL) {
121 Regexp* re = stack;
122 stack = re->down_;
123 if (re->ref_ != 0)
124 LOG(DFATAL) << "Bad reference count " << re->ref_;
125 if (re->nsub_ > 0) {
126 Regexp** subs = re->sub();
127 for (int i = 0; i < re->nsub_; i++) {
128 Regexp* sub = subs[i];
129 if (sub == NULL)
130 continue;
131 if (sub->ref_ == kMaxRef)
132 sub->Decref();
133 else
134 --sub->ref_;
135 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
136 sub->down_ = stack;
137 stack = sub;
138 }
139 }
140 if (re->nsub_ > 1)
141 delete[] subs;
142 re->nsub_ = 0;
143 }
144 delete re;
145 }
146 }
147
AddRuneToString(Rune r)148 void Regexp::AddRuneToString(Rune r) {
149 DCHECK(op_ == kRegexpLiteralString);
150 if (nrunes_ == 0) {
151 // start with 8
152 runes_ = new Rune[8];
153 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
154 // double on powers of two
155 Rune *old = runes_;
156 runes_ = new Rune[nrunes_ * 2];
157 for (int i = 0; i < nrunes_; i++)
158 runes_[i] = old[i];
159 delete[] old;
160 }
161
162 runes_[nrunes_++] = r;
163 }
164
HaveMatch(int match_id,ParseFlags flags)165 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
166 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
167 re->match_id_ = match_id;
168 return re;
169 }
170
Plus(Regexp * sub,ParseFlags flags)171 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
172 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
173 return sub;
174 Regexp* re = new Regexp(kRegexpPlus, flags);
175 re->AllocSub(1);
176 re->sub()[0] = sub;
177 return re;
178 }
179
Star(Regexp * sub,ParseFlags flags)180 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
181 if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
182 return sub;
183 Regexp* re = new Regexp(kRegexpStar, flags);
184 re->AllocSub(1);
185 re->sub()[0] = sub;
186 return re;
187 }
188
Quest(Regexp * sub,ParseFlags flags)189 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
190 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
191 return sub;
192 Regexp* re = new Regexp(kRegexpQuest, flags);
193 re->AllocSub(1);
194 re->sub()[0] = sub;
195 return re;
196 }
197
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)198 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
199 ParseFlags flags, bool can_factor) {
200 if (nsub == 1)
201 return sub[0];
202
203 Regexp** subcopy = NULL;
204 if (op == kRegexpAlternate && can_factor) {
205 // Going to edit sub; make a copy so we don't step on caller.
206 subcopy = new Regexp*[nsub];
207 memmove(subcopy, sub, nsub * sizeof sub[0]);
208 sub = subcopy;
209 nsub = FactorAlternation(sub, nsub, flags);
210 if (nsub == 1) {
211 Regexp* re = sub[0];
212 delete[] subcopy;
213 return re;
214 }
215 }
216
217 if (nsub > kMaxNsub) {
218 // Too many subexpressions to fit in a single Regexp.
219 // Make a two-level tree. Two levels gets us to 65535^2.
220 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
221 Regexp* re = new Regexp(op, flags);
222 re->AllocSub(nbigsub);
223 Regexp** subs = re->sub();
224 for (int i = 0; i < nbigsub - 1; i++)
225 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
226 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
227 nsub - (nbigsub-1)*kMaxNsub, flags,
228 false);
229 delete[] subcopy;
230 return re;
231 }
232
233 Regexp* re = new Regexp(op, flags);
234 re->AllocSub(nsub);
235 Regexp** subs = re->sub();
236 for (int i = 0; i < nsub; i++)
237 subs[i] = sub[i];
238
239 delete[] subcopy;
240 return re;
241 }
242
Concat(Regexp ** sub,int nsub,ParseFlags flags)243 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
244 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
245 }
246
Alternate(Regexp ** sub,int nsub,ParseFlags flags)247 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
248 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
249 }
250
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)251 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
252 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
253 }
254
Capture(Regexp * sub,ParseFlags flags,int cap)255 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
256 Regexp* re = new Regexp(kRegexpCapture, flags);
257 re->AllocSub(1);
258 re->sub()[0] = sub;
259 re->cap_ = cap;
260 return re;
261 }
262
Repeat(Regexp * sub,ParseFlags flags,int min,int max)263 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
264 Regexp* re = new Regexp(kRegexpRepeat, flags);
265 re->AllocSub(1);
266 re->sub()[0] = sub;
267 re->min_ = min;
268 re->max_ = max;
269 return re;
270 }
271
NewLiteral(Rune rune,ParseFlags flags)272 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
273 Regexp* re = new Regexp(kRegexpLiteral, flags);
274 re->rune_ = rune;
275 return re;
276 }
277
LiteralString(Rune * runes,int nrunes,ParseFlags flags)278 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
279 if (nrunes <= 0)
280 return new Regexp(kRegexpEmptyMatch, flags);
281 if (nrunes == 1)
282 return NewLiteral(runes[0], flags);
283 Regexp* re = new Regexp(kRegexpLiteralString, flags);
284 for (int i = 0; i < nrunes; i++)
285 re->AddRuneToString(runes[i]);
286 return re;
287 }
288
NewCharClass(CharClass * cc,ParseFlags flags)289 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
290 Regexp* re = new Regexp(kRegexpCharClass, flags);
291 re->cc_ = cc;
292 return re;
293 }
294
295 // Swaps this and that in place.
Swap(Regexp * that)296 void Regexp::Swap(Regexp* that) {
297 // Can use memmove because Regexp is just a struct (no vtable).
298 char tmp[sizeof *this];
299 memmove(tmp, this, sizeof tmp);
300 memmove(this, that, sizeof tmp);
301 memmove(that, tmp, sizeof tmp);
302 }
303
304 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)305 static bool TopEqual(Regexp* a, Regexp* b) {
306 if (a->op() != b->op())
307 return false;
308
309 switch (a->op()) {
310 case kRegexpNoMatch:
311 case kRegexpEmptyMatch:
312 case kRegexpAnyChar:
313 case kRegexpAnyByte:
314 case kRegexpBeginLine:
315 case kRegexpEndLine:
316 case kRegexpWordBoundary:
317 case kRegexpNoWordBoundary:
318 case kRegexpBeginText:
319 return true;
320
321 case kRegexpEndText:
322 // The parse flags remember whether it's \z or (?-m:$),
323 // which matters when testing against PCRE.
324 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
325
326 case kRegexpLiteral:
327 return a->rune() == b->rune() &&
328 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
329
330 case kRegexpLiteralString:
331 return a->nrunes() == b->nrunes() &&
332 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
333 memcmp(a->runes(), b->runes(),
334 a->nrunes() * sizeof a->runes()[0]) == 0;
335
336 case kRegexpAlternate:
337 case kRegexpConcat:
338 return a->nsub() == b->nsub();
339
340 case kRegexpStar:
341 case kRegexpPlus:
342 case kRegexpQuest:
343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
344
345 case kRegexpRepeat:
346 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
347 a->min() == b->min() &&
348 a->max() == b->max();
349
350 case kRegexpCapture:
351 return a->cap() == b->cap() && a->name() == b->name();
352
353 case kRegexpHaveMatch:
354 return a->match_id() == b->match_id();
355
356 case kRegexpCharClass: {
357 CharClass* acc = a->cc();
358 CharClass* bcc = b->cc();
359 return acc->size() == bcc->size() &&
360 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
361 memcmp(acc->begin(), bcc->begin(),
362 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
363 }
364 }
365
366 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
367 return 0;
368 }
369
Equal(Regexp * a,Regexp * b)370 bool Regexp::Equal(Regexp* a, Regexp* b) {
371 if (a == NULL || b == NULL)
372 return a == b;
373
374 if (!TopEqual(a, b))
375 return false;
376
377 // Fast path:
378 // return without allocating vector if there are no subregexps.
379 switch (a->op()) {
380 case kRegexpAlternate:
381 case kRegexpConcat:
382 case kRegexpStar:
383 case kRegexpPlus:
384 case kRegexpQuest:
385 case kRegexpRepeat:
386 case kRegexpCapture:
387 break;
388
389 default:
390 return true;
391 }
392
393 // Committed to doing real work.
394 // The stack (vector) has pairs of regexps waiting to
395 // be compared. The regexps are only equal if
396 // all the pairs end up being equal.
397 vector<Regexp*> stk;
398
399 for (;;) {
400 // Invariant: TopEqual(a, b) == true.
401 Regexp* a2;
402 Regexp* b2;
403 switch (a->op()) {
404 default:
405 break;
406 case kRegexpAlternate:
407 case kRegexpConcat:
408 for (int i = 0; i < a->nsub(); i++) {
409 a2 = a->sub()[i];
410 b2 = b->sub()[i];
411 if (!TopEqual(a2, b2))
412 return false;
413 stk.push_back(a2);
414 stk.push_back(b2);
415 }
416 break;
417
418 case kRegexpStar:
419 case kRegexpPlus:
420 case kRegexpQuest:
421 case kRegexpRepeat:
422 case kRegexpCapture:
423 a2 = a->sub()[0];
424 b2 = b->sub()[0];
425 if (!TopEqual(a2, b2))
426 return false;
427 // Really:
428 // stk.push_back(a2);
429 // stk.push_back(b2);
430 // break;
431 // but faster to assign directly and loop.
432 a = a2;
433 b = b2;
434 continue;
435 }
436
437 int n = stk.size();
438 if (n == 0)
439 break;
440
441 a = stk[n-2];
442 b = stk[n-1];
443 stk.resize(n-2);
444 }
445
446 return true;
447 }
448
449 // Keep in sync with enum RegexpStatusCode in regexp.h
450 static const string kErrorStrings[] = {
451 "no error",
452 "unexpected error",
453 "invalid escape sequence",
454 "invalid character class",
455 "invalid character class range",
456 "missing ]",
457 "missing )",
458 "trailing \\",
459 "no argument for repetition operator",
460 "invalid repetition size",
461 "bad repetition operator",
462 "invalid perl operator",
463 "invalid UTF-8",
464 "invalid named capture group",
465 };
466
CodeText(enum RegexpStatusCode code)467 const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {
468 if (code < 0 || code >= arraysize(kErrorStrings))
469 code = kRegexpInternalError;
470 return kErrorStrings[code];
471 }
472
Text() const473 string RegexpStatus::Text() const {
474 if (error_arg_.empty())
475 return CodeText(code_);
476 string s;
477 s.append(CodeText(code_));
478 s.append(": ");
479 s.append(error_arg_.data(), error_arg_.size());
480 return s;
481 }
482
Copy(const RegexpStatus & status)483 void RegexpStatus::Copy(const RegexpStatus& status) {
484 code_ = status.code_;
485 error_arg_ = status.error_arg_;
486 }
487
488 typedef int Ignored; // Walker<void> doesn't exist
489
490 // Walker subclass to count capturing parens in regexp.
491 class NumCapturesWalker : public Regexp::Walker<Ignored> {
492 public:
NumCapturesWalker()493 NumCapturesWalker() : ncapture_(0) {}
ncapture()494 int ncapture() { return ncapture_; }
495
PreVisit(Regexp * re,Ignored ignored,bool * stop)496 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
497 if (re->op() == kRegexpCapture)
498 ncapture_++;
499 return ignored;
500 }
ShortVisit(Regexp * re,Ignored ignored)501 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
502 // Should never be called: we use Walk not WalkExponential.
503 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
504 return ignored;
505 }
506
507 private:
508 int ncapture_;
509 DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
510 };
511
NumCaptures()512 int Regexp::NumCaptures() {
513 NumCapturesWalker w;
514 w.Walk(this, 0);
515 return w.ncapture();
516 }
517
518 // Walker class to build map of named capture groups and their indices.
519 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
520 public:
NamedCapturesWalker()521 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()522 ~NamedCapturesWalker() { delete map_; }
523
TakeMap()524 map<string, int>* TakeMap() {
525 map<string, int>* m = map_;
526 map_ = NULL;
527 return m;
528 }
529
PreVisit(Regexp * re,Ignored ignored,bool * stop)530 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
531 if (re->op() == kRegexpCapture && re->name() != NULL) {
532 // Allocate map once we find a name.
533 if (map_ == NULL)
534 map_ = new map<string, int>;
535
536 // Record first occurrence of each name.
537 // (The rule is that if you have the same name
538 // multiple times, only the leftmost one counts.)
539 if (map_->find(*re->name()) == map_->end())
540 (*map_)[*re->name()] = re->cap();
541 }
542 return ignored;
543 }
544
ShortVisit(Regexp * re,Ignored ignored)545 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546 // Should never be called: we use Walk not WalkExponential.
547 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
548 return ignored;
549 }
550
551 private:
552 map<string, int>* map_;
553 DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
554 };
555
NamedCaptures()556 map<string, int>* Regexp::NamedCaptures() {
557 NamedCapturesWalker w;
558 w.Walk(this, 0);
559 return w.TakeMap();
560 }
561
562 // Walker class to build map from capture group indices to their names.
563 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
564 public:
CaptureNamesWalker()565 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()566 ~CaptureNamesWalker() { delete map_; }
567
TakeMap()568 map<int, string>* TakeMap() {
569 map<int, string>* m = map_;
570 map_ = NULL;
571 return m;
572 }
573
PreVisit(Regexp * re,Ignored ignored,bool * stop)574 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
575 if (re->op() == kRegexpCapture && re->name() != NULL) {
576 // Allocate map once we find a name.
577 if (map_ == NULL)
578 map_ = new map<int, string>;
579
580 (*map_)[re->cap()] = *re->name();
581 }
582 return ignored;
583 }
584
ShortVisit(Regexp * re,Ignored ignored)585 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
586 // Should never be called: we use Walk not WalkExponential.
587 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
588 return ignored;
589 }
590
591 private:
592 map<int, string>* map_;
593 DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
594 };
595
CaptureNames()596 map<int, string>* Regexp::CaptureNames() {
597 CaptureNamesWalker w;
598 w.Walk(this, 0);
599 return w.TakeMap();
600 }
601
602 // Determines whether regexp matches must be anchored
603 // with a fixed string prefix. If so, returns the prefix and
604 // the regexp that remains after the prefix. The prefix might
605 // be ASCII case-insensitive.
RequiredPrefix(string * prefix,bool * foldcase,Regexp ** suffix)606 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
607 // No need for a walker: the regexp must be of the form
608 // 1. some number of ^ anchors
609 // 2. a literal char or string
610 // 3. the rest
611 prefix->clear();
612 *foldcase = false;
613 *suffix = NULL;
614 if (op_ != kRegexpConcat)
615 return false;
616
617 // Some number of anchors, then a literal or concatenation.
618 int i = 0;
619 Regexp** sub = this->sub();
620 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
621 i++;
622 if (i == 0 || i >= nsub_)
623 return false;
624
625 Regexp* re = sub[i];
626 switch (re->op_) {
627 default:
628 return false;
629
630 case kRegexpLiteralString:
631 // Convert to string in proper encoding.
632 if (re->parse_flags() & Latin1) {
633 prefix->resize(re->nrunes_);
634 for (int j = 0; j < re->nrunes_; j++)
635 (*prefix)[j] = re->runes_[j];
636 } else {
637 // Convert to UTF-8 in place.
638 // Assume worst-case space and then trim.
639 prefix->resize(re->nrunes_ * UTFmax);
640 char *p = &(*prefix)[0];
641 for (int j = 0; j < re->nrunes_; j++) {
642 Rune r = re->runes_[j];
643 if (r < Runeself)
644 *p++ = r;
645 else
646 p += runetochar(p, &r);
647 }
648 prefix->resize(p - &(*prefix)[0]);
649 }
650 break;
651
652 case kRegexpLiteral:
653 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
654 prefix->append(1, re->rune_);
655 } else {
656 char buf[UTFmax];
657 prefix->append(buf, runetochar(buf, &re->rune_));
658 }
659 break;
660 }
661 *foldcase = (sub[i]->parse_flags() & FoldCase);
662 i++;
663
664 // The rest.
665 if (i < nsub_) {
666 for (int j = i; j < nsub_; j++)
667 sub[j]->Incref();
668 re = Concat(sub + i, nsub_ - i, parse_flags());
669 } else {
670 re = new Regexp(kRegexpEmptyMatch, parse_flags());
671 }
672 *suffix = re;
673 return true;
674 }
675
676 // Character class builder is a balanced binary tree (STL set)
677 // containing non-overlapping, non-abutting RuneRanges.
678 // The less-than operator used in the tree treats two
679 // ranges as equal if they overlap at all, so that
680 // lookups for a particular Rune are possible.
681
CharClassBuilder()682 CharClassBuilder::CharClassBuilder() {
683 nrunes_ = 0;
684 upper_ = 0;
685 lower_ = 0;
686 }
687
688 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)689 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
690 if (hi < lo)
691 return false;
692
693 if (lo <= 'z' && hi >= 'A') {
694 // Overlaps some alpha, maybe not all.
695 // Update bitmaps telling which ASCII letters are in the set.
696 Rune lo1 = max<Rune>(lo, 'A');
697 Rune hi1 = min<Rune>(hi, 'Z');
698 if (lo1 <= hi1)
699 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
700
701 lo1 = max<Rune>(lo, 'a');
702 hi1 = min<Rune>(hi, 'z');
703 if (lo1 <= hi1)
704 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
705 }
706
707 { // Check whether lo, hi is already in the class.
708 iterator it = ranges_.find(RuneRange(lo, lo));
709 if (it != end() && it->lo <= lo && hi <= it->hi)
710 return false;
711 }
712
713 // Look for a range abutting lo on the left.
714 // If it exists, take it out and increase our range.
715 if (lo > 0) {
716 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
717 if (it != end()) {
718 lo = it->lo;
719 if (it->hi > hi)
720 hi = it->hi;
721 nrunes_ -= it->hi - it->lo + 1;
722 ranges_.erase(it);
723 }
724 }
725
726 // Look for a range abutting hi on the right.
727 // If it exists, take it out and increase our range.
728 if (hi < Runemax) {
729 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
730 if (it != end()) {
731 hi = it->hi;
732 nrunes_ -= it->hi - it->lo + 1;
733 ranges_.erase(it);
734 }
735 }
736
737 // Look for ranges between lo and hi. Take them out.
738 // This is only safe because the set has no overlapping ranges.
739 // We've already removed any ranges abutting lo and hi, so
740 // any that overlap [lo, hi] must be contained within it.
741 for (;;) {
742 iterator it = ranges_.find(RuneRange(lo, hi));
743 if (it == end())
744 break;
745 nrunes_ -= it->hi - it->lo + 1;
746 ranges_.erase(it);
747 }
748
749 // Finally, add [lo, hi].
750 nrunes_ += hi - lo + 1;
751 ranges_.insert(RuneRange(lo, hi));
752 return true;
753 }
754
AddCharClass(CharClassBuilder * cc)755 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
756 for (iterator it = cc->begin(); it != cc->end(); ++it)
757 AddRange(it->lo, it->hi);
758 }
759
Contains(Rune r)760 bool CharClassBuilder::Contains(Rune r) {
761 return ranges_.find(RuneRange(r, r)) != end();
762 }
763
764 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()765 bool CharClassBuilder::FoldsASCII() {
766 return ((upper_ ^ lower_) & AlphaMask) == 0;
767 }
768
Copy()769 CharClassBuilder* CharClassBuilder::Copy() {
770 CharClassBuilder* cc = new CharClassBuilder;
771 for (iterator it = begin(); it != end(); ++it)
772 cc->ranges_.insert(RuneRange(it->lo, it->hi));
773 cc->upper_ = upper_;
774 cc->lower_ = lower_;
775 cc->nrunes_ = nrunes_;
776 return cc;
777 }
778
779
780
RemoveAbove(Rune r)781 void CharClassBuilder::RemoveAbove(Rune r) {
782 if (r >= Runemax)
783 return;
784
785 if (r < 'z') {
786 if (r < 'a')
787 lower_ = 0;
788 else
789 lower_ &= AlphaMask >> ('z' - r);
790 }
791
792 if (r < 'Z') {
793 if (r < 'A')
794 upper_ = 0;
795 else
796 upper_ &= AlphaMask >> ('Z' - r);
797 }
798
799 for (;;) {
800
801 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
802 if (it == end())
803 break;
804 RuneRange rr = *it;
805 ranges_.erase(it);
806 nrunes_ -= rr.hi - rr.lo + 1;
807 if (rr.lo <= r) {
808 rr.hi = r;
809 ranges_.insert(rr);
810 nrunes_ += rr.hi - rr.lo + 1;
811 }
812 }
813 }
814
Negate()815 void CharClassBuilder::Negate() {
816 // Build up negation and then copy in.
817 // Could edit ranges in place, but C++ won't let me.
818 vector<RuneRange> v;
819 v.reserve(ranges_.size() + 1);
820
821 // In negation, first range begins at 0, unless
822 // the current class begins at 0.
823 iterator it = begin();
824 if (it == end()) {
825 v.push_back(RuneRange(0, Runemax));
826 } else {
827 int nextlo = 0;
828 if (it->lo == 0) {
829 nextlo = it->hi + 1;
830 ++it;
831 }
832 for (; it != end(); ++it) {
833 v.push_back(RuneRange(nextlo, it->lo - 1));
834 nextlo = it->hi + 1;
835 }
836 if (nextlo <= Runemax)
837 v.push_back(RuneRange(nextlo, Runemax));
838 }
839
840 ranges_.clear();
841 for (int i = 0; i < v.size(); i++)
842 ranges_.insert(v[i]);
843
844 upper_ = AlphaMask & ~upper_;
845 lower_ = AlphaMask & ~lower_;
846 nrunes_ = Runemax+1 - nrunes_;
847 }
848
849 // Character class is a sorted list of ranges.
850 // The ranges are allocated in the same block as the header,
851 // necessitating a special allocator and Delete method.
852
New(int maxranges)853 CharClass* CharClass::New(int maxranges) {
854 CharClass* cc;
855 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
856 cc = reinterpret_cast<CharClass*>(data);
857 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
858 cc->nranges_ = 0;
859 cc->folds_ascii_ = false;
860 cc->nrunes_ = 0;
861 return cc;
862 }
863
Delete()864 void CharClass::Delete() {
865 if (this == NULL)
866 return;
867 uint8 *data = reinterpret_cast<uint8*>(this);
868 delete[] data;
869 }
870
Negate()871 CharClass* CharClass::Negate() {
872 CharClass* cc = CharClass::New(nranges_+1);
873 cc->folds_ascii_ = folds_ascii_;
874 cc->nrunes_ = Runemax + 1 - nrunes_;
875 int n = 0;
876 int nextlo = 0;
877 for (CharClass::iterator it = begin(); it != end(); ++it) {
878 if (it->lo == nextlo) {
879 nextlo = it->hi + 1;
880 } else {
881 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
882 nextlo = it->hi + 1;
883 }
884 }
885 if (nextlo <= Runemax)
886 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
887 cc->nranges_ = n;
888 return cc;
889 }
890
Contains(Rune r)891 bool CharClass::Contains(Rune r) {
892 RuneRange* rr = ranges_;
893 int n = nranges_;
894 while (n > 0) {
895 int m = n/2;
896 if (rr[m].hi < r) {
897 rr += m+1;
898 n -= m+1;
899 } else if (r < rr[m].lo) {
900 n = m;
901 } else { // rr[m].lo <= r && r <= rr[m].hi
902 return true;
903 }
904 }
905 return false;
906 }
907
GetCharClass()908 CharClass* CharClassBuilder::GetCharClass() {
909 CharClass* cc = CharClass::New(ranges_.size());
910 int n = 0;
911 for (iterator it = begin(); it != end(); ++it)
912 cc->ranges_[n++] = *it;
913 cc->nranges_ = n;
914 DCHECK_LE(n, ranges_.size());
915 cc->nrunes_ = nrunes_;
916 cc->folds_ascii_ = FoldsASCII();
917 return cc;
918 }
919
920 } // namespace re2
921