1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "re2/regexp.h"
9
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <mutex>
16 #include <string>
17 #include <vector>
18
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/mutex.h"
22 #include "util/utf.h"
23 #include "re2/stringpiece.h"
24 #include "re2/walker-inl.h"
25
26 namespace re2 {
27
28 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)29 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
30 : op_(static_cast<uint8_t>(op)),
31 simple_(false),
32 parse_flags_(static_cast<uint16_t>(parse_flags)),
33 ref_(1),
34 nsub_(0),
35 down_(NULL) {
36 subone_ = NULL;
37 memset(the_union_, 0, sizeof the_union_);
38 }
39
40 // Destructor. Assumes already cleaned up children.
41 // Private: use Decref() instead of delete to destroy Regexps.
42 // Can't call Decref on the sub-Regexps here because
43 // that could cause arbitrarily deep recursion, so
44 // required Decref() to have handled them for us.
~Regexp()45 Regexp::~Regexp() {
46 if (nsub_ > 0)
47 LOG(DFATAL) << "Regexp not destroyed.";
48
49 switch (op_) {
50 default:
51 break;
52 case kRegexpCapture:
53 delete name_;
54 break;
55 case kRegexpLiteralString:
56 delete[] runes_;
57 break;
58 case kRegexpCharClass:
59 if (cc_)
60 cc_->Delete();
61 delete ccb_;
62 break;
63 }
64 }
65
66 // If it's possible to destroy this regexp without recurring,
67 // do so and return true. Else return false.
QuickDestroy()68 bool Regexp::QuickDestroy() {
69 if (nsub_ == 0) {
70 delete this;
71 return true;
72 }
73 return false;
74 }
75
76 // Lazily allocated.
77 static Mutex* ref_mutex;
78 static std::map<Regexp*, int>* ref_map;
79
Ref()80 int Regexp::Ref() {
81 if (ref_ < kMaxRef)
82 return ref_;
83
84 MutexLock l(ref_mutex);
85 return (*ref_map)[this];
86 }
87
88 // Increments reference count, returns object as convenience.
Incref()89 Regexp* Regexp::Incref() {
90 if (ref_ >= kMaxRef-1) {
91 static std::once_flag ref_once;
92 std::call_once(ref_once, []() {
93 ref_mutex = new Mutex;
94 ref_map = new std::map<Regexp*, int>;
95 });
96
97 // Store ref count in overflow map.
98 MutexLock l(ref_mutex);
99 if (ref_ == kMaxRef) {
100 // already overflowed
101 (*ref_map)[this]++;
102 } else {
103 // overflowing now
104 (*ref_map)[this] = kMaxRef;
105 ref_ = kMaxRef;
106 }
107 return this;
108 }
109
110 ref_++;
111 return this;
112 }
113
114 // Decrements reference count and deletes this object if count reaches 0.
Decref()115 void Regexp::Decref() {
116 if (ref_ == kMaxRef) {
117 // Ref count is stored in overflow map.
118 MutexLock l(ref_mutex);
119 int r = (*ref_map)[this] - 1;
120 if (r < kMaxRef) {
121 ref_ = static_cast<uint16_t>(r);
122 ref_map->erase(this);
123 } else {
124 (*ref_map)[this] = r;
125 }
126 return;
127 }
128 ref_--;
129 if (ref_ == 0)
130 Destroy();
131 }
132
133 // Deletes this object; ref count has count reached 0.
Destroy()134 void Regexp::Destroy() {
135 if (QuickDestroy())
136 return;
137
138 // Handle recursive Destroy with explicit stack
139 // to avoid arbitrarily deep recursion on process stack [sigh].
140 down_ = NULL;
141 Regexp* stack = this;
142 while (stack != NULL) {
143 Regexp* re = stack;
144 stack = re->down_;
145 if (re->ref_ != 0)
146 LOG(DFATAL) << "Bad reference count " << re->ref_;
147 if (re->nsub_ > 0) {
148 Regexp** subs = re->sub();
149 for (int i = 0; i < re->nsub_; i++) {
150 Regexp* sub = subs[i];
151 if (sub == NULL)
152 continue;
153 if (sub->ref_ == kMaxRef)
154 sub->Decref();
155 else
156 --sub->ref_;
157 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
158 sub->down_ = stack;
159 stack = sub;
160 }
161 }
162 if (re->nsub_ > 1)
163 delete[] subs;
164 re->nsub_ = 0;
165 }
166 delete re;
167 }
168 }
169
AddRuneToString(Rune r)170 void Regexp::AddRuneToString(Rune r) {
171 DCHECK(op_ == kRegexpLiteralString);
172 if (nrunes_ == 0) {
173 // start with 8
174 runes_ = new Rune[8];
175 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
176 // double on powers of two
177 Rune *old = runes_;
178 runes_ = new Rune[nrunes_ * 2];
179 for (int i = 0; i < nrunes_; i++)
180 runes_[i] = old[i];
181 delete[] old;
182 }
183
184 runes_[nrunes_++] = r;
185 }
186
HaveMatch(int match_id,ParseFlags flags)187 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
188 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
189 re->match_id_ = match_id;
190 return re;
191 }
192
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)193 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
194 // Squash **, ++ and ??.
195 if (op == sub->op() && flags == sub->parse_flags())
196 return sub;
197
198 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
199 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
200 if ((sub->op() == kRegexpStar ||
201 sub->op() == kRegexpPlus ||
202 sub->op() == kRegexpQuest) &&
203 flags == sub->parse_flags()) {
204 // If sub is Star, no need to rewrite it.
205 if (sub->op() == kRegexpStar)
206 return sub;
207
208 // Rewrite sub to Star.
209 Regexp* re = new Regexp(kRegexpStar, flags);
210 re->AllocSub(1);
211 re->sub()[0] = sub->sub()[0]->Incref();
212 sub->Decref(); // We didn't consume the reference after all.
213 return re;
214 }
215
216 Regexp* re = new Regexp(op, flags);
217 re->AllocSub(1);
218 re->sub()[0] = sub;
219 return re;
220 }
221
Plus(Regexp * sub,ParseFlags flags)222 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
223 return StarPlusOrQuest(kRegexpPlus, sub, flags);
224 }
225
Star(Regexp * sub,ParseFlags flags)226 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
227 return StarPlusOrQuest(kRegexpStar, sub, flags);
228 }
229
Quest(Regexp * sub,ParseFlags flags)230 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
231 return StarPlusOrQuest(kRegexpQuest, sub, flags);
232 }
233
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)234 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
235 ParseFlags flags, bool can_factor) {
236 if (nsub == 1)
237 return sub[0];
238
239 if (nsub == 0) {
240 if (op == kRegexpAlternate)
241 return new Regexp(kRegexpNoMatch, flags);
242 else
243 return new Regexp(kRegexpEmptyMatch, flags);
244 }
245
246 Regexp** subcopy = NULL;
247 if (op == kRegexpAlternate && can_factor) {
248 // Going to edit sub; make a copy so we don't step on caller.
249 subcopy = new Regexp*[nsub];
250 memmove(subcopy, sub, nsub * sizeof sub[0]);
251 sub = subcopy;
252 nsub = FactorAlternation(sub, nsub, flags);
253 if (nsub == 1) {
254 Regexp* re = sub[0];
255 delete[] subcopy;
256 return re;
257 }
258 }
259
260 if (nsub > kMaxNsub) {
261 // Too many subexpressions to fit in a single Regexp.
262 // Make a two-level tree. Two levels gets us to 65535^2.
263 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264 Regexp* re = new Regexp(op, flags);
265 re->AllocSub(nbigsub);
266 Regexp** subs = re->sub();
267 for (int i = 0; i < nbigsub - 1; i++)
268 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270 nsub - (nbigsub-1)*kMaxNsub, flags,
271 false);
272 delete[] subcopy;
273 return re;
274 }
275
276 Regexp* re = new Regexp(op, flags);
277 re->AllocSub(nsub);
278 Regexp** subs = re->sub();
279 for (int i = 0; i < nsub; i++)
280 subs[i] = sub[i];
281
282 delete[] subcopy;
283 return re;
284 }
285
Concat(Regexp ** sub,int nsub,ParseFlags flags)286 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
287 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
288 }
289
Alternate(Regexp ** sub,int nsub,ParseFlags flags)290 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
291 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
292 }
293
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)294 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
295 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
296 }
297
Capture(Regexp * sub,ParseFlags flags,int cap)298 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
299 Regexp* re = new Regexp(kRegexpCapture, flags);
300 re->AllocSub(1);
301 re->sub()[0] = sub;
302 re->cap_ = cap;
303 return re;
304 }
305
Repeat(Regexp * sub,ParseFlags flags,int min,int max)306 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
307 Regexp* re = new Regexp(kRegexpRepeat, flags);
308 re->AllocSub(1);
309 re->sub()[0] = sub;
310 re->min_ = min;
311 re->max_ = max;
312 return re;
313 }
314
NewLiteral(Rune rune,ParseFlags flags)315 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
316 Regexp* re = new Regexp(kRegexpLiteral, flags);
317 re->rune_ = rune;
318 return re;
319 }
320
LiteralString(Rune * runes,int nrunes,ParseFlags flags)321 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
322 if (nrunes <= 0)
323 return new Regexp(kRegexpEmptyMatch, flags);
324 if (nrunes == 1)
325 return NewLiteral(runes[0], flags);
326 Regexp* re = new Regexp(kRegexpLiteralString, flags);
327 for (int i = 0; i < nrunes; i++)
328 re->AddRuneToString(runes[i]);
329 return re;
330 }
331
NewCharClass(CharClass * cc,ParseFlags flags)332 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
333 Regexp* re = new Regexp(kRegexpCharClass, flags);
334 re->cc_ = cc;
335 return re;
336 }
337
Swap(Regexp * that)338 void Regexp::Swap(Regexp* that) {
339 // Regexp is not trivially copyable, so we cannot freely copy it with
340 // memmove(3), but swapping objects like so is safe for our purposes.
341 char tmp[sizeof *this];
342 void* vthis = reinterpret_cast<void*>(this);
343 void* vthat = reinterpret_cast<void*>(that);
344 memmove(tmp, vthis, sizeof *this);
345 memmove(vthis, vthat, sizeof *this);
346 memmove(vthat, tmp, sizeof *this);
347 }
348
349 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)350 static bool TopEqual(Regexp* a, Regexp* b) {
351 if (a->op() != b->op())
352 return false;
353
354 switch (a->op()) {
355 case kRegexpNoMatch:
356 case kRegexpEmptyMatch:
357 case kRegexpAnyChar:
358 case kRegexpAnyByte:
359 case kRegexpBeginLine:
360 case kRegexpEndLine:
361 case kRegexpWordBoundary:
362 case kRegexpNoWordBoundary:
363 case kRegexpBeginText:
364 return true;
365
366 case kRegexpEndText:
367 // The parse flags remember whether it's \z or (?-m:$),
368 // which matters when testing against PCRE.
369 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
370
371 case kRegexpLiteral:
372 return a->rune() == b->rune() &&
373 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
374
375 case kRegexpLiteralString:
376 return a->nrunes() == b->nrunes() &&
377 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
378 memcmp(a->runes(), b->runes(),
379 a->nrunes() * sizeof a->runes()[0]) == 0;
380
381 case kRegexpAlternate:
382 case kRegexpConcat:
383 return a->nsub() == b->nsub();
384
385 case kRegexpStar:
386 case kRegexpPlus:
387 case kRegexpQuest:
388 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
389
390 case kRegexpRepeat:
391 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
392 a->min() == b->min() &&
393 a->max() == b->max();
394
395 case kRegexpCapture:
396 return a->cap() == b->cap() && a->name() == b->name();
397
398 case kRegexpHaveMatch:
399 return a->match_id() == b->match_id();
400
401 case kRegexpCharClass: {
402 CharClass* acc = a->cc();
403 CharClass* bcc = b->cc();
404 return acc->size() == bcc->size() &&
405 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
406 memcmp(acc->begin(), bcc->begin(),
407 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
408 }
409 }
410
411 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
412 return 0;
413 }
414
Equal(Regexp * a,Regexp * b)415 bool Regexp::Equal(Regexp* a, Regexp* b) {
416 if (a == NULL || b == NULL)
417 return a == b;
418
419 if (!TopEqual(a, b))
420 return false;
421
422 // Fast path:
423 // return without allocating vector if there are no subregexps.
424 switch (a->op()) {
425 case kRegexpAlternate:
426 case kRegexpConcat:
427 case kRegexpStar:
428 case kRegexpPlus:
429 case kRegexpQuest:
430 case kRegexpRepeat:
431 case kRegexpCapture:
432 break;
433
434 default:
435 return true;
436 }
437
438 // Committed to doing real work.
439 // The stack (vector) has pairs of regexps waiting to
440 // be compared. The regexps are only equal if
441 // all the pairs end up being equal.
442 std::vector<Regexp*> stk;
443
444 for (;;) {
445 // Invariant: TopEqual(a, b) == true.
446 Regexp* a2;
447 Regexp* b2;
448 switch (a->op()) {
449 default:
450 break;
451 case kRegexpAlternate:
452 case kRegexpConcat:
453 for (int i = 0; i < a->nsub(); i++) {
454 a2 = a->sub()[i];
455 b2 = b->sub()[i];
456 if (!TopEqual(a2, b2))
457 return false;
458 stk.push_back(a2);
459 stk.push_back(b2);
460 }
461 break;
462
463 case kRegexpStar:
464 case kRegexpPlus:
465 case kRegexpQuest:
466 case kRegexpRepeat:
467 case kRegexpCapture:
468 a2 = a->sub()[0];
469 b2 = b->sub()[0];
470 if (!TopEqual(a2, b2))
471 return false;
472 // Really:
473 // stk.push_back(a2);
474 // stk.push_back(b2);
475 // break;
476 // but faster to assign directly and loop.
477 a = a2;
478 b = b2;
479 continue;
480 }
481
482 size_t n = stk.size();
483 if (n == 0)
484 break;
485
486 DCHECK_GE(n, 2);
487 a = stk[n-2];
488 b = stk[n-1];
489 stk.resize(n-2);
490 }
491
492 return true;
493 }
494
// Human-readable text for each status code, indexed by the code's numeric
// value.  Keep in sync with enum RegexpStatusCode in regexp.h.
static const char *kErrorStrings[] = {
  /*  0 */ "no error",
  /*  1 */ "unexpected error",
  /*  2 */ "invalid escape sequence",
  /*  3 */ "invalid character class",
  /*  4 */ "invalid character class range",
  /*  5 */ "missing ]",
  /*  6 */ "missing )",
  /*  7 */ "trailing \\",
  /*  8 */ "no argument for repetition operator",
  /*  9 */ "invalid repetition size",
  /* 10 */ "bad repetition operator",
  /* 11 */ "invalid perl operator",
  /* 12 */ "invalid UTF-8",
  /* 13 */ "invalid named capture group",
};
512
CodeText(enum RegexpStatusCode code)513 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
514 if (code < 0 || code >= arraysize(kErrorStrings))
515 code = kRegexpInternalError;
516 return kErrorStrings[code];
517 }
518
Text() const519 std::string RegexpStatus::Text() const {
520 if (error_arg_.empty())
521 return CodeText(code_);
522 std::string s;
523 s.append(CodeText(code_));
524 s.append(": ");
525 s.append(error_arg_.data(), error_arg_.size());
526 return s;
527 }
528
Copy(const RegexpStatus & status)529 void RegexpStatus::Copy(const RegexpStatus& status) {
530 code_ = status.code_;
531 error_arg_ = status.error_arg_;
532 }
533
// Walker<void> doesn't exist, so walkers with no meaningful result are
// instantiated with this placeholder type.
using Ignored = int;
535
536 // Walker subclass to count capturing parens in regexp.
537 class NumCapturesWalker : public Regexp::Walker<Ignored> {
538 public:
NumCapturesWalker()539 NumCapturesWalker() : ncapture_(0) {}
ncapture()540 int ncapture() { return ncapture_; }
541
PreVisit(Regexp * re,Ignored ignored,bool * stop)542 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
543 if (re->op() == kRegexpCapture)
544 ncapture_++;
545 return ignored;
546 }
ShortVisit(Regexp * re,Ignored ignored)547 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
548 // Should never be called: we use Walk not WalkExponential.
549 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
550 return ignored;
551 }
552
553 private:
554 int ncapture_;
555
556 NumCapturesWalker(const NumCapturesWalker&) = delete;
557 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558 };
559
NumCaptures()560 int Regexp::NumCaptures() {
561 NumCapturesWalker w;
562 w.Walk(this, 0);
563 return w.ncapture();
564 }
565
566 // Walker class to build map of named capture groups and their indices.
567 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568 public:
NamedCapturesWalker()569 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()570 ~NamedCapturesWalker() { delete map_; }
571
TakeMap()572 std::map<std::string, int>* TakeMap() {
573 std::map<std::string, int>* m = map_;
574 map_ = NULL;
575 return m;
576 }
577
PreVisit(Regexp * re,Ignored ignored,bool * stop)578 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579 if (re->op() == kRegexpCapture && re->name() != NULL) {
580 // Allocate map once we find a name.
581 if (map_ == NULL)
582 map_ = new std::map<std::string, int>;
583
584 // Record first occurrence of each name.
585 // (The rule is that if you have the same name
586 // multiple times, only the leftmost one counts.)
587 if (map_->find(*re->name()) == map_->end())
588 (*map_)[*re->name()] = re->cap();
589 }
590 return ignored;
591 }
592
ShortVisit(Regexp * re,Ignored ignored)593 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594 // Should never be called: we use Walk not WalkExponential.
595 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
596 return ignored;
597 }
598
599 private:
600 std::map<std::string, int>* map_;
601
602 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
603 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
604 };
605
NamedCaptures()606 std::map<std::string, int>* Regexp::NamedCaptures() {
607 NamedCapturesWalker w;
608 w.Walk(this, 0);
609 return w.TakeMap();
610 }
611
612 // Walker class to build map from capture group indices to their names.
613 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
614 public:
CaptureNamesWalker()615 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()616 ~CaptureNamesWalker() { delete map_; }
617
TakeMap()618 std::map<int, std::string>* TakeMap() {
619 std::map<int, std::string>* m = map_;
620 map_ = NULL;
621 return m;
622 }
623
PreVisit(Regexp * re,Ignored ignored,bool * stop)624 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
625 if (re->op() == kRegexpCapture && re->name() != NULL) {
626 // Allocate map once we find a name.
627 if (map_ == NULL)
628 map_ = new std::map<int, std::string>;
629
630 (*map_)[re->cap()] = *re->name();
631 }
632 return ignored;
633 }
634
ShortVisit(Regexp * re,Ignored ignored)635 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
636 // Should never be called: we use Walk not WalkExponential.
637 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
638 return ignored;
639 }
640
641 private:
642 std::map<int, std::string>* map_;
643
644 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
645 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
646 };
647
CaptureNames()648 std::map<int, std::string>* Regexp::CaptureNames() {
649 CaptureNamesWalker w;
650 w.Walk(this, 0);
651 return w.TakeMap();
652 }
653
654 // Determines whether regexp matches must be anchored
655 // with a fixed string prefix. If so, returns the prefix and
656 // the regexp that remains after the prefix. The prefix might
657 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)658 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
659 Regexp** suffix) {
660 // No need for a walker: the regexp must be of the form
661 // 1. some number of ^ anchors
662 // 2. a literal char or string
663 // 3. the rest
664 prefix->clear();
665 *foldcase = false;
666 *suffix = NULL;
667 if (op_ != kRegexpConcat)
668 return false;
669
670 // Some number of anchors, then a literal or concatenation.
671 int i = 0;
672 Regexp** sub = this->sub();
673 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
674 i++;
675 if (i == 0 || i >= nsub_)
676 return false;
677
678 Regexp* re = sub[i];
679 switch (re->op_) {
680 default:
681 return false;
682
683 case kRegexpLiteralString:
684 // Convert to string in proper encoding.
685 if (re->parse_flags() & Latin1) {
686 prefix->resize(re->nrunes_);
687 for (int j = 0; j < re->nrunes_; j++)
688 (*prefix)[j] = static_cast<char>(re->runes_[j]);
689 } else {
690 // Convert to UTF-8 in place.
691 // Assume worst-case space and then trim.
692 prefix->resize(re->nrunes_ * UTFmax);
693 char *p = &(*prefix)[0];
694 for (int j = 0; j < re->nrunes_; j++) {
695 Rune r = re->runes_[j];
696 if (r < Runeself)
697 *p++ = static_cast<char>(r);
698 else
699 p += runetochar(p, &r);
700 }
701 prefix->resize(p - &(*prefix)[0]);
702 }
703 break;
704
705 case kRegexpLiteral:
706 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
707 prefix->append(1, static_cast<char>(re->rune_));
708 } else {
709 char buf[UTFmax];
710 prefix->append(buf, runetochar(buf, &re->rune_));
711 }
712 break;
713 }
714 *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
715 i++;
716
717 // The rest.
718 if (i < nsub_) {
719 for (int j = i; j < nsub_; j++)
720 sub[j]->Incref();
721 re = Concat(sub + i, nsub_ - i, parse_flags());
722 } else {
723 re = new Regexp(kRegexpEmptyMatch, parse_flags());
724 }
725 *suffix = re;
726 return true;
727 }
728
729 // Character class builder is a balanced binary tree (STL set)
730 // containing non-overlapping, non-abutting RuneRanges.
731 // The less-than operator used in the tree treats two
732 // ranges as equal if they overlap at all, so that
733 // lookups for a particular Rune are possible.
734
CharClassBuilder()735 CharClassBuilder::CharClassBuilder() {
736 nrunes_ = 0;
737 upper_ = 0;
738 lower_ = 0;
739 }
740
741 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)742 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
743 if (hi < lo)
744 return false;
745
746 if (lo <= 'z' && hi >= 'A') {
747 // Overlaps some alpha, maybe not all.
748 // Update bitmaps telling which ASCII letters are in the set.
749 Rune lo1 = std::max<Rune>(lo, 'A');
750 Rune hi1 = std::min<Rune>(hi, 'Z');
751 if (lo1 <= hi1)
752 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
753
754 lo1 = std::max<Rune>(lo, 'a');
755 hi1 = std::min<Rune>(hi, 'z');
756 if (lo1 <= hi1)
757 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
758 }
759
760 { // Check whether lo, hi is already in the class.
761 iterator it = ranges_.find(RuneRange(lo, lo));
762 if (it != end() && it->lo <= lo && hi <= it->hi)
763 return false;
764 }
765
766 // Look for a range abutting lo on the left.
767 // If it exists, take it out and increase our range.
768 if (lo > 0) {
769 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
770 if (it != end()) {
771 lo = it->lo;
772 if (it->hi > hi)
773 hi = it->hi;
774 nrunes_ -= it->hi - it->lo + 1;
775 ranges_.erase(it);
776 }
777 }
778
779 // Look for a range abutting hi on the right.
780 // If it exists, take it out and increase our range.
781 if (hi < Runemax) {
782 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
783 if (it != end()) {
784 hi = it->hi;
785 nrunes_ -= it->hi - it->lo + 1;
786 ranges_.erase(it);
787 }
788 }
789
790 // Look for ranges between lo and hi. Take them out.
791 // This is only safe because the set has no overlapping ranges.
792 // We've already removed any ranges abutting lo and hi, so
793 // any that overlap [lo, hi] must be contained within it.
794 for (;;) {
795 iterator it = ranges_.find(RuneRange(lo, hi));
796 if (it == end())
797 break;
798 nrunes_ -= it->hi - it->lo + 1;
799 ranges_.erase(it);
800 }
801
802 // Finally, add [lo, hi].
803 nrunes_ += hi - lo + 1;
804 ranges_.insert(RuneRange(lo, hi));
805 return true;
806 }
807
AddCharClass(CharClassBuilder * cc)808 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
809 for (iterator it = cc->begin(); it != cc->end(); ++it)
810 AddRange(it->lo, it->hi);
811 }
812
Contains(Rune r)813 bool CharClassBuilder::Contains(Rune r) {
814 return ranges_.find(RuneRange(r, r)) != end();
815 }
816
817 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()818 bool CharClassBuilder::FoldsASCII() {
819 return ((upper_ ^ lower_) & AlphaMask) == 0;
820 }
821
Copy()822 CharClassBuilder* CharClassBuilder::Copy() {
823 CharClassBuilder* cc = new CharClassBuilder;
824 for (iterator it = begin(); it != end(); ++it)
825 cc->ranges_.insert(RuneRange(it->lo, it->hi));
826 cc->upper_ = upper_;
827 cc->lower_ = lower_;
828 cc->nrunes_ = nrunes_;
829 return cc;
830 }
831
832
833
RemoveAbove(Rune r)834 void CharClassBuilder::RemoveAbove(Rune r) {
835 if (r >= Runemax)
836 return;
837
838 if (r < 'z') {
839 if (r < 'a')
840 lower_ = 0;
841 else
842 lower_ &= AlphaMask >> ('z' - r);
843 }
844
845 if (r < 'Z') {
846 if (r < 'A')
847 upper_ = 0;
848 else
849 upper_ &= AlphaMask >> ('Z' - r);
850 }
851
852 for (;;) {
853
854 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
855 if (it == end())
856 break;
857 RuneRange rr = *it;
858 ranges_.erase(it);
859 nrunes_ -= rr.hi - rr.lo + 1;
860 if (rr.lo <= r) {
861 rr.hi = r;
862 ranges_.insert(rr);
863 nrunes_ += rr.hi - rr.lo + 1;
864 }
865 }
866 }
867
Negate()868 void CharClassBuilder::Negate() {
869 // Build up negation and then copy in.
870 // Could edit ranges in place, but C++ won't let me.
871 std::vector<RuneRange> v;
872 v.reserve(ranges_.size() + 1);
873
874 // In negation, first range begins at 0, unless
875 // the current class begins at 0.
876 iterator it = begin();
877 if (it == end()) {
878 v.push_back(RuneRange(0, Runemax));
879 } else {
880 int nextlo = 0;
881 if (it->lo == 0) {
882 nextlo = it->hi + 1;
883 ++it;
884 }
885 for (; it != end(); ++it) {
886 v.push_back(RuneRange(nextlo, it->lo - 1));
887 nextlo = it->hi + 1;
888 }
889 if (nextlo <= Runemax)
890 v.push_back(RuneRange(nextlo, Runemax));
891 }
892
893 ranges_.clear();
894 for (size_t i = 0; i < v.size(); i++)
895 ranges_.insert(v[i]);
896
897 upper_ = AlphaMask & ~upper_;
898 lower_ = AlphaMask & ~lower_;
899 nrunes_ = Runemax+1 - nrunes_;
900 }
901
902 // Character class is a sorted list of ranges.
903 // The ranges are allocated in the same block as the header,
904 // necessitating a special allocator and Delete method.
905
New(int maxranges)906 CharClass* CharClass::New(int maxranges) {
907 CharClass* cc;
908 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
909 cc = reinterpret_cast<CharClass*>(data);
910 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
911 cc->nranges_ = 0;
912 cc->folds_ascii_ = false;
913 cc->nrunes_ = 0;
914 return cc;
915 }
916
Delete()917 void CharClass::Delete() {
918 uint8_t* data = reinterpret_cast<uint8_t*>(this);
919 delete[] data;
920 }
921
Negate()922 CharClass* CharClass::Negate() {
923 CharClass* cc = CharClass::New(nranges_+1);
924 cc->folds_ascii_ = folds_ascii_;
925 cc->nrunes_ = Runemax + 1 - nrunes_;
926 int n = 0;
927 int nextlo = 0;
928 for (CharClass::iterator it = begin(); it != end(); ++it) {
929 if (it->lo == nextlo) {
930 nextlo = it->hi + 1;
931 } else {
932 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
933 nextlo = it->hi + 1;
934 }
935 }
936 if (nextlo <= Runemax)
937 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
938 cc->nranges_ = n;
939 return cc;
940 }
941
Contains(Rune r)942 bool CharClass::Contains(Rune r) {
943 RuneRange* rr = ranges_;
944 int n = nranges_;
945 while (n > 0) {
946 int m = n/2;
947 if (rr[m].hi < r) {
948 rr += m+1;
949 n -= m+1;
950 } else if (r < rr[m].lo) {
951 n = m;
952 } else { // rr[m].lo <= r && r <= rr[m].hi
953 return true;
954 }
955 }
956 return false;
957 }
958
GetCharClass()959 CharClass* CharClassBuilder::GetCharClass() {
960 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
961 int n = 0;
962 for (iterator it = begin(); it != end(); ++it)
963 cc->ranges_[n++] = *it;
964 cc->nranges_ = n;
965 DCHECK_LE(n, static_cast<int>(ranges_.size()));
966 cc->nrunes_ = nrunes_;
967 cc->folds_ascii_ = FoldsASCII();
968 return cc;
969 }
970
971 } // namespace re2
972