1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "re2/regexp.h"
9
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 #include <algorithm>
14 #include <map>
15 #include <mutex>
16 #include <string>
17 #include <vector>
18
19 #include "util/util.h"
20 #include "util/logging.h"
21 #include "util/mutex.h"
22 #include "util/utf.h"
23 #include "re2/pod_array.h"
24 #include "re2/stringpiece.h"
25 #include "re2/walker-inl.h"
26
27 namespace re2 {
28
29 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)30 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
31 : op_(static_cast<uint8_t>(op)),
32 simple_(false),
33 parse_flags_(static_cast<uint16_t>(parse_flags)),
34 ref_(1),
35 nsub_(0),
36 down_(NULL) {
37 subone_ = NULL;
38 memset(the_union_, 0, sizeof the_union_);
39 }
40
41 // Destructor. Assumes already cleaned up children.
42 // Private: use Decref() instead of delete to destroy Regexps.
43 // Can't call Decref on the sub-Regexps here because
44 // that could cause arbitrarily deep recursion, so
45 // required Decref() to have handled them for us.
~Regexp()46 Regexp::~Regexp() {
47 if (nsub_ > 0)
48 LOG(DFATAL) << "Regexp not destroyed.";
49
50 switch (op_) {
51 default:
52 break;
53 case kRegexpCapture:
54 delete name_;
55 break;
56 case kRegexpLiteralString:
57 delete[] runes_;
58 break;
59 case kRegexpCharClass:
60 if (cc_)
61 cc_->Delete();
62 delete ccb_;
63 break;
64 }
65 }
66
67 // If it's possible to destroy this regexp without recurring,
68 // do so and return true. Else return false.
QuickDestroy()69 bool Regexp::QuickDestroy() {
70 if (nsub_ == 0) {
71 delete this;
72 return true;
73 }
74 return false;
75 }
76
77 // Lazily allocated.
78 static Mutex* ref_mutex;
79 static std::map<Regexp*, int>* ref_map;
80
Ref()81 int Regexp::Ref() {
82 if (ref_ < kMaxRef)
83 return ref_;
84
85 MutexLock l(ref_mutex);
86 return (*ref_map)[this];
87 }
88
89 // Increments reference count, returns object as convenience.
Incref()90 Regexp* Regexp::Incref() {
91 if (ref_ >= kMaxRef-1) {
92 static std::once_flag ref_once;
93 std::call_once(ref_once, []() {
94 ref_mutex = new Mutex;
95 ref_map = new std::map<Regexp*, int>;
96 });
97
98 // Store ref count in overflow map.
99 MutexLock l(ref_mutex);
100 if (ref_ == kMaxRef) {
101 // already overflowed
102 (*ref_map)[this]++;
103 } else {
104 // overflowing now
105 (*ref_map)[this] = kMaxRef;
106 ref_ = kMaxRef;
107 }
108 return this;
109 }
110
111 ref_++;
112 return this;
113 }
114
115 // Decrements reference count and deletes this object if count reaches 0.
Decref()116 void Regexp::Decref() {
117 if (ref_ == kMaxRef) {
118 // Ref count is stored in overflow map.
119 MutexLock l(ref_mutex);
120 int r = (*ref_map)[this] - 1;
121 if (r < kMaxRef) {
122 ref_ = static_cast<uint16_t>(r);
123 ref_map->erase(this);
124 } else {
125 (*ref_map)[this] = r;
126 }
127 return;
128 }
129 ref_--;
130 if (ref_ == 0)
131 Destroy();
132 }
133
134 // Deletes this object; ref count has count reached 0.
Destroy()135 void Regexp::Destroy() {
136 if (QuickDestroy())
137 return;
138
139 // Handle recursive Destroy with explicit stack
140 // to avoid arbitrarily deep recursion on process stack [sigh].
141 down_ = NULL;
142 Regexp* stack = this;
143 while (stack != NULL) {
144 Regexp* re = stack;
145 stack = re->down_;
146 if (re->ref_ != 0)
147 LOG(DFATAL) << "Bad reference count " << re->ref_;
148 if (re->nsub_ > 0) {
149 Regexp** subs = re->sub();
150 for (int i = 0; i < re->nsub_; i++) {
151 Regexp* sub = subs[i];
152 if (sub == NULL)
153 continue;
154 if (sub->ref_ == kMaxRef)
155 sub->Decref();
156 else
157 --sub->ref_;
158 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
159 sub->down_ = stack;
160 stack = sub;
161 }
162 }
163 if (re->nsub_ > 1)
164 delete[] subs;
165 re->nsub_ = 0;
166 }
167 delete re;
168 }
169 }
170
AddRuneToString(Rune r)171 void Regexp::AddRuneToString(Rune r) {
172 DCHECK(op_ == kRegexpLiteralString);
173 if (nrunes_ == 0) {
174 // start with 8
175 runes_ = new Rune[8];
176 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
177 // double on powers of two
178 Rune *old = runes_;
179 runes_ = new Rune[nrunes_ * 2];
180 for (int i = 0; i < nrunes_; i++)
181 runes_[i] = old[i];
182 delete[] old;
183 }
184
185 runes_[nrunes_++] = r;
186 }
187
HaveMatch(int match_id,ParseFlags flags)188 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
189 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
190 re->match_id_ = match_id;
191 return re;
192 }
193
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)194 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
195 // Squash **, ++ and ??.
196 if (op == sub->op() && flags == sub->parse_flags())
197 return sub;
198
199 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
200 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
201 if ((sub->op() == kRegexpStar ||
202 sub->op() == kRegexpPlus ||
203 sub->op() == kRegexpQuest) &&
204 flags == sub->parse_flags()) {
205 // If sub is Star, no need to rewrite it.
206 if (sub->op() == kRegexpStar)
207 return sub;
208
209 // Rewrite sub to Star.
210 Regexp* re = new Regexp(kRegexpStar, flags);
211 re->AllocSub(1);
212 re->sub()[0] = sub->sub()[0]->Incref();
213 sub->Decref(); // We didn't consume the reference after all.
214 return re;
215 }
216
217 Regexp* re = new Regexp(op, flags);
218 re->AllocSub(1);
219 re->sub()[0] = sub;
220 return re;
221 }
222
Plus(Regexp * sub,ParseFlags flags)223 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
224 return StarPlusOrQuest(kRegexpPlus, sub, flags);
225 }
226
Star(Regexp * sub,ParseFlags flags)227 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
228 return StarPlusOrQuest(kRegexpStar, sub, flags);
229 }
230
Quest(Regexp * sub,ParseFlags flags)231 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
232 return StarPlusOrQuest(kRegexpQuest, sub, flags);
233 }
234
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)235 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
236 ParseFlags flags, bool can_factor) {
237 if (nsub == 1)
238 return sub[0];
239
240 if (nsub == 0) {
241 if (op == kRegexpAlternate)
242 return new Regexp(kRegexpNoMatch, flags);
243 else
244 return new Regexp(kRegexpEmptyMatch, flags);
245 }
246
247 PODArray<Regexp*> subcopy;
248 if (op == kRegexpAlternate && can_factor) {
249 // Going to edit sub; make a copy so we don't step on caller.
250 subcopy = PODArray<Regexp*>(nsub);
251 memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
252 sub = subcopy.data();
253 nsub = FactorAlternation(sub, nsub, flags);
254 if (nsub == 1) {
255 Regexp* re = sub[0];
256 return re;
257 }
258 }
259
260 if (nsub > kMaxNsub) {
261 // Too many subexpressions to fit in a single Regexp.
262 // Make a two-level tree. Two levels gets us to 65535^2.
263 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
264 Regexp* re = new Regexp(op, flags);
265 re->AllocSub(nbigsub);
266 Regexp** subs = re->sub();
267 for (int i = 0; i < nbigsub - 1; i++)
268 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
269 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
270 nsub - (nbigsub-1)*kMaxNsub, flags,
271 false);
272 return re;
273 }
274
275 Regexp* re = new Regexp(op, flags);
276 re->AllocSub(nsub);
277 Regexp** subs = re->sub();
278 for (int i = 0; i < nsub; i++)
279 subs[i] = sub[i];
280 return re;
281 }
282
Concat(Regexp ** sub,int nsub,ParseFlags flags)283 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
284 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
285 }
286
Alternate(Regexp ** sub,int nsub,ParseFlags flags)287 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
288 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
289 }
290
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)291 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
292 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
293 }
294
Capture(Regexp * sub,ParseFlags flags,int cap)295 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
296 Regexp* re = new Regexp(kRegexpCapture, flags);
297 re->AllocSub(1);
298 re->sub()[0] = sub;
299 re->cap_ = cap;
300 return re;
301 }
302
Repeat(Regexp * sub,ParseFlags flags,int min,int max)303 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
304 Regexp* re = new Regexp(kRegexpRepeat, flags);
305 re->AllocSub(1);
306 re->sub()[0] = sub;
307 re->min_ = min;
308 re->max_ = max;
309 return re;
310 }
311
NewLiteral(Rune rune,ParseFlags flags)312 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
313 Regexp* re = new Regexp(kRegexpLiteral, flags);
314 re->rune_ = rune;
315 return re;
316 }
317
LiteralString(Rune * runes,int nrunes,ParseFlags flags)318 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
319 if (nrunes <= 0)
320 return new Regexp(kRegexpEmptyMatch, flags);
321 if (nrunes == 1)
322 return NewLiteral(runes[0], flags);
323 Regexp* re = new Regexp(kRegexpLiteralString, flags);
324 for (int i = 0; i < nrunes; i++)
325 re->AddRuneToString(runes[i]);
326 return re;
327 }
328
NewCharClass(CharClass * cc,ParseFlags flags)329 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
330 Regexp* re = new Regexp(kRegexpCharClass, flags);
331 re->cc_ = cc;
332 return re;
333 }
334
Swap(Regexp * that)335 void Regexp::Swap(Regexp* that) {
336 // Regexp is not trivially copyable, so we cannot freely copy it with
337 // memmove(3), but swapping objects like so is safe for our purposes.
338 char tmp[sizeof *this];
339 void* vthis = reinterpret_cast<void*>(this);
340 void* vthat = reinterpret_cast<void*>(that);
341 memmove(tmp, vthis, sizeof *this);
342 memmove(vthis, vthat, sizeof *this);
343 memmove(vthat, tmp, sizeof *this);
344 }
345
346 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)347 static bool TopEqual(Regexp* a, Regexp* b) {
348 if (a->op() != b->op())
349 return false;
350
351 switch (a->op()) {
352 case kRegexpNoMatch:
353 case kRegexpEmptyMatch:
354 case kRegexpAnyChar:
355 case kRegexpAnyByte:
356 case kRegexpBeginLine:
357 case kRegexpEndLine:
358 case kRegexpWordBoundary:
359 case kRegexpNoWordBoundary:
360 case kRegexpBeginText:
361 return true;
362
363 case kRegexpEndText:
364 // The parse flags remember whether it's \z or (?-m:$),
365 // which matters when testing against PCRE.
366 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
367
368 case kRegexpLiteral:
369 return a->rune() == b->rune() &&
370 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
371
372 case kRegexpLiteralString:
373 return a->nrunes() == b->nrunes() &&
374 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
375 memcmp(a->runes(), b->runes(),
376 a->nrunes() * sizeof a->runes()[0]) == 0;
377
378 case kRegexpAlternate:
379 case kRegexpConcat:
380 return a->nsub() == b->nsub();
381
382 case kRegexpStar:
383 case kRegexpPlus:
384 case kRegexpQuest:
385 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
386
387 case kRegexpRepeat:
388 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
389 a->min() == b->min() &&
390 a->max() == b->max();
391
392 case kRegexpCapture:
393 return a->cap() == b->cap() && a->name() == b->name();
394
395 case kRegexpHaveMatch:
396 return a->match_id() == b->match_id();
397
398 case kRegexpCharClass: {
399 CharClass* acc = a->cc();
400 CharClass* bcc = b->cc();
401 return acc->size() == bcc->size() &&
402 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
403 memcmp(acc->begin(), bcc->begin(),
404 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
405 }
406 }
407
408 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
409 return 0;
410 }
411
Equal(Regexp * a,Regexp * b)412 bool Regexp::Equal(Regexp* a, Regexp* b) {
413 if (a == NULL || b == NULL)
414 return a == b;
415
416 if (!TopEqual(a, b))
417 return false;
418
419 // Fast path:
420 // return without allocating vector if there are no subregexps.
421 switch (a->op()) {
422 case kRegexpAlternate:
423 case kRegexpConcat:
424 case kRegexpStar:
425 case kRegexpPlus:
426 case kRegexpQuest:
427 case kRegexpRepeat:
428 case kRegexpCapture:
429 break;
430
431 default:
432 return true;
433 }
434
435 // Committed to doing real work.
436 // The stack (vector) has pairs of regexps waiting to
437 // be compared. The regexps are only equal if
438 // all the pairs end up being equal.
439 std::vector<Regexp*> stk;
440
441 for (;;) {
442 // Invariant: TopEqual(a, b) == true.
443 Regexp* a2;
444 Regexp* b2;
445 switch (a->op()) {
446 default:
447 break;
448 case kRegexpAlternate:
449 case kRegexpConcat:
450 for (int i = 0; i < a->nsub(); i++) {
451 a2 = a->sub()[i];
452 b2 = b->sub()[i];
453 if (!TopEqual(a2, b2))
454 return false;
455 stk.push_back(a2);
456 stk.push_back(b2);
457 }
458 break;
459
460 case kRegexpStar:
461 case kRegexpPlus:
462 case kRegexpQuest:
463 case kRegexpRepeat:
464 case kRegexpCapture:
465 a2 = a->sub()[0];
466 b2 = b->sub()[0];
467 if (!TopEqual(a2, b2))
468 return false;
469 // Really:
470 // stk.push_back(a2);
471 // stk.push_back(b2);
472 // break;
473 // but faster to assign directly and loop.
474 a = a2;
475 b = b2;
476 continue;
477 }
478
479 size_t n = stk.size();
480 if (n == 0)
481 break;
482
483 DCHECK_GE(n, 2);
484 a = stk[n-2];
485 b = stk[n-1];
486 stk.resize(n-2);
487 }
488
489 return true;
490 }
491
492 // Keep in sync with enum RegexpStatusCode in regexp.h
493 static const char *kErrorStrings[] = {
494 "no error",
495 "unexpected error",
496 "invalid escape sequence",
497 "invalid character class",
498 "invalid character class range",
499 "missing ]",
500 "missing )",
501 "trailing \\",
502 "no argument for repetition operator",
503 "invalid repetition size",
504 "bad repetition operator",
505 "invalid perl operator",
506 "invalid UTF-8",
507 "invalid named capture group",
508 };
509
CodeText(enum RegexpStatusCode code)510 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
511 if (code < 0 || code >= arraysize(kErrorStrings))
512 code = kRegexpInternalError;
513 return kErrorStrings[code];
514 }
515
Text() const516 std::string RegexpStatus::Text() const {
517 if (error_arg_.empty())
518 return CodeText(code_);
519 std::string s;
520 s.append(CodeText(code_));
521 s.append(": ");
522 s.append(error_arg_.data(), error_arg_.size());
523 return s;
524 }
525
Copy(const RegexpStatus & status)526 void RegexpStatus::Copy(const RegexpStatus& status) {
527 code_ = status.code_;
528 error_arg_ = status.error_arg_;
529 }
530
531 typedef int Ignored; // Walker<void> doesn't exist
532
533 // Walker subclass to count capturing parens in regexp.
534 class NumCapturesWalker : public Regexp::Walker<Ignored> {
535 public:
NumCapturesWalker()536 NumCapturesWalker() : ncapture_(0) {}
ncapture()537 int ncapture() { return ncapture_; }
538
PreVisit(Regexp * re,Ignored ignored,bool * stop)539 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
540 if (re->op() == kRegexpCapture)
541 ncapture_++;
542 return ignored;
543 }
544
ShortVisit(Regexp * re,Ignored ignored)545 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
546 // Should never be called: we use Walk(), not WalkExponential().
547 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
548 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
549 #endif
550 return ignored;
551 }
552
553 private:
554 int ncapture_;
555
556 NumCapturesWalker(const NumCapturesWalker&) = delete;
557 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
558 };
559
NumCaptures()560 int Regexp::NumCaptures() {
561 NumCapturesWalker w;
562 w.Walk(this, 0);
563 return w.ncapture();
564 }
565
566 // Walker class to build map of named capture groups and their indices.
567 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
568 public:
NamedCapturesWalker()569 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()570 ~NamedCapturesWalker() { delete map_; }
571
TakeMap()572 std::map<std::string, int>* TakeMap() {
573 std::map<std::string, int>* m = map_;
574 map_ = NULL;
575 return m;
576 }
577
PreVisit(Regexp * re,Ignored ignored,bool * stop)578 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
579 if (re->op() == kRegexpCapture && re->name() != NULL) {
580 // Allocate map once we find a name.
581 if (map_ == NULL)
582 map_ = new std::map<std::string, int>;
583
584 // Record first occurrence of each name.
585 // (The rule is that if you have the same name
586 // multiple times, only the leftmost one counts.)
587 if (map_->find(*re->name()) == map_->end())
588 (*map_)[*re->name()] = re->cap();
589 }
590 return ignored;
591 }
592
ShortVisit(Regexp * re,Ignored ignored)593 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
594 // Should never be called: we use Walk(), not WalkExponential().
595 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
596 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
597 #endif
598 return ignored;
599 }
600
601 private:
602 std::map<std::string, int>* map_;
603
604 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
605 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
606 };
607
NamedCaptures()608 std::map<std::string, int>* Regexp::NamedCaptures() {
609 NamedCapturesWalker w;
610 w.Walk(this, 0);
611 return w.TakeMap();
612 }
613
614 // Walker class to build map from capture group indices to their names.
615 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
616 public:
CaptureNamesWalker()617 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()618 ~CaptureNamesWalker() { delete map_; }
619
TakeMap()620 std::map<int, std::string>* TakeMap() {
621 std::map<int, std::string>* m = map_;
622 map_ = NULL;
623 return m;
624 }
625
PreVisit(Regexp * re,Ignored ignored,bool * stop)626 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
627 if (re->op() == kRegexpCapture && re->name() != NULL) {
628 // Allocate map once we find a name.
629 if (map_ == NULL)
630 map_ = new std::map<int, std::string>;
631
632 (*map_)[re->cap()] = *re->name();
633 }
634 return ignored;
635 }
636
ShortVisit(Regexp * re,Ignored ignored)637 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
638 // Should never be called: we use Walk(), not WalkExponential().
639 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
640 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
641 #endif
642 return ignored;
643 }
644
645 private:
646 std::map<int, std::string>* map_;
647
648 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
649 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
650 };
651
CaptureNames()652 std::map<int, std::string>* Regexp::CaptureNames() {
653 CaptureNamesWalker w;
654 w.Walk(this, 0);
655 return w.TakeMap();
656 }
657
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)658 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
659 std::string* bytes) {
660 if (latin1) {
661 bytes->resize(nrunes);
662 for (int i = 0; i < nrunes; i++)
663 (*bytes)[i] = static_cast<char>(runes[i]);
664 } else {
665 bytes->resize(nrunes * UTFmax); // worst case
666 char* p = &(*bytes)[0];
667 for (int i = 0; i < nrunes; i++)
668 p += runetochar(p, &runes[i]);
669 bytes->resize(p - &(*bytes)[0]);
670 bytes->shrink_to_fit();
671 }
672 }
673
674 // Determines whether regexp matches must be anchored
675 // with a fixed string prefix. If so, returns the prefix and
676 // the regexp that remains after the prefix. The prefix might
677 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)678 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
679 Regexp** suffix) {
680 prefix->clear();
681 *foldcase = false;
682 *suffix = NULL;
683
684 // No need for a walker: the regexp must be of the form
685 // 1. some number of ^ anchors
686 // 2. a literal char or string
687 // 3. the rest
688 if (op_ != kRegexpConcat)
689 return false;
690 int i = 0;
691 while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
692 i++;
693 if (i == 0 || i >= nsub_)
694 return false;
695 Regexp* re = sub()[i];
696 if (re->op_ != kRegexpLiteral &&
697 re->op_ != kRegexpLiteralString)
698 return false;
699 i++;
700 if (i < nsub_) {
701 for (int j = i; j < nsub_; j++)
702 sub()[j]->Incref();
703 *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
704 } else {
705 *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
706 }
707
708 bool latin1 = (re->parse_flags() & Latin1) != 0;
709 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
710 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
711 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
712 *foldcase = (re->parse_flags() & FoldCase) != 0;
713 return true;
714 }
715
716 // Determines whether regexp matches must be unanchored
717 // with a fixed string prefix. If so, returns the prefix.
718 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)719 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
720 prefix->clear();
721 *foldcase = false;
722
723 // No need for a walker: the regexp must either begin with or be
724 // a literal char or string.
725 Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
726 if (re->op_ != kRegexpLiteral &&
727 re->op_ != kRegexpLiteralString)
728 return false;
729
730 bool latin1 = (re->parse_flags() & Latin1) != 0;
731 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
732 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
733 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
734 *foldcase = (re->parse_flags() & FoldCase) != 0;
735 return true;
736 }
737
738 // Character class builder is a balanced binary tree (STL set)
739 // containing non-overlapping, non-abutting RuneRanges.
740 // The less-than operator used in the tree treats two
741 // ranges as equal if they overlap at all, so that
742 // lookups for a particular Rune are possible.
743
CharClassBuilder()744 CharClassBuilder::CharClassBuilder() {
745 nrunes_ = 0;
746 upper_ = 0;
747 lower_ = 0;
748 }
749
750 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)751 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
752 if (hi < lo)
753 return false;
754
755 if (lo <= 'z' && hi >= 'A') {
756 // Overlaps some alpha, maybe not all.
757 // Update bitmaps telling which ASCII letters are in the set.
758 Rune lo1 = std::max<Rune>(lo, 'A');
759 Rune hi1 = std::min<Rune>(hi, 'Z');
760 if (lo1 <= hi1)
761 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
762
763 lo1 = std::max<Rune>(lo, 'a');
764 hi1 = std::min<Rune>(hi, 'z');
765 if (lo1 <= hi1)
766 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
767 }
768
769 { // Check whether lo, hi is already in the class.
770 iterator it = ranges_.find(RuneRange(lo, lo));
771 if (it != end() && it->lo <= lo && hi <= it->hi)
772 return false;
773 }
774
775 // Look for a range abutting lo on the left.
776 // If it exists, take it out and increase our range.
777 if (lo > 0) {
778 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
779 if (it != end()) {
780 lo = it->lo;
781 if (it->hi > hi)
782 hi = it->hi;
783 nrunes_ -= it->hi - it->lo + 1;
784 ranges_.erase(it);
785 }
786 }
787
788 // Look for a range abutting hi on the right.
789 // If it exists, take it out and increase our range.
790 if (hi < Runemax) {
791 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
792 if (it != end()) {
793 hi = it->hi;
794 nrunes_ -= it->hi - it->lo + 1;
795 ranges_.erase(it);
796 }
797 }
798
799 // Look for ranges between lo and hi. Take them out.
800 // This is only safe because the set has no overlapping ranges.
801 // We've already removed any ranges abutting lo and hi, so
802 // any that overlap [lo, hi] must be contained within it.
803 for (;;) {
804 iterator it = ranges_.find(RuneRange(lo, hi));
805 if (it == end())
806 break;
807 nrunes_ -= it->hi - it->lo + 1;
808 ranges_.erase(it);
809 }
810
811 // Finally, add [lo, hi].
812 nrunes_ += hi - lo + 1;
813 ranges_.insert(RuneRange(lo, hi));
814 return true;
815 }
816
AddCharClass(CharClassBuilder * cc)817 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
818 for (iterator it = cc->begin(); it != cc->end(); ++it)
819 AddRange(it->lo, it->hi);
820 }
821
Contains(Rune r)822 bool CharClassBuilder::Contains(Rune r) {
823 return ranges_.find(RuneRange(r, r)) != end();
824 }
825
826 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()827 bool CharClassBuilder::FoldsASCII() {
828 return ((upper_ ^ lower_) & AlphaMask) == 0;
829 }
830
Copy()831 CharClassBuilder* CharClassBuilder::Copy() {
832 CharClassBuilder* cc = new CharClassBuilder;
833 for (iterator it = begin(); it != end(); ++it)
834 cc->ranges_.insert(RuneRange(it->lo, it->hi));
835 cc->upper_ = upper_;
836 cc->lower_ = lower_;
837 cc->nrunes_ = nrunes_;
838 return cc;
839 }
840
841
842
RemoveAbove(Rune r)843 void CharClassBuilder::RemoveAbove(Rune r) {
844 if (r >= Runemax)
845 return;
846
847 if (r < 'z') {
848 if (r < 'a')
849 lower_ = 0;
850 else
851 lower_ &= AlphaMask >> ('z' - r);
852 }
853
854 if (r < 'Z') {
855 if (r < 'A')
856 upper_ = 0;
857 else
858 upper_ &= AlphaMask >> ('Z' - r);
859 }
860
861 for (;;) {
862
863 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
864 if (it == end())
865 break;
866 RuneRange rr = *it;
867 ranges_.erase(it);
868 nrunes_ -= rr.hi - rr.lo + 1;
869 if (rr.lo <= r) {
870 rr.hi = r;
871 ranges_.insert(rr);
872 nrunes_ += rr.hi - rr.lo + 1;
873 }
874 }
875 }
876
Negate()877 void CharClassBuilder::Negate() {
878 // Build up negation and then copy in.
879 // Could edit ranges in place, but C++ won't let me.
880 std::vector<RuneRange> v;
881 v.reserve(ranges_.size() + 1);
882
883 // In negation, first range begins at 0, unless
884 // the current class begins at 0.
885 iterator it = begin();
886 if (it == end()) {
887 v.push_back(RuneRange(0, Runemax));
888 } else {
889 int nextlo = 0;
890 if (it->lo == 0) {
891 nextlo = it->hi + 1;
892 ++it;
893 }
894 for (; it != end(); ++it) {
895 v.push_back(RuneRange(nextlo, it->lo - 1));
896 nextlo = it->hi + 1;
897 }
898 if (nextlo <= Runemax)
899 v.push_back(RuneRange(nextlo, Runemax));
900 }
901
902 ranges_.clear();
903 for (size_t i = 0; i < v.size(); i++)
904 ranges_.insert(v[i]);
905
906 upper_ = AlphaMask & ~upper_;
907 lower_ = AlphaMask & ~lower_;
908 nrunes_ = Runemax+1 - nrunes_;
909 }
910
911 // Character class is a sorted list of ranges.
912 // The ranges are allocated in the same block as the header,
913 // necessitating a special allocator and Delete method.
914
New(int maxranges)915 CharClass* CharClass::New(int maxranges) {
916 CharClass* cc;
917 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
918 cc = reinterpret_cast<CharClass*>(data);
919 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
920 cc->nranges_ = 0;
921 cc->folds_ascii_ = false;
922 cc->nrunes_ = 0;
923 return cc;
924 }
925
Delete()926 void CharClass::Delete() {
927 uint8_t* data = reinterpret_cast<uint8_t*>(this);
928 delete[] data;
929 }
930
Negate()931 CharClass* CharClass::Negate() {
932 CharClass* cc = CharClass::New(nranges_+1);
933 cc->folds_ascii_ = folds_ascii_;
934 cc->nrunes_ = Runemax + 1 - nrunes_;
935 int n = 0;
936 int nextlo = 0;
937 for (CharClass::iterator it = begin(); it != end(); ++it) {
938 if (it->lo == nextlo) {
939 nextlo = it->hi + 1;
940 } else {
941 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
942 nextlo = it->hi + 1;
943 }
944 }
945 if (nextlo <= Runemax)
946 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
947 cc->nranges_ = n;
948 return cc;
949 }
950
Contains(Rune r)951 bool CharClass::Contains(Rune r) {
952 RuneRange* rr = ranges_;
953 int n = nranges_;
954 while (n > 0) {
955 int m = n/2;
956 if (rr[m].hi < r) {
957 rr += m+1;
958 n -= m+1;
959 } else if (r < rr[m].lo) {
960 n = m;
961 } else { // rr[m].lo <= r && r <= rr[m].hi
962 return true;
963 }
964 }
965 return false;
966 }
967
GetCharClass()968 CharClass* CharClassBuilder::GetCharClass() {
969 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
970 int n = 0;
971 for (iterator it = begin(); it != end(); ++it)
972 cc->ranges_[n++] = *it;
973 cc->nranges_ = n;
974 DCHECK_LE(n, static_cast<int>(ranges_.size()));
975 cc->nrunes_ = nrunes_;
976 cc->folds_ascii_ = FoldsASCII();
977 return cc;
978 }
979
980 } // namespace re2
981