• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression representation.
6 // Tested by parse_test.cc
7 
8 #include "re2/regexp.h"
9 
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 
14 #include <algorithm>
15 #include <map>
16 #include <string>
17 #include <vector>
18 
19 #include "absl/base/call_once.h"
20 #include "absl/base/macros.h"
21 #include "absl/container/flat_hash_map.h"
22 #include "absl/log/absl_check.h"
23 #include "absl/log/absl_log.h"
24 #include "absl/synchronization/mutex.h"
25 #include "re2/pod_array.h"
26 #include "re2/walker-inl.h"
27 #include "util/utf.h"
28 
29 namespace re2 {
30 
31 // Constructor.  Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)32 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
33   : op_(static_cast<uint8_t>(op)),
34     simple_(false),
35     parse_flags_(static_cast<uint16_t>(parse_flags)),
36     ref_(1),
37     nsub_(0),
38     down_(NULL) {
39   subone_ = NULL;
40   memset(the_union_, 0, sizeof the_union_);
41 }
42 
43 // Destructor.  Assumes already cleaned up children.
44 // Private: use Decref() instead of delete to destroy Regexps.
45 // Can't call Decref on the sub-Regexps here because
46 // that could cause arbitrarily deep recursion, so
47 // required Decref() to have handled them for us.
~Regexp()48 Regexp::~Regexp() {
49   if (nsub_ > 0)
50     ABSL_LOG(DFATAL) << "Regexp not destroyed.";
51 
52   switch (op_) {
53     default:
54       break;
55     case kRegexpCapture:
56       delete name_;
57       break;
58     case kRegexpLiteralString:
59       delete[] runes_;
60       break;
61     case kRegexpCharClass:
62       if (cc_)
63         cc_->Delete();
64       delete ccb_;
65       break;
66   }
67 }
68 
69 // If it's possible to destroy this regexp without recurring,
70 // do so and return true.  Else return false.
QuickDestroy()71 bool Regexp::QuickDestroy() {
72   if (nsub_ == 0) {
73     delete this;
74     return true;
75   }
76   return false;
77 }
78 
79 // Similar to EmptyStorage in re2.cc.
80 struct RefStorage {
81   absl::Mutex ref_mutex;
82   absl::flat_hash_map<Regexp*, int> ref_map;
83 };
84 alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
85 
ref_mutex()86 static inline absl::Mutex* ref_mutex() {
87   return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
88 }
89 
ref_map()90 static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
91   return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
92 }
93 
Ref()94 int Regexp::Ref() {
95   if (ref_ < kMaxRef)
96     return ref_;
97 
98   absl::MutexLock l(ref_mutex());
99   return (*ref_map())[this];
100 }
101 
102 // Increments reference count, returns object as convenience.
Incref()103 Regexp* Regexp::Incref() {
104   if (ref_ >= kMaxRef-1) {
105     static absl::once_flag ref_once;
106     absl::call_once(ref_once, []() {
107       (void) new (ref_storage) RefStorage;
108     });
109 
110     // Store ref count in overflow map.
111     absl::MutexLock l(ref_mutex());
112     if (ref_ == kMaxRef) {
113       // already overflowed
114       (*ref_map())[this]++;
115     } else {
116       // overflowing now
117       (*ref_map())[this] = kMaxRef;
118       ref_ = kMaxRef;
119     }
120     return this;
121   }
122 
123   ref_++;
124   return this;
125 }
126 
127 // Decrements reference count and deletes this object if count reaches 0.
Decref()128 void Regexp::Decref() {
129   if (ref_ == kMaxRef) {
130     // Ref count is stored in overflow map.
131     absl::MutexLock l(ref_mutex());
132     int r = (*ref_map())[this] - 1;
133     if (r < kMaxRef) {
134       ref_ = static_cast<uint16_t>(r);
135       ref_map()->erase(this);
136     } else {
137       (*ref_map())[this] = r;
138     }
139     return;
140   }
141   ref_--;
142   if (ref_ == 0)
143     Destroy();
144 }
145 
146 // Deletes this object; ref count has count reached 0.
Destroy()147 void Regexp::Destroy() {
148   if (QuickDestroy())
149     return;
150 
151   // Handle recursive Destroy with explicit stack
152   // to avoid arbitrarily deep recursion on process stack [sigh].
153   down_ = NULL;
154   Regexp* stack = this;
155   while (stack != NULL) {
156     Regexp* re = stack;
157     stack = re->down_;
158     if (re->ref_ != 0)
159       ABSL_LOG(DFATAL) << "Bad reference count " << re->ref_;
160     if (re->nsub_ > 0) {
161       Regexp** subs = re->sub();
162       for (int i = 0; i < re->nsub_; i++) {
163         Regexp* sub = subs[i];
164         if (sub == NULL)
165           continue;
166         if (sub->ref_ == kMaxRef)
167           sub->Decref();
168         else
169           --sub->ref_;
170         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
171           sub->down_ = stack;
172           stack = sub;
173         }
174       }
175       if (re->nsub_ > 1)
176         delete[] subs;
177       re->nsub_ = 0;
178     }
179     delete re;
180   }
181 }
182 
AddRuneToString(Rune r)183 void Regexp::AddRuneToString(Rune r) {
184   ABSL_DCHECK(op_ == kRegexpLiteralString);
185   if (nrunes_ == 0) {
186     // start with 8
187     runes_ = new Rune[8];
188   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
189     // double on powers of two
190     Rune *old = runes_;
191     runes_ = new Rune[nrunes_ * 2];
192     for (int i = 0; i < nrunes_; i++)
193       runes_[i] = old[i];
194     delete[] old;
195   }
196 
197   runes_[nrunes_++] = r;
198 }
199 
HaveMatch(int match_id,ParseFlags flags)200 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
201   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
202   re->match_id_ = match_id;
203   return re;
204 }
205 
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)206 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
207   // Squash **, ++ and ??.
208   if (op == sub->op() && flags == sub->parse_flags())
209     return sub;
210 
211   // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
212   // op is Star/Plus/Quest, we just have to check that sub->op() is too.
213   if ((sub->op() == kRegexpStar ||
214        sub->op() == kRegexpPlus ||
215        sub->op() == kRegexpQuest) &&
216       flags == sub->parse_flags()) {
217     // If sub is Star, no need to rewrite it.
218     if (sub->op() == kRegexpStar)
219       return sub;
220 
221     // Rewrite sub to Star.
222     Regexp* re = new Regexp(kRegexpStar, flags);
223     re->AllocSub(1);
224     re->sub()[0] = sub->sub()[0]->Incref();
225     sub->Decref();  // We didn't consume the reference after all.
226     return re;
227   }
228 
229   Regexp* re = new Regexp(op, flags);
230   re->AllocSub(1);
231   re->sub()[0] = sub;
232   return re;
233 }
234 
Plus(Regexp * sub,ParseFlags flags)235 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
236   return StarPlusOrQuest(kRegexpPlus, sub, flags);
237 }
238 
Star(Regexp * sub,ParseFlags flags)239 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
240   return StarPlusOrQuest(kRegexpStar, sub, flags);
241 }
242 
Quest(Regexp * sub,ParseFlags flags)243 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
244   return StarPlusOrQuest(kRegexpQuest, sub, flags);
245 }
246 
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)247 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
248                                   ParseFlags flags, bool can_factor) {
249   if (nsub == 1)
250     return sub[0];
251 
252   if (nsub == 0) {
253     if (op == kRegexpAlternate)
254       return new Regexp(kRegexpNoMatch, flags);
255     else
256       return new Regexp(kRegexpEmptyMatch, flags);
257   }
258 
259   PODArray<Regexp*> subcopy;
260   if (op == kRegexpAlternate && can_factor) {
261     // Going to edit sub; make a copy so we don't step on caller.
262     subcopy = PODArray<Regexp*>(nsub);
263     memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
264     sub = subcopy.data();
265     nsub = FactorAlternation(sub, nsub, flags);
266     if (nsub == 1) {
267       Regexp* re = sub[0];
268       return re;
269     }
270   }
271 
272   if (nsub > kMaxNsub) {
273     // Too many subexpressions to fit in a single Regexp.
274     // Make a two-level tree.  Two levels gets us to 65535^2.
275     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
276     Regexp* re = new Regexp(op, flags);
277     re->AllocSub(nbigsub);
278     Regexp** subs = re->sub();
279     for (int i = 0; i < nbigsub - 1; i++)
280       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
281     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
282                                           nsub - (nbigsub-1)*kMaxNsub, flags,
283                                           false);
284     return re;
285   }
286 
287   Regexp* re = new Regexp(op, flags);
288   re->AllocSub(nsub);
289   Regexp** subs = re->sub();
290   for (int i = 0; i < nsub; i++)
291     subs[i] = sub[i];
292   return re;
293 }
294 
Concat(Regexp ** sub,int nsub,ParseFlags flags)295 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
296   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
297 }
298 
Alternate(Regexp ** sub,int nsub,ParseFlags flags)299 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
300   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
301 }
302 
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)303 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
304   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
305 }
306 
Capture(Regexp * sub,ParseFlags flags,int cap)307 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
308   Regexp* re = new Regexp(kRegexpCapture, flags);
309   re->AllocSub(1);
310   re->sub()[0] = sub;
311   re->cap_ = cap;
312   return re;
313 }
314 
Repeat(Regexp * sub,ParseFlags flags,int min,int max)315 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
316   Regexp* re = new Regexp(kRegexpRepeat, flags);
317   re->AllocSub(1);
318   re->sub()[0] = sub;
319   re->min_ = min;
320   re->max_ = max;
321   return re;
322 }
323 
NewLiteral(Rune rune,ParseFlags flags)324 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
325   Regexp* re = new Regexp(kRegexpLiteral, flags);
326   re->rune_ = rune;
327   return re;
328 }
329 
LiteralString(Rune * runes,int nrunes,ParseFlags flags)330 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
331   if (nrunes <= 0)
332     return new Regexp(kRegexpEmptyMatch, flags);
333   if (nrunes == 1)
334     return NewLiteral(runes[0], flags);
335   Regexp* re = new Regexp(kRegexpLiteralString, flags);
336   for (int i = 0; i < nrunes; i++)
337     re->AddRuneToString(runes[i]);
338   return re;
339 }
340 
NewCharClass(CharClass * cc,ParseFlags flags)341 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
342   Regexp* re = new Regexp(kRegexpCharClass, flags);
343   re->cc_ = cc;
344   return re;
345 }
346 
Swap(Regexp * that)347 void Regexp::Swap(Regexp* that) {
348   // Regexp is not trivially copyable, so we cannot freely copy it with
349   // memmove(3), but swapping objects like so is safe for our purposes.
350   char tmp[sizeof *this];
351   void* vthis = reinterpret_cast<void*>(this);
352   void* vthat = reinterpret_cast<void*>(that);
353   memmove(tmp, vthis, sizeof *this);
354   memmove(vthis, vthat, sizeof *this);
355   memmove(vthat, tmp, sizeof *this);
356 }
357 
358 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)359 static bool TopEqual(Regexp* a, Regexp* b) {
360   if (a->op() != b->op())
361     return false;
362 
363   switch (a->op()) {
364     case kRegexpNoMatch:
365     case kRegexpEmptyMatch:
366     case kRegexpAnyChar:
367     case kRegexpAnyByte:
368     case kRegexpBeginLine:
369     case kRegexpEndLine:
370     case kRegexpWordBoundary:
371     case kRegexpNoWordBoundary:
372     case kRegexpBeginText:
373       return true;
374 
375     case kRegexpEndText:
376       // The parse flags remember whether it's \z or (?-m:$),
377       // which matters when testing against PCRE.
378       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
379 
380     case kRegexpLiteral:
381       return a->rune() == b->rune() &&
382              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
383 
384     case kRegexpLiteralString:
385       return a->nrunes() == b->nrunes() &&
386              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
387              memcmp(a->runes(), b->runes(),
388                     a->nrunes() * sizeof a->runes()[0]) == 0;
389 
390     case kRegexpAlternate:
391     case kRegexpConcat:
392       return a->nsub() == b->nsub();
393 
394     case kRegexpStar:
395     case kRegexpPlus:
396     case kRegexpQuest:
397       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
398 
399     case kRegexpRepeat:
400       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
401              a->min() == b->min() &&
402              a->max() == b->max();
403 
404     case kRegexpCapture:
405       if (a->name() == NULL || b->name() == NULL) {
406         // One pointer is null, so the other pointer should also be null.
407         return a->cap() == b->cap() && a->name() == b->name();
408       } else {
409         // Neither pointer is null, so compare the pointees for equality.
410         return a->cap() == b->cap() && *a->name() == *b->name();
411       }
412 
413     case kRegexpHaveMatch:
414       return a->match_id() == b->match_id();
415 
416     case kRegexpCharClass: {
417       CharClass* acc = a->cc();
418       CharClass* bcc = b->cc();
419       return acc->size() == bcc->size() &&
420              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
421              memcmp(acc->begin(), bcc->begin(),
422                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
423     }
424   }
425 
426   ABSL_LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
427   return 0;
428 }
429 
Equal(Regexp * a,Regexp * b)430 bool Regexp::Equal(Regexp* a, Regexp* b) {
431   if (a == NULL || b == NULL)
432     return a == b;
433 
434   if (!TopEqual(a, b))
435     return false;
436 
437   // Fast path:
438   // return without allocating vector if there are no subregexps.
439   switch (a->op()) {
440     case kRegexpAlternate:
441     case kRegexpConcat:
442     case kRegexpStar:
443     case kRegexpPlus:
444     case kRegexpQuest:
445     case kRegexpRepeat:
446     case kRegexpCapture:
447       break;
448 
449     default:
450       return true;
451   }
452 
453   // Committed to doing real work.
454   // The stack (vector) has pairs of regexps waiting to
455   // be compared.  The regexps are only equal if
456   // all the pairs end up being equal.
457   std::vector<Regexp*> stk;
458 
459   for (;;) {
460     // Invariant: TopEqual(a, b) == true.
461     Regexp* a2;
462     Regexp* b2;
463     switch (a->op()) {
464       default:
465         break;
466       case kRegexpAlternate:
467       case kRegexpConcat:
468         for (int i = 0; i < a->nsub(); i++) {
469           a2 = a->sub()[i];
470           b2 = b->sub()[i];
471           if (!TopEqual(a2, b2))
472             return false;
473           stk.push_back(a2);
474           stk.push_back(b2);
475         }
476         break;
477 
478       case kRegexpStar:
479       case kRegexpPlus:
480       case kRegexpQuest:
481       case kRegexpRepeat:
482       case kRegexpCapture:
483         a2 = a->sub()[0];
484         b2 = b->sub()[0];
485         if (!TopEqual(a2, b2))
486           return false;
487         // Really:
488         //   stk.push_back(a2);
489         //   stk.push_back(b2);
490         //   break;
491         // but faster to assign directly and loop.
492         a = a2;
493         b = b2;
494         continue;
495     }
496 
497     size_t n = stk.size();
498     if (n == 0)
499       break;
500 
501     ABSL_DCHECK_GE(n, size_t{2});
502     a = stk[n-2];
503     b = stk[n-1];
504     stk.resize(n-2);
505   }
506 
507   return true;
508 }
509 
510 // Keep in sync with enum RegexpStatusCode in regexp.h
511 static const char *kErrorStrings[] = {
512   "no error",
513   "unexpected error",
514   "invalid escape sequence",
515   "invalid character class",
516   "invalid character class range",
517   "missing ]",
518   "missing )",
519   "unexpected )",
520   "trailing \\",
521   "no argument for repetition operator",
522   "invalid repetition size",
523   "bad repetition operator",
524   "invalid perl operator",
525   "invalid UTF-8",
526   "invalid named capture group",
527 };
528 
CodeText(enum RegexpStatusCode code)529 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
530   if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
531     code = kRegexpInternalError;
532   return kErrorStrings[code];
533 }
534 
Text() const535 std::string RegexpStatus::Text() const {
536   if (error_arg_.empty())
537     return CodeText(code_);
538   std::string s;
539   s.append(CodeText(code_));
540   s.append(": ");
541   s.append(error_arg_.data(), error_arg_.size());
542   return s;
543 }
544 
Copy(const RegexpStatus & status)545 void RegexpStatus::Copy(const RegexpStatus& status) {
546   code_ = status.code_;
547   error_arg_ = status.error_arg_;
548 }
549 
550 typedef int Ignored;  // Walker<void> doesn't exist
551 
552 // Walker subclass to count capturing parens in regexp.
553 class NumCapturesWalker : public Regexp::Walker<Ignored> {
554  public:
NumCapturesWalker()555   NumCapturesWalker() : ncapture_(0) {}
ncapture()556   int ncapture() { return ncapture_; }
557 
PreVisit(Regexp * re,Ignored ignored,bool * stop)558   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
559     if (re->op() == kRegexpCapture)
560       ncapture_++;
561     return ignored;
562   }
563 
ShortVisit(Regexp * re,Ignored ignored)564   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
565     // Should never be called: we use Walk(), not WalkExponential().
566 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
567     ABSL_LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
568 #endif
569     return ignored;
570   }
571 
572  private:
573   int ncapture_;
574 
575   NumCapturesWalker(const NumCapturesWalker&) = delete;
576   NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
577 };
578 
NumCaptures()579 int Regexp::NumCaptures() {
580   NumCapturesWalker w;
581   w.Walk(this, 0);
582   return w.ncapture();
583 }
584 
585 // Walker class to build map of named capture groups and their indices.
586 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
587  public:
NamedCapturesWalker()588   NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()589   ~NamedCapturesWalker() { delete map_; }
590 
TakeMap()591   std::map<std::string, int>* TakeMap() {
592     std::map<std::string, int>* m = map_;
593     map_ = NULL;
594     return m;
595   }
596 
PreVisit(Regexp * re,Ignored ignored,bool * stop)597   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
598     if (re->op() == kRegexpCapture && re->name() != NULL) {
599       // Allocate map once we find a name.
600       if (map_ == NULL)
601         map_ = new std::map<std::string, int>;
602 
603       // Record first occurrence of each name.
604       // (The rule is that if you have the same name
605       // multiple times, only the leftmost one counts.)
606       map_->insert({*re->name(), re->cap()});
607     }
608     return ignored;
609   }
610 
ShortVisit(Regexp * re,Ignored ignored)611   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
612     // Should never be called: we use Walk(), not WalkExponential().
613 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
614     ABSL_LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
615 #endif
616     return ignored;
617   }
618 
619  private:
620   std::map<std::string, int>* map_;
621 
622   NamedCapturesWalker(const NamedCapturesWalker&) = delete;
623   NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
624 };
625 
NamedCaptures()626 std::map<std::string, int>* Regexp::NamedCaptures() {
627   NamedCapturesWalker w;
628   w.Walk(this, 0);
629   return w.TakeMap();
630 }
631 
632 // Walker class to build map from capture group indices to their names.
633 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
634  public:
CaptureNamesWalker()635   CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()636   ~CaptureNamesWalker() { delete map_; }
637 
TakeMap()638   std::map<int, std::string>* TakeMap() {
639     std::map<int, std::string>* m = map_;
640     map_ = NULL;
641     return m;
642   }
643 
PreVisit(Regexp * re,Ignored ignored,bool * stop)644   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
645     if (re->op() == kRegexpCapture && re->name() != NULL) {
646       // Allocate map once we find a name.
647       if (map_ == NULL)
648         map_ = new std::map<int, std::string>;
649 
650       (*map_)[re->cap()] = *re->name();
651     }
652     return ignored;
653   }
654 
ShortVisit(Regexp * re,Ignored ignored)655   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
656     // Should never be called: we use Walk(), not WalkExponential().
657 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
658     ABSL_LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
659 #endif
660     return ignored;
661   }
662 
663  private:
664   std::map<int, std::string>* map_;
665 
666   CaptureNamesWalker(const CaptureNamesWalker&) = delete;
667   CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
668 };
669 
CaptureNames()670 std::map<int, std::string>* Regexp::CaptureNames() {
671   CaptureNamesWalker w;
672   w.Walk(this, 0);
673   return w.TakeMap();
674 }
675 
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)676 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
677                          std::string* bytes) {
678   if (latin1) {
679     bytes->resize(nrunes);
680     for (int i = 0; i < nrunes; i++)
681       (*bytes)[i] = static_cast<char>(runes[i]);
682   } else {
683     bytes->resize(nrunes * UTFmax);  // worst case
684     char* p = &(*bytes)[0];
685     for (int i = 0; i < nrunes; i++)
686       p += runetochar(p, &runes[i]);
687     bytes->resize(p - &(*bytes)[0]);
688     bytes->shrink_to_fit();
689   }
690 }
691 
692 // Determines whether regexp matches must be anchored
693 // with a fixed string prefix.  If so, returns the prefix and
694 // the regexp that remains after the prefix.  The prefix might
695 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)696 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
697                             Regexp** suffix) {
698   prefix->clear();
699   *foldcase = false;
700   *suffix = NULL;
701 
702   // No need for a walker: the regexp must be of the form
703   // 1. some number of ^ anchors
704   // 2. a literal char or string
705   // 3. the rest
706   if (op_ != kRegexpConcat)
707     return false;
708   int i = 0;
709   while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
710     i++;
711   if (i == 0 || i >= nsub_)
712     return false;
713   Regexp* re = sub()[i];
714   if (re->op_ != kRegexpLiteral &&
715       re->op_ != kRegexpLiteralString)
716     return false;
717   i++;
718   if (i < nsub_) {
719     for (int j = i; j < nsub_; j++)
720       sub()[j]->Incref();
721     *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
722   } else {
723     *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
724   }
725 
726   bool latin1 = (re->parse_flags() & Latin1) != 0;
727   Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
728   int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
729   ConvertRunesToBytes(latin1, runes, nrunes, prefix);
730   *foldcase = (re->parse_flags() & FoldCase) != 0;
731   return true;
732 }
733 
734 // Determines whether regexp matches must be unanchored
735 // with a fixed string prefix.  If so, returns the prefix.
736 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)737 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
738   prefix->clear();
739   *foldcase = false;
740 
741   // No need for a walker: the regexp must either begin with or be
742   // a literal char or string. We "see through" capturing groups,
743   // but make no effort to glue multiple prefix fragments together.
744   Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
745   while (re->op_ == kRegexpCapture) {
746     re = re->sub()[0];
747     if (re->op_ == kRegexpConcat && re->nsub_ > 0)
748       re = re->sub()[0];
749   }
750   if (re->op_ != kRegexpLiteral &&
751       re->op_ != kRegexpLiteralString)
752     return false;
753 
754   bool latin1 = (re->parse_flags() & Latin1) != 0;
755   Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
756   int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
757   ConvertRunesToBytes(latin1, runes, nrunes, prefix);
758   *foldcase = (re->parse_flags() & FoldCase) != 0;
759   return true;
760 }
761 
762 // Character class builder is a balanced binary tree (STL set)
763 // containing non-overlapping, non-abutting RuneRanges.
764 // The less-than operator used in the tree treats two
765 // ranges as equal if they overlap at all, so that
766 // lookups for a particular Rune are possible.
767 
CharClassBuilder()768 CharClassBuilder::CharClassBuilder() {
769   nrunes_ = 0;
770   upper_ = 0;
771   lower_ = 0;
772 }
773 
774 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)775 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
776   if (hi < lo)
777     return false;
778 
779   if (lo <= 'z' && hi >= 'A') {
780     // Overlaps some alpha, maybe not all.
781     // Update bitmaps telling which ASCII letters are in the set.
782     Rune lo1 = std::max<Rune>(lo, 'A');
783     Rune hi1 = std::min<Rune>(hi, 'Z');
784     if (lo1 <= hi1)
785       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
786 
787     lo1 = std::max<Rune>(lo, 'a');
788     hi1 = std::min<Rune>(hi, 'z');
789     if (lo1 <= hi1)
790       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
791   }
792 
793   {  // Check whether lo, hi is already in the class.
794     iterator it = ranges_.find(RuneRange(lo, lo));
795     if (it != end() && it->lo <= lo && hi <= it->hi)
796       return false;
797   }
798 
799   // Look for a range abutting lo on the left.
800   // If it exists, take it out and increase our range.
801   if (lo > 0) {
802     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
803     if (it != end()) {
804       lo = it->lo;
805       if (it->hi > hi)
806         hi = it->hi;
807       nrunes_ -= it->hi - it->lo + 1;
808       ranges_.erase(it);
809     }
810   }
811 
812   // Look for a range abutting hi on the right.
813   // If it exists, take it out and increase our range.
814   if (hi < Runemax) {
815     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
816     if (it != end()) {
817       hi = it->hi;
818       nrunes_ -= it->hi - it->lo + 1;
819       ranges_.erase(it);
820     }
821   }
822 
823   // Look for ranges between lo and hi.  Take them out.
824   // This is only safe because the set has no overlapping ranges.
825   // We've already removed any ranges abutting lo and hi, so
826   // any that overlap [lo, hi] must be contained within it.
827   for (;;) {
828     iterator it = ranges_.find(RuneRange(lo, hi));
829     if (it == end())
830       break;
831     nrunes_ -= it->hi - it->lo + 1;
832     ranges_.erase(it);
833   }
834 
835   // Finally, add [lo, hi].
836   nrunes_ += hi - lo + 1;
837   ranges_.insert(RuneRange(lo, hi));
838   return true;
839 }
840 
AddCharClass(CharClassBuilder * cc)841 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
842   for (iterator it = cc->begin(); it != cc->end(); ++it)
843     AddRange(it->lo, it->hi);
844 }
845 
Contains(Rune r)846 bool CharClassBuilder::Contains(Rune r) {
847   return ranges_.find(RuneRange(r, r)) != end();
848 }
849 
850 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()851 bool CharClassBuilder::FoldsASCII() {
852   return ((upper_ ^ lower_) & AlphaMask) == 0;
853 }
854 
Copy()855 CharClassBuilder* CharClassBuilder::Copy() {
856   CharClassBuilder* cc = new CharClassBuilder;
857   for (iterator it = begin(); it != end(); ++it)
858     cc->ranges_.insert(RuneRange(it->lo, it->hi));
859   cc->upper_ = upper_;
860   cc->lower_ = lower_;
861   cc->nrunes_ = nrunes_;
862   return cc;
863 }
864 
865 
866 
RemoveAbove(Rune r)867 void CharClassBuilder::RemoveAbove(Rune r) {
868   if (r >= Runemax)
869     return;
870 
871   if (r < 'z') {
872     if (r < 'a')
873       lower_ = 0;
874     else
875       lower_ &= AlphaMask >> ('z' - r);
876   }
877 
878   if (r < 'Z') {
879     if (r < 'A')
880       upper_ = 0;
881     else
882       upper_ &= AlphaMask >> ('Z' - r);
883   }
884 
885   for (;;) {
886 
887     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
888     if (it == end())
889       break;
890     RuneRange rr = *it;
891     ranges_.erase(it);
892     nrunes_ -= rr.hi - rr.lo + 1;
893     if (rr.lo <= r) {
894       rr.hi = r;
895       ranges_.insert(rr);
896       nrunes_ += rr.hi - rr.lo + 1;
897     }
898   }
899 }
900 
Negate()901 void CharClassBuilder::Negate() {
902   // Build up negation and then copy in.
903   // Could edit ranges in place, but C++ won't let me.
904   std::vector<RuneRange> v;
905   v.reserve(ranges_.size() + 1);
906 
907   // In negation, first range begins at 0, unless
908   // the current class begins at 0.
909   iterator it = begin();
910   if (it == end()) {
911     v.push_back(RuneRange(0, Runemax));
912   } else {
913     int nextlo = 0;
914     if (it->lo == 0) {
915       nextlo = it->hi + 1;
916       ++it;
917     }
918     for (; it != end(); ++it) {
919       v.push_back(RuneRange(nextlo, it->lo - 1));
920       nextlo = it->hi + 1;
921     }
922     if (nextlo <= Runemax)
923       v.push_back(RuneRange(nextlo, Runemax));
924   }
925 
926   ranges_.clear();
927   for (size_t i = 0; i < v.size(); i++)
928     ranges_.insert(v[i]);
929 
930   upper_ = AlphaMask & ~upper_;
931   lower_ = AlphaMask & ~lower_;
932   nrunes_ = Runemax+1 - nrunes_;
933 }
934 
935 // Character class is a sorted list of ranges.
936 // The ranges are allocated in the same block as the header,
937 // necessitating a special allocator and Delete method.
938 
New(size_t maxranges)939 CharClass* CharClass::New(size_t maxranges) {
940   CharClass* cc;
941   uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
942   cc = reinterpret_cast<CharClass*>(data);
943   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
944   cc->nranges_ = 0;
945   cc->folds_ascii_ = false;
946   cc->nrunes_ = 0;
947   return cc;
948 }
949 
Delete()950 void CharClass::Delete() {
951   uint8_t* data = reinterpret_cast<uint8_t*>(this);
952   delete[] data;
953 }
954 
Negate()955 CharClass* CharClass::Negate() {
956   CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
957   cc->folds_ascii_ = folds_ascii_;
958   cc->nrunes_ = Runemax + 1 - nrunes_;
959   int n = 0;
960   int nextlo = 0;
961   for (CharClass::iterator it = begin(); it != end(); ++it) {
962     if (it->lo == nextlo) {
963       nextlo = it->hi + 1;
964     } else {
965       cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
966       nextlo = it->hi + 1;
967     }
968   }
969   if (nextlo <= Runemax)
970     cc->ranges_[n++] = RuneRange(nextlo, Runemax);
971   cc->nranges_ = n;
972   return cc;
973 }
974 
Contains(Rune r) const975 bool CharClass::Contains(Rune r) const {
976   RuneRange* rr = ranges_;
977   int n = nranges_;
978   while (n > 0) {
979     int m = n/2;
980     if (rr[m].hi < r) {
981       rr += m+1;
982       n -= m+1;
983     } else if (r < rr[m].lo) {
984       n = m;
985     } else {  // rr[m].lo <= r && r <= rr[m].hi
986       return true;
987     }
988   }
989   return false;
990 }
991 
GetCharClass()992 CharClass* CharClassBuilder::GetCharClass() {
993   CharClass* cc = CharClass::New(ranges_.size());
994   int n = 0;
995   for (iterator it = begin(); it != end(); ++it)
996     cc->ranges_[n++] = *it;
997   cc->nranges_ = n;
998   ABSL_DCHECK_LE(n, static_cast<int>(ranges_.size()));
999   cc->nrunes_ = nrunes_;
1000   cc->folds_ascii_ = FoldsASCII();
1001   return cc;
1002 }
1003 
1004 }  // namespace re2
1005