1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "re2/regexp.h"
9
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13
14 #include <algorithm>
15 #include <map>
16 #include <string>
17 #include <vector>
18
19 #include "absl/base/call_once.h"
20 #include "absl/base/macros.h"
21 #include "absl/container/flat_hash_map.h"
22 #include "absl/log/absl_check.h"
23 #include "absl/log/absl_log.h"
24 #include "absl/synchronization/mutex.h"
25 #include "re2/pod_array.h"
26 #include "re2/walker-inl.h"
27 #include "util/utf.h"
28
29 namespace re2 {
30
31 // Constructor. Allocates vectors as appropriate for operator.
Regexp(RegexpOp op,ParseFlags parse_flags)32 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
33 : op_(static_cast<uint8_t>(op)),
34 simple_(false),
35 parse_flags_(static_cast<uint16_t>(parse_flags)),
36 ref_(1),
37 nsub_(0),
38 down_(NULL) {
39 subone_ = NULL;
40 memset(the_union_, 0, sizeof the_union_);
41 }
42
43 // Destructor. Assumes already cleaned up children.
44 // Private: use Decref() instead of delete to destroy Regexps.
45 // Can't call Decref on the sub-Regexps here because
46 // that could cause arbitrarily deep recursion, so
47 // required Decref() to have handled them for us.
~Regexp()48 Regexp::~Regexp() {
49 if (nsub_ > 0)
50 ABSL_LOG(DFATAL) << "Regexp not destroyed.";
51
52 switch (op_) {
53 default:
54 break;
55 case kRegexpCapture:
56 delete name_;
57 break;
58 case kRegexpLiteralString:
59 delete[] runes_;
60 break;
61 case kRegexpCharClass:
62 if (cc_)
63 cc_->Delete();
64 delete ccb_;
65 break;
66 }
67 }
68
69 // If it's possible to destroy this regexp without recurring,
70 // do so and return true. Else return false.
QuickDestroy()71 bool Regexp::QuickDestroy() {
72 if (nsub_ == 0) {
73 delete this;
74 return true;
75 }
76 return false;
77 }
78
79 // Similar to EmptyStorage in re2.cc.
80 struct RefStorage {
81 absl::Mutex ref_mutex;
82 absl::flat_hash_map<Regexp*, int> ref_map;
83 };
84 alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
85
ref_mutex()86 static inline absl::Mutex* ref_mutex() {
87 return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
88 }
89
ref_map()90 static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
91 return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
92 }
93
Ref()94 int Regexp::Ref() {
95 if (ref_ < kMaxRef)
96 return ref_;
97
98 absl::MutexLock l(ref_mutex());
99 return (*ref_map())[this];
100 }
101
102 // Increments reference count, returns object as convenience.
Incref()103 Regexp* Regexp::Incref() {
104 if (ref_ >= kMaxRef-1) {
105 static absl::once_flag ref_once;
106 absl::call_once(ref_once, []() {
107 (void) new (ref_storage) RefStorage;
108 });
109
110 // Store ref count in overflow map.
111 absl::MutexLock l(ref_mutex());
112 if (ref_ == kMaxRef) {
113 // already overflowed
114 (*ref_map())[this]++;
115 } else {
116 // overflowing now
117 (*ref_map())[this] = kMaxRef;
118 ref_ = kMaxRef;
119 }
120 return this;
121 }
122
123 ref_++;
124 return this;
125 }
126
127 // Decrements reference count and deletes this object if count reaches 0.
Decref()128 void Regexp::Decref() {
129 if (ref_ == kMaxRef) {
130 // Ref count is stored in overflow map.
131 absl::MutexLock l(ref_mutex());
132 int r = (*ref_map())[this] - 1;
133 if (r < kMaxRef) {
134 ref_ = static_cast<uint16_t>(r);
135 ref_map()->erase(this);
136 } else {
137 (*ref_map())[this] = r;
138 }
139 return;
140 }
141 ref_--;
142 if (ref_ == 0)
143 Destroy();
144 }
145
146 // Deletes this object; ref count has count reached 0.
Destroy()147 void Regexp::Destroy() {
148 if (QuickDestroy())
149 return;
150
151 // Handle recursive Destroy with explicit stack
152 // to avoid arbitrarily deep recursion on process stack [sigh].
153 down_ = NULL;
154 Regexp* stack = this;
155 while (stack != NULL) {
156 Regexp* re = stack;
157 stack = re->down_;
158 if (re->ref_ != 0)
159 ABSL_LOG(DFATAL) << "Bad reference count " << re->ref_;
160 if (re->nsub_ > 0) {
161 Regexp** subs = re->sub();
162 for (int i = 0; i < re->nsub_; i++) {
163 Regexp* sub = subs[i];
164 if (sub == NULL)
165 continue;
166 if (sub->ref_ == kMaxRef)
167 sub->Decref();
168 else
169 --sub->ref_;
170 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
171 sub->down_ = stack;
172 stack = sub;
173 }
174 }
175 if (re->nsub_ > 1)
176 delete[] subs;
177 re->nsub_ = 0;
178 }
179 delete re;
180 }
181 }
182
AddRuneToString(Rune r)183 void Regexp::AddRuneToString(Rune r) {
184 ABSL_DCHECK(op_ == kRegexpLiteralString);
185 if (nrunes_ == 0) {
186 // start with 8
187 runes_ = new Rune[8];
188 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
189 // double on powers of two
190 Rune *old = runes_;
191 runes_ = new Rune[nrunes_ * 2];
192 for (int i = 0; i < nrunes_; i++)
193 runes_[i] = old[i];
194 delete[] old;
195 }
196
197 runes_[nrunes_++] = r;
198 }
199
HaveMatch(int match_id,ParseFlags flags)200 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
201 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
202 re->match_id_ = match_id;
203 return re;
204 }
205
StarPlusOrQuest(RegexpOp op,Regexp * sub,ParseFlags flags)206 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
207 // Squash **, ++ and ??.
208 if (op == sub->op() && flags == sub->parse_flags())
209 return sub;
210
211 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
212 // op is Star/Plus/Quest, we just have to check that sub->op() is too.
213 if ((sub->op() == kRegexpStar ||
214 sub->op() == kRegexpPlus ||
215 sub->op() == kRegexpQuest) &&
216 flags == sub->parse_flags()) {
217 // If sub is Star, no need to rewrite it.
218 if (sub->op() == kRegexpStar)
219 return sub;
220
221 // Rewrite sub to Star.
222 Regexp* re = new Regexp(kRegexpStar, flags);
223 re->AllocSub(1);
224 re->sub()[0] = sub->sub()[0]->Incref();
225 sub->Decref(); // We didn't consume the reference after all.
226 return re;
227 }
228
229 Regexp* re = new Regexp(op, flags);
230 re->AllocSub(1);
231 re->sub()[0] = sub;
232 return re;
233 }
234
Plus(Regexp * sub,ParseFlags flags)235 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
236 return StarPlusOrQuest(kRegexpPlus, sub, flags);
237 }
238
Star(Regexp * sub,ParseFlags flags)239 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
240 return StarPlusOrQuest(kRegexpStar, sub, flags);
241 }
242
Quest(Regexp * sub,ParseFlags flags)243 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
244 return StarPlusOrQuest(kRegexpQuest, sub, flags);
245 }
246
ConcatOrAlternate(RegexpOp op,Regexp ** sub,int nsub,ParseFlags flags,bool can_factor)247 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
248 ParseFlags flags, bool can_factor) {
249 if (nsub == 1)
250 return sub[0];
251
252 if (nsub == 0) {
253 if (op == kRegexpAlternate)
254 return new Regexp(kRegexpNoMatch, flags);
255 else
256 return new Regexp(kRegexpEmptyMatch, flags);
257 }
258
259 PODArray<Regexp*> subcopy;
260 if (op == kRegexpAlternate && can_factor) {
261 // Going to edit sub; make a copy so we don't step on caller.
262 subcopy = PODArray<Regexp*>(nsub);
263 memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
264 sub = subcopy.data();
265 nsub = FactorAlternation(sub, nsub, flags);
266 if (nsub == 1) {
267 Regexp* re = sub[0];
268 return re;
269 }
270 }
271
272 if (nsub > kMaxNsub) {
273 // Too many subexpressions to fit in a single Regexp.
274 // Make a two-level tree. Two levels gets us to 65535^2.
275 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
276 Regexp* re = new Regexp(op, flags);
277 re->AllocSub(nbigsub);
278 Regexp** subs = re->sub();
279 for (int i = 0; i < nbigsub - 1; i++)
280 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
281 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
282 nsub - (nbigsub-1)*kMaxNsub, flags,
283 false);
284 return re;
285 }
286
287 Regexp* re = new Regexp(op, flags);
288 re->AllocSub(nsub);
289 Regexp** subs = re->sub();
290 for (int i = 0; i < nsub; i++)
291 subs[i] = sub[i];
292 return re;
293 }
294
Concat(Regexp ** sub,int nsub,ParseFlags flags)295 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
296 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
297 }
298
Alternate(Regexp ** sub,int nsub,ParseFlags flags)299 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
300 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
301 }
302
AlternateNoFactor(Regexp ** sub,int nsub,ParseFlags flags)303 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
304 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
305 }
306
Capture(Regexp * sub,ParseFlags flags,int cap)307 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
308 Regexp* re = new Regexp(kRegexpCapture, flags);
309 re->AllocSub(1);
310 re->sub()[0] = sub;
311 re->cap_ = cap;
312 return re;
313 }
314
Repeat(Regexp * sub,ParseFlags flags,int min,int max)315 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
316 Regexp* re = new Regexp(kRegexpRepeat, flags);
317 re->AllocSub(1);
318 re->sub()[0] = sub;
319 re->min_ = min;
320 re->max_ = max;
321 return re;
322 }
323
NewLiteral(Rune rune,ParseFlags flags)324 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
325 Regexp* re = new Regexp(kRegexpLiteral, flags);
326 re->rune_ = rune;
327 return re;
328 }
329
LiteralString(Rune * runes,int nrunes,ParseFlags flags)330 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
331 if (nrunes <= 0)
332 return new Regexp(kRegexpEmptyMatch, flags);
333 if (nrunes == 1)
334 return NewLiteral(runes[0], flags);
335 Regexp* re = new Regexp(kRegexpLiteralString, flags);
336 for (int i = 0; i < nrunes; i++)
337 re->AddRuneToString(runes[i]);
338 return re;
339 }
340
NewCharClass(CharClass * cc,ParseFlags flags)341 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
342 Regexp* re = new Regexp(kRegexpCharClass, flags);
343 re->cc_ = cc;
344 return re;
345 }
346
Swap(Regexp * that)347 void Regexp::Swap(Regexp* that) {
348 // Regexp is not trivially copyable, so we cannot freely copy it with
349 // memmove(3), but swapping objects like so is safe for our purposes.
350 char tmp[sizeof *this];
351 void* vthis = reinterpret_cast<void*>(this);
352 void* vthat = reinterpret_cast<void*>(that);
353 memmove(tmp, vthis, sizeof *this);
354 memmove(vthis, vthat, sizeof *this);
355 memmove(vthat, tmp, sizeof *this);
356 }
357
358 // Tests equality of all top-level structure but not subregexps.
TopEqual(Regexp * a,Regexp * b)359 static bool TopEqual(Regexp* a, Regexp* b) {
360 if (a->op() != b->op())
361 return false;
362
363 switch (a->op()) {
364 case kRegexpNoMatch:
365 case kRegexpEmptyMatch:
366 case kRegexpAnyChar:
367 case kRegexpAnyByte:
368 case kRegexpBeginLine:
369 case kRegexpEndLine:
370 case kRegexpWordBoundary:
371 case kRegexpNoWordBoundary:
372 case kRegexpBeginText:
373 return true;
374
375 case kRegexpEndText:
376 // The parse flags remember whether it's \z or (?-m:$),
377 // which matters when testing against PCRE.
378 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
379
380 case kRegexpLiteral:
381 return a->rune() == b->rune() &&
382 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
383
384 case kRegexpLiteralString:
385 return a->nrunes() == b->nrunes() &&
386 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
387 memcmp(a->runes(), b->runes(),
388 a->nrunes() * sizeof a->runes()[0]) == 0;
389
390 case kRegexpAlternate:
391 case kRegexpConcat:
392 return a->nsub() == b->nsub();
393
394 case kRegexpStar:
395 case kRegexpPlus:
396 case kRegexpQuest:
397 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
398
399 case kRegexpRepeat:
400 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
401 a->min() == b->min() &&
402 a->max() == b->max();
403
404 case kRegexpCapture:
405 if (a->name() == NULL || b->name() == NULL) {
406 // One pointer is null, so the other pointer should also be null.
407 return a->cap() == b->cap() && a->name() == b->name();
408 } else {
409 // Neither pointer is null, so compare the pointees for equality.
410 return a->cap() == b->cap() && *a->name() == *b->name();
411 }
412
413 case kRegexpHaveMatch:
414 return a->match_id() == b->match_id();
415
416 case kRegexpCharClass: {
417 CharClass* acc = a->cc();
418 CharClass* bcc = b->cc();
419 return acc->size() == bcc->size() &&
420 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
421 memcmp(acc->begin(), bcc->begin(),
422 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
423 }
424 }
425
426 ABSL_LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
427 return 0;
428 }
429
Equal(Regexp * a,Regexp * b)430 bool Regexp::Equal(Regexp* a, Regexp* b) {
431 if (a == NULL || b == NULL)
432 return a == b;
433
434 if (!TopEqual(a, b))
435 return false;
436
437 // Fast path:
438 // return without allocating vector if there are no subregexps.
439 switch (a->op()) {
440 case kRegexpAlternate:
441 case kRegexpConcat:
442 case kRegexpStar:
443 case kRegexpPlus:
444 case kRegexpQuest:
445 case kRegexpRepeat:
446 case kRegexpCapture:
447 break;
448
449 default:
450 return true;
451 }
452
453 // Committed to doing real work.
454 // The stack (vector) has pairs of regexps waiting to
455 // be compared. The regexps are only equal if
456 // all the pairs end up being equal.
457 std::vector<Regexp*> stk;
458
459 for (;;) {
460 // Invariant: TopEqual(a, b) == true.
461 Regexp* a2;
462 Regexp* b2;
463 switch (a->op()) {
464 default:
465 break;
466 case kRegexpAlternate:
467 case kRegexpConcat:
468 for (int i = 0; i < a->nsub(); i++) {
469 a2 = a->sub()[i];
470 b2 = b->sub()[i];
471 if (!TopEqual(a2, b2))
472 return false;
473 stk.push_back(a2);
474 stk.push_back(b2);
475 }
476 break;
477
478 case kRegexpStar:
479 case kRegexpPlus:
480 case kRegexpQuest:
481 case kRegexpRepeat:
482 case kRegexpCapture:
483 a2 = a->sub()[0];
484 b2 = b->sub()[0];
485 if (!TopEqual(a2, b2))
486 return false;
487 // Really:
488 // stk.push_back(a2);
489 // stk.push_back(b2);
490 // break;
491 // but faster to assign directly and loop.
492 a = a2;
493 b = b2;
494 continue;
495 }
496
497 size_t n = stk.size();
498 if (n == 0)
499 break;
500
501 ABSL_DCHECK_GE(n, size_t{2});
502 a = stk[n-2];
503 b = stk[n-1];
504 stk.resize(n-2);
505 }
506
507 return true;
508 }
509
510 // Keep in sync with enum RegexpStatusCode in regexp.h
511 static const char *kErrorStrings[] = {
512 "no error",
513 "unexpected error",
514 "invalid escape sequence",
515 "invalid character class",
516 "invalid character class range",
517 "missing ]",
518 "missing )",
519 "unexpected )",
520 "trailing \\",
521 "no argument for repetition operator",
522 "invalid repetition size",
523 "bad repetition operator",
524 "invalid perl operator",
525 "invalid UTF-8",
526 "invalid named capture group",
527 };
528
CodeText(enum RegexpStatusCode code)529 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
530 if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
531 code = kRegexpInternalError;
532 return kErrorStrings[code];
533 }
534
Text() const535 std::string RegexpStatus::Text() const {
536 if (error_arg_.empty())
537 return CodeText(code_);
538 std::string s;
539 s.append(CodeText(code_));
540 s.append(": ");
541 s.append(error_arg_.data(), error_arg_.size());
542 return s;
543 }
544
Copy(const RegexpStatus & status)545 void RegexpStatus::Copy(const RegexpStatus& status) {
546 code_ = status.code_;
547 error_arg_ = status.error_arg_;
548 }
549
550 typedef int Ignored; // Walker<void> doesn't exist
551
552 // Walker subclass to count capturing parens in regexp.
553 class NumCapturesWalker : public Regexp::Walker<Ignored> {
554 public:
NumCapturesWalker()555 NumCapturesWalker() : ncapture_(0) {}
ncapture()556 int ncapture() { return ncapture_; }
557
PreVisit(Regexp * re,Ignored ignored,bool * stop)558 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
559 if (re->op() == kRegexpCapture)
560 ncapture_++;
561 return ignored;
562 }
563
ShortVisit(Regexp * re,Ignored ignored)564 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
565 // Should never be called: we use Walk(), not WalkExponential().
566 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
567 ABSL_LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
568 #endif
569 return ignored;
570 }
571
572 private:
573 int ncapture_;
574
575 NumCapturesWalker(const NumCapturesWalker&) = delete;
576 NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
577 };
578
NumCaptures()579 int Regexp::NumCaptures() {
580 NumCapturesWalker w;
581 w.Walk(this, 0);
582 return w.ncapture();
583 }
584
585 // Walker class to build map of named capture groups and their indices.
586 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
587 public:
NamedCapturesWalker()588 NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker()589 ~NamedCapturesWalker() { delete map_; }
590
TakeMap()591 std::map<std::string, int>* TakeMap() {
592 std::map<std::string, int>* m = map_;
593 map_ = NULL;
594 return m;
595 }
596
PreVisit(Regexp * re,Ignored ignored,bool * stop)597 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
598 if (re->op() == kRegexpCapture && re->name() != NULL) {
599 // Allocate map once we find a name.
600 if (map_ == NULL)
601 map_ = new std::map<std::string, int>;
602
603 // Record first occurrence of each name.
604 // (The rule is that if you have the same name
605 // multiple times, only the leftmost one counts.)
606 map_->insert({*re->name(), re->cap()});
607 }
608 return ignored;
609 }
610
ShortVisit(Regexp * re,Ignored ignored)611 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
612 // Should never be called: we use Walk(), not WalkExponential().
613 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
614 ABSL_LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
615 #endif
616 return ignored;
617 }
618
619 private:
620 std::map<std::string, int>* map_;
621
622 NamedCapturesWalker(const NamedCapturesWalker&) = delete;
623 NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
624 };
625
NamedCaptures()626 std::map<std::string, int>* Regexp::NamedCaptures() {
627 NamedCapturesWalker w;
628 w.Walk(this, 0);
629 return w.TakeMap();
630 }
631
632 // Walker class to build map from capture group indices to their names.
633 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
634 public:
CaptureNamesWalker()635 CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker()636 ~CaptureNamesWalker() { delete map_; }
637
TakeMap()638 std::map<int, std::string>* TakeMap() {
639 std::map<int, std::string>* m = map_;
640 map_ = NULL;
641 return m;
642 }
643
PreVisit(Regexp * re,Ignored ignored,bool * stop)644 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
645 if (re->op() == kRegexpCapture && re->name() != NULL) {
646 // Allocate map once we find a name.
647 if (map_ == NULL)
648 map_ = new std::map<int, std::string>;
649
650 (*map_)[re->cap()] = *re->name();
651 }
652 return ignored;
653 }
654
ShortVisit(Regexp * re,Ignored ignored)655 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
656 // Should never be called: we use Walk(), not WalkExponential().
657 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
658 ABSL_LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
659 #endif
660 return ignored;
661 }
662
663 private:
664 std::map<int, std::string>* map_;
665
666 CaptureNamesWalker(const CaptureNamesWalker&) = delete;
667 CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
668 };
669
CaptureNames()670 std::map<int, std::string>* Regexp::CaptureNames() {
671 CaptureNamesWalker w;
672 w.Walk(this, 0);
673 return w.TakeMap();
674 }
675
ConvertRunesToBytes(bool latin1,Rune * runes,int nrunes,std::string * bytes)676 void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
677 std::string* bytes) {
678 if (latin1) {
679 bytes->resize(nrunes);
680 for (int i = 0; i < nrunes; i++)
681 (*bytes)[i] = static_cast<char>(runes[i]);
682 } else {
683 bytes->resize(nrunes * UTFmax); // worst case
684 char* p = &(*bytes)[0];
685 for (int i = 0; i < nrunes; i++)
686 p += runetochar(p, &runes[i]);
687 bytes->resize(p - &(*bytes)[0]);
688 bytes->shrink_to_fit();
689 }
690 }
691
692 // Determines whether regexp matches must be anchored
693 // with a fixed string prefix. If so, returns the prefix and
694 // the regexp that remains after the prefix. The prefix might
695 // be ASCII case-insensitive.
RequiredPrefix(std::string * prefix,bool * foldcase,Regexp ** suffix)696 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
697 Regexp** suffix) {
698 prefix->clear();
699 *foldcase = false;
700 *suffix = NULL;
701
702 // No need for a walker: the regexp must be of the form
703 // 1. some number of ^ anchors
704 // 2. a literal char or string
705 // 3. the rest
706 if (op_ != kRegexpConcat)
707 return false;
708 int i = 0;
709 while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
710 i++;
711 if (i == 0 || i >= nsub_)
712 return false;
713 Regexp* re = sub()[i];
714 if (re->op_ != kRegexpLiteral &&
715 re->op_ != kRegexpLiteralString)
716 return false;
717 i++;
718 if (i < nsub_) {
719 for (int j = i; j < nsub_; j++)
720 sub()[j]->Incref();
721 *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
722 } else {
723 *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
724 }
725
726 bool latin1 = (re->parse_flags() & Latin1) != 0;
727 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
728 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
729 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
730 *foldcase = (re->parse_flags() & FoldCase) != 0;
731 return true;
732 }
733
734 // Determines whether regexp matches must be unanchored
735 // with a fixed string prefix. If so, returns the prefix.
736 // The prefix might be ASCII case-insensitive.
RequiredPrefixForAccel(std::string * prefix,bool * foldcase)737 bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
738 prefix->clear();
739 *foldcase = false;
740
741 // No need for a walker: the regexp must either begin with or be
742 // a literal char or string. We "see through" capturing groups,
743 // but make no effort to glue multiple prefix fragments together.
744 Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
745 while (re->op_ == kRegexpCapture) {
746 re = re->sub()[0];
747 if (re->op_ == kRegexpConcat && re->nsub_ > 0)
748 re = re->sub()[0];
749 }
750 if (re->op_ != kRegexpLiteral &&
751 re->op_ != kRegexpLiteralString)
752 return false;
753
754 bool latin1 = (re->parse_flags() & Latin1) != 0;
755 Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
756 int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
757 ConvertRunesToBytes(latin1, runes, nrunes, prefix);
758 *foldcase = (re->parse_flags() & FoldCase) != 0;
759 return true;
760 }
761
762 // Character class builder is a balanced binary tree (STL set)
763 // containing non-overlapping, non-abutting RuneRanges.
764 // The less-than operator used in the tree treats two
765 // ranges as equal if they overlap at all, so that
766 // lookups for a particular Rune are possible.
767
CharClassBuilder()768 CharClassBuilder::CharClassBuilder() {
769 nrunes_ = 0;
770 upper_ = 0;
771 lower_ = 0;
772 }
773
774 // Add lo-hi to the class; return whether class got bigger.
AddRange(Rune lo,Rune hi)775 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
776 if (hi < lo)
777 return false;
778
779 if (lo <= 'z' && hi >= 'A') {
780 // Overlaps some alpha, maybe not all.
781 // Update bitmaps telling which ASCII letters are in the set.
782 Rune lo1 = std::max<Rune>(lo, 'A');
783 Rune hi1 = std::min<Rune>(hi, 'Z');
784 if (lo1 <= hi1)
785 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
786
787 lo1 = std::max<Rune>(lo, 'a');
788 hi1 = std::min<Rune>(hi, 'z');
789 if (lo1 <= hi1)
790 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
791 }
792
793 { // Check whether lo, hi is already in the class.
794 iterator it = ranges_.find(RuneRange(lo, lo));
795 if (it != end() && it->lo <= lo && hi <= it->hi)
796 return false;
797 }
798
799 // Look for a range abutting lo on the left.
800 // If it exists, take it out and increase our range.
801 if (lo > 0) {
802 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
803 if (it != end()) {
804 lo = it->lo;
805 if (it->hi > hi)
806 hi = it->hi;
807 nrunes_ -= it->hi - it->lo + 1;
808 ranges_.erase(it);
809 }
810 }
811
812 // Look for a range abutting hi on the right.
813 // If it exists, take it out and increase our range.
814 if (hi < Runemax) {
815 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
816 if (it != end()) {
817 hi = it->hi;
818 nrunes_ -= it->hi - it->lo + 1;
819 ranges_.erase(it);
820 }
821 }
822
823 // Look for ranges between lo and hi. Take them out.
824 // This is only safe because the set has no overlapping ranges.
825 // We've already removed any ranges abutting lo and hi, so
826 // any that overlap [lo, hi] must be contained within it.
827 for (;;) {
828 iterator it = ranges_.find(RuneRange(lo, hi));
829 if (it == end())
830 break;
831 nrunes_ -= it->hi - it->lo + 1;
832 ranges_.erase(it);
833 }
834
835 // Finally, add [lo, hi].
836 nrunes_ += hi - lo + 1;
837 ranges_.insert(RuneRange(lo, hi));
838 return true;
839 }
840
AddCharClass(CharClassBuilder * cc)841 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
842 for (iterator it = cc->begin(); it != cc->end(); ++it)
843 AddRange(it->lo, it->hi);
844 }
845
Contains(Rune r)846 bool CharClassBuilder::Contains(Rune r) {
847 return ranges_.find(RuneRange(r, r)) != end();
848 }
849
850 // Does the character class behave the same on A-Z as on a-z?
FoldsASCII()851 bool CharClassBuilder::FoldsASCII() {
852 return ((upper_ ^ lower_) & AlphaMask) == 0;
853 }
854
Copy()855 CharClassBuilder* CharClassBuilder::Copy() {
856 CharClassBuilder* cc = new CharClassBuilder;
857 for (iterator it = begin(); it != end(); ++it)
858 cc->ranges_.insert(RuneRange(it->lo, it->hi));
859 cc->upper_ = upper_;
860 cc->lower_ = lower_;
861 cc->nrunes_ = nrunes_;
862 return cc;
863 }
864
865
866
RemoveAbove(Rune r)867 void CharClassBuilder::RemoveAbove(Rune r) {
868 if (r >= Runemax)
869 return;
870
871 if (r < 'z') {
872 if (r < 'a')
873 lower_ = 0;
874 else
875 lower_ &= AlphaMask >> ('z' - r);
876 }
877
878 if (r < 'Z') {
879 if (r < 'A')
880 upper_ = 0;
881 else
882 upper_ &= AlphaMask >> ('Z' - r);
883 }
884
885 for (;;) {
886
887 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
888 if (it == end())
889 break;
890 RuneRange rr = *it;
891 ranges_.erase(it);
892 nrunes_ -= rr.hi - rr.lo + 1;
893 if (rr.lo <= r) {
894 rr.hi = r;
895 ranges_.insert(rr);
896 nrunes_ += rr.hi - rr.lo + 1;
897 }
898 }
899 }
900
Negate()901 void CharClassBuilder::Negate() {
902 // Build up negation and then copy in.
903 // Could edit ranges in place, but C++ won't let me.
904 std::vector<RuneRange> v;
905 v.reserve(ranges_.size() + 1);
906
907 // In negation, first range begins at 0, unless
908 // the current class begins at 0.
909 iterator it = begin();
910 if (it == end()) {
911 v.push_back(RuneRange(0, Runemax));
912 } else {
913 int nextlo = 0;
914 if (it->lo == 0) {
915 nextlo = it->hi + 1;
916 ++it;
917 }
918 for (; it != end(); ++it) {
919 v.push_back(RuneRange(nextlo, it->lo - 1));
920 nextlo = it->hi + 1;
921 }
922 if (nextlo <= Runemax)
923 v.push_back(RuneRange(nextlo, Runemax));
924 }
925
926 ranges_.clear();
927 for (size_t i = 0; i < v.size(); i++)
928 ranges_.insert(v[i]);
929
930 upper_ = AlphaMask & ~upper_;
931 lower_ = AlphaMask & ~lower_;
932 nrunes_ = Runemax+1 - nrunes_;
933 }
934
935 // Character class is a sorted list of ranges.
936 // The ranges are allocated in the same block as the header,
937 // necessitating a special allocator and Delete method.
938
New(size_t maxranges)939 CharClass* CharClass::New(size_t maxranges) {
940 CharClass* cc;
941 uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
942 cc = reinterpret_cast<CharClass*>(data);
943 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
944 cc->nranges_ = 0;
945 cc->folds_ascii_ = false;
946 cc->nrunes_ = 0;
947 return cc;
948 }
949
Delete()950 void CharClass::Delete() {
951 uint8_t* data = reinterpret_cast<uint8_t*>(this);
952 delete[] data;
953 }
954
Negate()955 CharClass* CharClass::Negate() {
956 CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
957 cc->folds_ascii_ = folds_ascii_;
958 cc->nrunes_ = Runemax + 1 - nrunes_;
959 int n = 0;
960 int nextlo = 0;
961 for (CharClass::iterator it = begin(); it != end(); ++it) {
962 if (it->lo == nextlo) {
963 nextlo = it->hi + 1;
964 } else {
965 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
966 nextlo = it->hi + 1;
967 }
968 }
969 if (nextlo <= Runemax)
970 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
971 cc->nranges_ = n;
972 return cc;
973 }
974
Contains(Rune r) const975 bool CharClass::Contains(Rune r) const {
976 RuneRange* rr = ranges_;
977 int n = nranges_;
978 while (n > 0) {
979 int m = n/2;
980 if (rr[m].hi < r) {
981 rr += m+1;
982 n -= m+1;
983 } else if (r < rr[m].lo) {
984 n = m;
985 } else { // rr[m].lo <= r && r <= rr[m].hi
986 return true;
987 }
988 }
989 return false;
990 }
991
GetCharClass()992 CharClass* CharClassBuilder::GetCharClass() {
993 CharClass* cc = CharClass::New(ranges_.size());
994 int n = 0;
995 for (iterator it = begin(); it != end(); ++it)
996 cc->ranges_[n++] = *it;
997 cc->nranges_ = n;
998 ABSL_DCHECK_LE(n, static_cast<int>(ranges_.size()));
999 cc->nrunes_ = nrunes_;
1000 cc->folds_ascii_ = FoldsASCII();
1001 return cc;
1002 }
1003
1004 } // namespace re2
1005