• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_REGEXP_H_
6 #define RE2_REGEXP_H_
7 
8 // --- SPONSORED LINK --------------------------------------------------
9 // If you want to use this library for regular expression matching,
10 // you should use re2/re2.h, which provides a class RE2 that
11 // mimics the PCRE interface provided by PCRE's C++ wrappers.
12 // This header describes the low-level interface used to implement RE2
13 // and may change in backwards-incompatible ways from time to time.
14 // In contrast, RE2's interface will not.
15 // ---------------------------------------------------------------------
16 
17 // Regular expression library: parsing, execution, and manipulation
18 // of regular expressions.
19 //
20 // Any operation that traverses the Regexp structures should be written
21 // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
22 // regular expressions such as x++++++++++++++++++++... might cause recursive
23 // traversals to overflow the stack.
24 //
25 // It is the caller's responsibility to provide appropriate mutual exclusion
26 // around manipulation of the regexps.  RE2 does this.
27 //
28 // PARSING
29 //
30 // Regexp::Parse parses regular expressions encoded in UTF-8.
31 // The default syntax is POSIX extended regular expressions,
32 // with the following changes:
33 //
34 //   1.  Backreferences (optional in POSIX EREs) are not supported.
35 //         (Supporting them precludes the use of DFA-based
36 //          matching engines.)
37 //
38 //   2.  Collating elements and collation classes are not supported.
39 //         (No one has needed or wanted them.)
40 //
41 // The exact syntax accepted can be modified by passing flags to
42 // Regexp::Parse.  In particular, many of the basic Perl additions
43 // are available.  The flags are documented below (search for LikePerl).
44 //
45 // If parsed with the flag Regexp::Latin1, both the regular expression
46 // and the input to the matching routines are assumed to be encoded in
47 // Latin-1, not UTF-8.
48 //
49 // EXECUTION
50 //
51 // Once Regexp has parsed a regular expression, it provides methods
52 // to search text using that regular expression.  These methods are
53 // implemented via calling out to other regular expression libraries.
54 // (Let's call them the sublibraries.)
55 //
56 // To call a sublibrary, Regexp does not simply prepare a
57 // string version of the regular expression and hand it to the
58 // sublibrary.  Instead, Regexp prepares, from its own parsed form, the
59 // corresponding internal representation used by the sublibrary.
60 // This has the drawback of needing to know the internal representation
61 // used by the sublibrary, but it has two important benefits:
62 //
63 //   1. The syntax and meaning of regular expressions is guaranteed
64 //      to be that used by Regexp's parser, not the syntax expected
65 //      by the sublibrary.  Regexp might accept a restricted or
66 //      expanded syntax for regular expressions as compared with
67 //      the sublibrary.  As long as Regexp can translate from its
68 //      internal form into the sublibrary's, clients need not know
69 //      exactly which sublibrary they are using.
70 //
71 //   2. The sublibrary parsers are bypassed.  For whatever reason,
72 //      sublibrary regular expression parsers often have security
73 //      problems.  For example, plan9grep's regular expression parser
74 //      has a buffer overflow in its handling of large character
75 //      classes, and PCRE's parser has had buffer overflow problems
76 //      in the past.  Security-team requires sandboxing of sublibrary
77 //      regular expression parsers.  Avoiding the sublibrary parsers
78 //      avoids the sandbox.
79 //
80 // The execution methods we use now are provided by the compiled form,
81 // Prog, described in prog.h
82 //
83 // MANIPULATION
84 //
85 // Unlike other regular expression libraries, Regexp makes its parsed
86 // form accessible to clients, so that client code can analyze the
87 // parsed regular expressions.
88 
89 #include <stddef.h>
90 #include <stdint.h>
91 
92 #include <map>
93 #include <set>
94 #include <string>
95 
96 #include "absl/log/absl_check.h"
97 #include "absl/log/absl_log.h"
98 #include "absl/strings/string_view.h"
99 #include "util/utf.h"
100 
101 namespace re2 {
102 
103 // Keep in sync with string list kOpcodeNames[] in testing/dump.cc
104 enum RegexpOp {
105   // Matches no strings.
106   kRegexpNoMatch = 1,
107 
108   // Matches empty string.
109   kRegexpEmptyMatch,
110 
111   // Matches rune_.
112   kRegexpLiteral,
113 
114   // Matches runes_.
115   kRegexpLiteralString,
116 
117   // Matches concatenation of sub_[0..nsub-1].
118   kRegexpConcat,
119   // Matches union of sub_[0..nsub-1].
120   kRegexpAlternate,
121 
122   // Matches sub_[0] zero or more times.
123   kRegexpStar,
124   // Matches sub_[0] one or more times.
125   kRegexpPlus,
126   // Matches sub_[0] zero or one times.
127   kRegexpQuest,
128 
129   // Matches sub_[0] at least min_ times, at most max_ times.
130   // max_ == -1 means no upper limit.
131   kRegexpRepeat,
132 
133   // Parenthesized (capturing) subexpression.  Index is cap_.
134   // Optionally, capturing name is name_.
135   kRegexpCapture,
136 
137   // Matches any character.
138   kRegexpAnyChar,
139 
140   // Matches any byte [sic].
141   kRegexpAnyByte,
142 
143   // Matches empty string at beginning of line.
144   kRegexpBeginLine,
145   // Matches empty string at end of line.
146   kRegexpEndLine,
147 
148   // Matches word boundary "\b".
149   kRegexpWordBoundary,
150   // Matches not-a-word boundary "\B".
151   kRegexpNoWordBoundary,
152 
153   // Matches empty string at beginning of text.
154   kRegexpBeginText,
155   // Matches empty string at end of text.
156   kRegexpEndText,
157 
158   // Matches character class given by cc_.
159   kRegexpCharClass,
160 
161   // Forces match of entire expression right now,
162   // with match ID match_id_ (used by RE2::Set).
163   kRegexpHaveMatch,
164 
165   kMaxRegexpOp = kRegexpHaveMatch,
166 };
167 
168 // Keep in sync with string list in regexp.cc
169 enum RegexpStatusCode {
170   // No error
171   kRegexpSuccess = 0,
172 
173   // Unexpected error
174   kRegexpInternalError,
175 
176   // Parse errors
177   kRegexpBadEscape,          // bad escape sequence
178   kRegexpBadCharClass,       // bad character class
179   kRegexpBadCharRange,       // bad character class range
180   kRegexpMissingBracket,     // missing closing ]
181   kRegexpMissingParen,       // missing closing )
182   kRegexpUnexpectedParen,    // unexpected closing )
183   kRegexpTrailingBackslash,  // at end of regexp
184   kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
185   kRegexpRepeatSize,         // bad repetition argument
186   kRegexpRepeatOp,           // bad repetition operator
187   kRegexpBadPerlOp,          // bad perl operator
188   kRegexpBadUTF8,            // invalid UTF-8 in regexp
189   kRegexpBadNamedCapture,    // bad named capture
190 };
191 
192 // Error status for certain operations.
193 class RegexpStatus {
194  public:
RegexpStatus()195   RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus()196   ~RegexpStatus() { delete tmp_; }
197 
set_code(RegexpStatusCode code)198   void set_code(RegexpStatusCode code) { code_ = code; }
set_error_arg(absl::string_view error_arg)199   void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; }
set_tmp(std::string * tmp)200   void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
code()201   RegexpStatusCode code() const { return code_; }
error_arg()202   absl::string_view error_arg() const { return error_arg_; }
ok()203   bool ok() const { return code() == kRegexpSuccess; }
204 
205   // Copies state from status.
206   void Copy(const RegexpStatus& status);
207 
208   // Returns text equivalent of code, e.g.:
209   //   "Bad character class"
210   static std::string CodeText(RegexpStatusCode code);
211 
212   // Returns text describing error, e.g.:
213   //   "Bad character class: [z-a]"
214   std::string Text() const;
215 
216  private:
217   RegexpStatusCode code_;        // Kind of error.
218   absl::string_view error_arg_;  // Piece of regexp containing syntax error.
219   std::string* tmp_;             // Temporary storage, possibly for error_arg_.
220 
221   RegexpStatus(const RegexpStatus&) = delete;
222   RegexpStatus& operator=(const RegexpStatus&) = delete;
223 };
224 
225 // Compiled form; see prog.h
226 class Prog;
227 
228 struct RuneRange {
RuneRangeRuneRange229   RuneRange() : lo(0), hi(0) { }
RuneRangeRuneRange230   RuneRange(int l, int h) : lo(l), hi(h) { }
231   Rune lo;
232   Rune hi;
233 };
234 
235 // Less-than on RuneRanges treats a == b if they overlap at all.
236 // This lets us look in a set to find the range covering a particular Rune.
237 struct RuneRangeLess {
operatorRuneRangeLess238   bool operator()(const RuneRange& a, const RuneRange& b) const {
239     return a.hi < b.lo;
240   }
241 };
242 
243 class CharClassBuilder;
244 
245 class CharClass {
246  public:
247   void Delete();
248 
249   typedef RuneRange* iterator;
begin()250   iterator begin() { return ranges_; }
end()251   iterator end() { return ranges_ + nranges_; }
252 
size()253   int size() { return nrunes_; }
empty()254   bool empty() { return nrunes_ == 0; }
full()255   bool full() { return nrunes_ == Runemax+1; }
FoldsASCII()256   bool FoldsASCII() { return folds_ascii_; }
257 
258   bool Contains(Rune r) const;
259   CharClass* Negate();
260 
261  private:
262   CharClass();  // not implemented
263   ~CharClass();  // not implemented
264   static CharClass* New(size_t maxranges);
265 
266   friend class CharClassBuilder;
267 
268   bool folds_ascii_;
269   int nrunes_;
270   RuneRange *ranges_;
271   int nranges_;
272 
273   CharClass(const CharClass&) = delete;
274   CharClass& operator=(const CharClass&) = delete;
275 };
276 
277 class Regexp {
278  public:
279 
280   // Flags for parsing.  Can be ORed together.
281   enum ParseFlags {
282     NoParseFlags  = 0,
283     FoldCase      = 1<<0,   // Fold case during matching (case-insensitive).
284     Literal       = 1<<1,   // Treat s as literal string instead of a regexp.
285     ClassNL       = 1<<2,   // Allow char classes like [^a-z] and \D and \s
286                             // and [[:space:]] to match newline.
287     DotNL         = 1<<3,   // Allow . to match newline.
288     MatchNL       = ClassNL | DotNL,
289     OneLine       = 1<<4,   // Treat ^ and $ as only matching at beginning and
290                             // end of text, not around embedded newlines.
291                             // (Perl's default)
292     Latin1        = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
293     NonGreedy     = 1<<6,   // Repetition operators are non-greedy by default.
294     PerlClasses   = 1<<7,   // Allow Perl character classes like \d.
295     PerlB         = 1<<8,   // Allow Perl's \b and \B.
296     PerlX         = 1<<9,   // Perl extensions:
297                             //   non-capturing parens - (?: )
298                             //   non-greedy operators - *? +? ?? {}?
299                             //   flag edits - (?i) (?-i) (?i: )
300                             //     i - FoldCase
301                             //     m - !OneLine
302                             //     s - DotNL
303                             //     U - NonGreedy
304                             //   line ends: \A \z
305                             //   \Q and \E to disable/enable metacharacters
306                             //   (?P<name>expr) for named captures
307                             //   \C to match any single byte
308     UnicodeGroups = 1<<10,  // Allow \p{Han} for Unicode Han group
309                             //   and \P{Han} for its negation.
310     NeverNL       = 1<<11,  // Never match NL, even if the regexp mentions
311                             //   it explicitly.
312     NeverCapture  = 1<<12,  // Parse all parens as non-capturing.
313 
314     // As close to Perl as we can get.
315     LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
316                     UnicodeGroups,
317 
318     // Internal use only.
319     WasDollar     = 1<<13,  // on kRegexpEndText: was $ in regexp text
320     AllParseFlags = (1<<14)-1,
321   };
322 
323   // Get.  No set, Regexps are logically immutable once created.
op()324   RegexpOp op() { return static_cast<RegexpOp>(op_); }
nsub()325   int nsub() { return nsub_; }
simple()326   bool simple() { return simple_ != 0; }
parse_flags()327   ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
328   int Ref();  // For testing.
329 
sub()330   Regexp** sub() {
331     if(nsub_ <= 1)
332       return &subone_;
333     else
334       return submany_;
335   }
336 
min()337   int min() {
338     ABSL_DCHECK_EQ(op_, kRegexpRepeat);
339     return min_;
340   }
max()341   int max() {
342     ABSL_DCHECK_EQ(op_, kRegexpRepeat);
343     return max_;
344   }
rune()345   Rune rune() {
346     ABSL_DCHECK_EQ(op_, kRegexpLiteral);
347     return rune_;
348   }
cc()349   CharClass* cc() {
350     ABSL_DCHECK_EQ(op_, kRegexpCharClass);
351     return cc_;
352   }
cap()353   int cap() {
354     ABSL_DCHECK_EQ(op_, kRegexpCapture);
355     return cap_;
356   }
name()357   const std::string* name() {
358     ABSL_DCHECK_EQ(op_, kRegexpCapture);
359     return name_;
360   }
runes()361   Rune* runes() {
362     ABSL_DCHECK_EQ(op_, kRegexpLiteralString);
363     return runes_;
364   }
nrunes()365   int nrunes() {
366     ABSL_DCHECK_EQ(op_, kRegexpLiteralString);
367     return nrunes_;
368   }
match_id()369   int match_id() {
370     ABSL_DCHECK_EQ(op_, kRegexpHaveMatch);
371     return match_id_;
372   }
373 
374   // Increments reference count, returns object as convenience.
375   Regexp* Incref();
376 
377   // Decrements reference count and deletes this object if count reaches 0.
378   void Decref();
379 
380   // Parses string s to produce regular expression, returned.
381   // Caller must release return value with re->Decref().
382   // On failure, sets *status (if status != NULL) and returns NULL.
383   static Regexp* Parse(absl::string_view s, ParseFlags flags,
384                        RegexpStatus* status);
385 
386   // Returns a _new_ simplified version of the current regexp.
387   // Does not edit the current regexp.
388   // Caller must release return value with re->Decref().
389   // Simplified means that counted repetition has been rewritten
390   // into simpler terms and all Perl/POSIX features have been
391   // removed.  The result will capture exactly the same
392   // subexpressions the original did, unless formatted with ToString.
393   Regexp* Simplify();
394   friend class CoalesceWalker;
395   friend class SimplifyWalker;
396 
397   // Parses the regexp src and then simplifies it and sets *dst to the
398   // string representation of the simplified form.  Returns true on success.
399   // Returns false and sets *status (if status != NULL) on parse error.
400   static bool SimplifyRegexp(absl::string_view src, ParseFlags flags,
401                              std::string* dst, RegexpStatus* status);
402 
403   // Returns the number of capturing groups in the regexp.
404   int NumCaptures();
405   friend class NumCapturesWalker;
406 
407   // Returns a map from names to capturing group indices,
408   // or NULL if the regexp contains no named capture groups.
409   // The caller is responsible for deleting the map.
410   std::map<std::string, int>* NamedCaptures();
411 
412   // Returns a map from capturing group indices to capturing group
413   // names or NULL if the regexp contains no named capture groups. The
414   // caller is responsible for deleting the map.
415   std::map<int, std::string>* CaptureNames();
416 
417   // Returns a string representation of the current regexp,
418   // using as few parentheses as possible.
419   std::string ToString();
420 
421   // Convenience functions.  They consume the passed reference,
422   // so in many cases you should use, e.g., Plus(re->Incref(), flags).
423   // They do not consume allocated arrays like subs or runes.
424   static Regexp* Plus(Regexp* sub, ParseFlags flags);
425   static Regexp* Star(Regexp* sub, ParseFlags flags);
426   static Regexp* Quest(Regexp* sub, ParseFlags flags);
427   static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
428   static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
429   static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
430   static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
431   static Regexp* NewLiteral(Rune rune, ParseFlags flags);
432   static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
433   static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
434   static Regexp* HaveMatch(int match_id, ParseFlags flags);
435 
436   // Like Alternate but does not factor out common prefixes.
437   static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
438 
439   // Debugging function.  Returns string format for regexp
440   // that makes structure clear.  Does NOT use regexp syntax.
441   std::string Dump();
442 
443   // Helper traversal class, defined fully in walker-inl.h.
444   template<typename T> class Walker;
445 
446   // Compile to Prog.  See prog.h
447   // Reverse prog expects to be run over text backward.
448   // Construction and execution of prog will
449   // stay within approximately max_mem bytes of memory.
450   // If max_mem <= 0, a reasonable default is used.
451   Prog* CompileToProg(int64_t max_mem);
452   Prog* CompileToReverseProg(int64_t max_mem);
453 
454   // Whether to expect this library to find exactly the same answer as PCRE
455   // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
456   // obscure cases behave differently.  Technically this is more a property
457   // of the Prog than the Regexp, but the computation is much easier to do
458   // on the Regexp.  See mimics_pcre.cc for the exact conditions.
459   bool MimicsPCRE();
460 
461   // Benchmarking function.
462   void NullWalk();
463 
464   // Whether every match of this regexp must be anchored and
465   // begin with a non-empty fixed string (perhaps after ASCII
466   // case-folding).  If so, returns the prefix and the sub-regexp that
467   // follows it.
468   // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
469   // regardless of the return value.
470   bool RequiredPrefix(std::string* prefix, bool* foldcase,
471                       Regexp** suffix);
472 
473   // Whether every match of this regexp must be unanchored and
474   // begin with a non-empty fixed string (perhaps after ASCII
475   // case-folding).  If so, returns the prefix.
476   // Callers should expect *prefix and *foldcase to be "zeroed"
477   // regardless of the return value.
478   bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
479 
480   // Controls the maximum repeat count permitted by the parser.
481   // FOR FUZZING ONLY.
482   static void FUZZING_ONLY_set_maximum_repeat_count(int i);
483 
484  private:
485   // Constructor allocates vectors as appropriate for operator.
486   explicit Regexp(RegexpOp op, ParseFlags parse_flags);
487 
488   // Use Decref() instead of delete to release Regexps.
489   // This is private to catch deletes at compile time.
490   ~Regexp();
491   void Destroy();
492   bool QuickDestroy();
493 
494   // Helpers for Parse.  Listed here so they can edit Regexps.
495   class ParseState;
496 
497   friend class ParseState;
498   friend bool ParseCharClass(absl::string_view* s, Regexp** out_re,
499                              RegexpStatus* status);
500 
501   // Helper for testing [sic].
502   friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
503 
504   // Computes whether Regexp is already simple.
505   bool ComputeSimple();
506 
507   // Constructor that generates a Star, Plus or Quest,
508   // squashing the pair if sub is also a Star, Plus or Quest.
509   static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
510 
511   // Constructor that generates a concatenation or alternation,
512   // enforcing the limit on the number of subexpressions for
513   // a particular Regexp.
514   static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
515                                    ParseFlags flags, bool can_factor);
516 
517   // Returns the leading string that re starts with.
518   // The returned Rune* points into a piece of re,
519   // so it must not be used after the caller calls re->Decref().
520   static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
521 
522   // Removes the first n leading runes from the beginning of re.
523   // Edits re in place.
524   static void RemoveLeadingString(Regexp* re, int n);
525 
526   // Returns the leading regexp in re's top-level concatenation.
527   // The returned Regexp* points at re or a sub-expression of re,
528   // so it must not be used after the caller calls re->Decref().
529   static Regexp* LeadingRegexp(Regexp* re);
530 
531   // Removes LeadingRegexp(re) from re and returns the remainder.
532   // Might edit re in place.
533   static Regexp* RemoveLeadingRegexp(Regexp* re);
534 
535   // Simplifies an alternation of literal strings by factoring out
536   // common prefixes.
537   static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
538   friend class FactorAlternationImpl;
539 
540   // Is a == b?  Only efficient on regexps that have not been through
541   // Simplify yet - the expansion of a kRegexpRepeat will make this
542   // take a long time.  Do not call on such regexps, hence private.
543   static bool Equal(Regexp* a, Regexp* b);
544 
545   // Allocate space for n sub-regexps.
AllocSub(int n)546   void AllocSub(int n) {
547     ABSL_DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
548     if (n > 1)
549       submany_ = new Regexp*[n];
550     nsub_ = static_cast<uint16_t>(n);
551   }
552 
553   // Add Rune to LiteralString
554   void AddRuneToString(Rune r);
555 
556   // Swaps this with that, in place.
557   void Swap(Regexp *that);
558 
559   // Operator.  See description of operators above.
560   // uint8_t instead of RegexpOp to control space usage.
561   uint8_t op_;
562 
563   // Is this regexp structure already simple
564   // (has it been returned by Simplify)?
565   // uint8_t instead of bool to control space usage.
566   uint8_t simple_;
567 
568   // Flags saved from parsing and used during execution.
569   // (Only FoldCase is used.)
570   // uint16_t instead of ParseFlags to control space usage.
571   uint16_t parse_flags_;
572 
573   // Reference count.  Exists so that SimplifyRegexp can build
574   // regexp structures that are dags rather than trees to avoid
575   // exponential blowup in space requirements.
576   // uint16_t to control space usage.
577   // The standard regexp routines will never generate a
578   // ref greater than the maximum repeat count (kMaxRepeat),
579   // but even so, Incref and Decref consult an overflow map
580   // when ref_ reaches kMaxRef.
581   uint16_t ref_;
582   static const uint16_t kMaxRef = 0xffff;
583 
584   // Subexpressions.
585   // uint16_t to control space usage.
586   // Concat and Alternate handle larger numbers of subexpressions
587   // by building concatenation or alternation trees.
588   // Other routines should call Concat or Alternate instead of
589   // filling in sub() by hand.
590   uint16_t nsub_;
591   static const uint16_t kMaxNsub = 0xffff;
592   union {
593     Regexp** submany_;  // if nsub_ > 1
594     Regexp* subone_;  // if nsub_ == 1
595   };
596 
597   // Extra space for parse and teardown stacks.
598   Regexp* down_;
599 
600   // Arguments to operator.  See description of operators above.
601   union {
602     struct {  // Repeat
603       int max_;
604       int min_;
605     };
606     struct {  // Capture
607       int cap_;
608       std::string* name_;
609     };
610     struct {  // LiteralString
611       int nrunes_;
612       Rune* runes_;
613     };
614     struct {  // CharClass
615       // These two could be in separate union members,
616       // but it wouldn't save any space (there are other two-word structs)
617       // and keeping them separate avoids confusion during parsing.
618       CharClass* cc_;
619       CharClassBuilder* ccb_;
620     };
621     Rune rune_;  // Literal
622     int match_id_;  // HaveMatch
623     void *the_union_[2];  // as big as any other element, for memset
624   };
625 
626   Regexp(const Regexp&) = delete;
627   Regexp& operator=(const Regexp&) = delete;
628 };
629 
630 // Character class set: contains non-overlapping, non-abutting RuneRanges.
631 typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
632 
633 class CharClassBuilder {
634  public:
635   CharClassBuilder();
636 
637   typedef RuneRangeSet::iterator iterator;
begin()638   iterator begin() { return ranges_.begin(); }
end()639   iterator end() { return ranges_.end(); }
640 
size()641   int size() { return nrunes_; }
empty()642   bool empty() { return nrunes_ == 0; }
full()643   bool full() { return nrunes_ == Runemax+1; }
644 
645   bool Contains(Rune r);
646   bool FoldsASCII();
647   bool AddRange(Rune lo, Rune hi);  // returns whether class changed
648   CharClassBuilder* Copy();
649   void AddCharClass(CharClassBuilder* cc);
650   void Negate();
651   void RemoveAbove(Rune r);
652   CharClass* GetCharClass();
653   void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
654 
655  private:
656   static const uint32_t AlphaMask = (1<<26) - 1;
657   uint32_t upper_;  // bitmap of A-Z
658   uint32_t lower_;  // bitmap of a-z
659   int nrunes_;
660   RuneRangeSet ranges_;
661 
662   CharClassBuilder(const CharClassBuilder&) = delete;
663   CharClassBuilder& operator=(const CharClassBuilder&) = delete;
664 };
665 
666 // Bitwise ops on ParseFlags produce ParseFlags.
667 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
668                                     Regexp::ParseFlags b) {
669   return static_cast<Regexp::ParseFlags>(
670       static_cast<int>(a) | static_cast<int>(b));
671 }
672 
673 inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
674                                     Regexp::ParseFlags b) {
675   return static_cast<Regexp::ParseFlags>(
676       static_cast<int>(a) ^ static_cast<int>(b));
677 }
678 
679 inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
680                                     Regexp::ParseFlags b) {
681   return static_cast<Regexp::ParseFlags>(
682       static_cast<int>(a) & static_cast<int>(b));
683 }
684 
685 inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
686   // Attempting to produce a value out of enum's range has undefined behaviour.
687   return static_cast<Regexp::ParseFlags>(
688       ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
689 }
690 
691 }  // namespace re2
692 
693 #endif  // RE2_REGEXP_H_
694