1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_REGEXP_H_ 6 #define RE2_REGEXP_H_ 7 8 // --- SPONSORED LINK -------------------------------------------------- 9 // If you want to use this library for regular expression matching, 10 // you should use re2/re2.h, which provides a class RE2 that 11 // mimics the PCRE interface provided by PCRE's C++ wrappers. 12 // This header describes the low-level interface used to implement RE2 13 // and may change in backwards-incompatible ways from time to time. 14 // In contrast, RE2's interface will not. 15 // --------------------------------------------------------------------- 16 17 // Regular expression library: parsing, execution, and manipulation 18 // of regular expressions. 19 // 20 // Any operation that traverses the Regexp structures should be written 21 // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested 22 // regular expressions such as x++++++++++++++++++++... might cause recursive 23 // traversals to overflow the stack. 24 // 25 // It is the caller's responsibility to provide appropriate mutual exclusion 26 // around manipulation of the regexps. RE2 does this. 27 // 28 // PARSING 29 // 30 // Regexp::Parse parses regular expressions encoded in UTF-8. 31 // The default syntax is POSIX extended regular expressions, 32 // with the following changes: 33 // 34 // 1. Backreferences (optional in POSIX EREs) are not supported. 35 // (Supporting them precludes the use of DFA-based 36 // matching engines.) 37 // 38 // 2. Collating elements and collation classes are not supported. 39 // (No one has needed or wanted them.) 40 // 41 // The exact syntax accepted can be modified by passing flags to 42 // Regexp::Parse. In particular, many of the basic Perl additions 43 // are available. The flags are documented below (search for LikePerl). 44 // 45 // If parsed with the flag Regexp::Latin1, both the regular expression 46 // and the input to the matching routines are assumed to be encoded in 47 // Latin-1, not UTF-8. 48 // 49 // EXECUTION 50 // 51 // Once Regexp has parsed a regular expression, it provides methods 52 // to search text using that regular expression. These methods are 53 // implemented via calling out to other regular expression libraries. 54 // (Let's call them the sublibraries.) 55 // 56 // To call a sublibrary, Regexp does not simply prepare a 57 // string version of the regular expression and hand it to the 58 // sublibrary. Instead, Regexp prepares, from its own parsed form, the 59 // corresponding internal representation used by the sublibrary. 60 // This has the drawback of needing to know the internal representation 61 // used by the sublibrary, but it has two important benefits: 62 // 63 // 1. The syntax and meaning of regular expressions is guaranteed 64 // to be that used by Regexp's parser, not the syntax expected 65 // by the sublibrary. Regexp might accept a restricted or 66 // expanded syntax for regular expressions as compared with 67 // the sublibrary. As long as Regexp can translate from its 68 // internal form into the sublibrary's, clients need not know 69 // exactly which sublibrary they are using. 70 // 71 // 2. The sublibrary parsers are bypassed. For whatever reason, 72 // sublibrary regular expression parsers often have security 73 // problems. For example, plan9grep's regular expression parser 74 // has a buffer overflow in its handling of large character 75 // classes, and PCRE's parser has had buffer overflow problems 76 // in the past. Security-team requires sandboxing of sublibrary 77 // regular expression parsers. Avoiding the sublibrary parsers 78 // avoids the sandbox. 79 // 80 // The execution methods we use now are provided by the compiled form, 81 // Prog, described in prog.h 82 // 83 // MANIPULATION 84 // 85 // Unlike other regular expression libraries, Regexp makes its parsed 86 // form accessible to clients, so that client code can analyze the 87 // parsed regular expressions. 88 89 #include <stddef.h> 90 #include <stdint.h> 91 92 #include <map> 93 #include <set> 94 #include <string> 95 96 #include "absl/log/absl_check.h" 97 #include "absl/log/absl_log.h" 98 #include "absl/strings/string_view.h" 99 #include "util/utf.h" 100 101 namespace re2 { 102 103 // Keep in sync with string list kOpcodeNames[] in testing/dump.cc 104 enum RegexpOp { 105 // Matches no strings. 106 kRegexpNoMatch = 1, 107 108 // Matches empty string. 109 kRegexpEmptyMatch, 110 111 // Matches rune_. 112 kRegexpLiteral, 113 114 // Matches runes_. 115 kRegexpLiteralString, 116 117 // Matches concatenation of sub_[0..nsub-1]. 118 kRegexpConcat, 119 // Matches union of sub_[0..nsub-1]. 120 kRegexpAlternate, 121 122 // Matches sub_[0] zero or more times. 123 kRegexpStar, 124 // Matches sub_[0] one or more times. 125 kRegexpPlus, 126 // Matches sub_[0] zero or one times. 127 kRegexpQuest, 128 129 // Matches sub_[0] at least min_ times, at most max_ times. 130 // max_ == -1 means no upper limit. 131 kRegexpRepeat, 132 133 // Parenthesized (capturing) subexpression. Index is cap_. 134 // Optionally, capturing name is name_. 135 kRegexpCapture, 136 137 // Matches any character. 138 kRegexpAnyChar, 139 140 // Matches any byte [sic]. 141 kRegexpAnyByte, 142 143 // Matches empty string at beginning of line. 144 kRegexpBeginLine, 145 // Matches empty string at end of line. 146 kRegexpEndLine, 147 148 // Matches word boundary "\b". 149 kRegexpWordBoundary, 150 // Matches not-a-word boundary "\B". 151 kRegexpNoWordBoundary, 152 153 // Matches empty string at beginning of text. 154 kRegexpBeginText, 155 // Matches empty string at end of text. 156 kRegexpEndText, 157 158 // Matches character class given by cc_. 159 kRegexpCharClass, 160 161 // Forces match of entire expression right now, 162 // with match ID match_id_ (used by RE2::Set). 163 kRegexpHaveMatch, 164 165 kMaxRegexpOp = kRegexpHaveMatch, 166 }; 167 168 // Keep in sync with string list in regexp.cc 169 enum RegexpStatusCode { 170 // No error 171 kRegexpSuccess = 0, 172 173 // Unexpected error 174 kRegexpInternalError, 175 176 // Parse errors 177 kRegexpBadEscape, // bad escape sequence 178 kRegexpBadCharClass, // bad character class 179 kRegexpBadCharRange, // bad character class range 180 kRegexpMissingBracket, // missing closing ] 181 kRegexpMissingParen, // missing closing ) 182 kRegexpUnexpectedParen, // unexpected closing ) 183 kRegexpTrailingBackslash, // at end of regexp 184 kRegexpRepeatArgument, // repeat argument missing, e.g. "*" 185 kRegexpRepeatSize, // bad repetition argument 186 kRegexpRepeatOp, // bad repetition operator 187 kRegexpBadPerlOp, // bad perl operator 188 kRegexpBadUTF8, // invalid UTF-8 in regexp 189 kRegexpBadNamedCapture, // bad named capture 190 }; 191 192 // Error status for certain operations. 193 class RegexpStatus { 194 public: RegexpStatus()195 RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} ~RegexpStatus()196 ~RegexpStatus() { delete tmp_; } 197 set_code(RegexpStatusCode code)198 void set_code(RegexpStatusCode code) { code_ = code; } set_error_arg(absl::string_view error_arg)199 void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; } set_tmp(std::string * tmp)200 void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; } code()201 RegexpStatusCode code() const { return code_; } error_arg()202 absl::string_view error_arg() const { return error_arg_; } ok()203 bool ok() const { return code() == kRegexpSuccess; } 204 205 // Copies state from status. 206 void Copy(const RegexpStatus& status); 207 208 // Returns text equivalent of code, e.g.: 209 // "Bad character class" 210 static std::string CodeText(RegexpStatusCode code); 211 212 // Returns text describing error, e.g.: 213 // "Bad character class: [z-a]" 214 std::string Text() const; 215 216 private: 217 RegexpStatusCode code_; // Kind of error. 218 absl::string_view error_arg_; // Piece of regexp containing syntax error. 219 std::string* tmp_; // Temporary storage, possibly for error_arg_. 220 221 RegexpStatus(const RegexpStatus&) = delete; 222 RegexpStatus& operator=(const RegexpStatus&) = delete; 223 }; 224 225 // Compiled form; see prog.h 226 class Prog; 227 228 struct RuneRange { RuneRangeRuneRange229 RuneRange() : lo(0), hi(0) { } RuneRangeRuneRange230 RuneRange(int l, int h) : lo(l), hi(h) { } 231 Rune lo; 232 Rune hi; 233 }; 234 235 // Less-than on RuneRanges treats a == b if they overlap at all. 236 // This lets us look in a set to find the range covering a particular Rune. 237 struct RuneRangeLess { operatorRuneRangeLess238 bool operator()(const RuneRange& a, const RuneRange& b) const { 239 return a.hi < b.lo; 240 } 241 }; 242 243 class CharClassBuilder; 244 245 class CharClass { 246 public: 247 void Delete(); 248 249 typedef RuneRange* iterator; begin()250 iterator begin() { return ranges_; } end()251 iterator end() { return ranges_ + nranges_; } 252 size()253 int size() { return nrunes_; } empty()254 bool empty() { return nrunes_ == 0; } full()255 bool full() { return nrunes_ == Runemax+1; } FoldsASCII()256 bool FoldsASCII() { return folds_ascii_; } 257 258 bool Contains(Rune r) const; 259 CharClass* Negate(); 260 261 private: 262 CharClass(); // not implemented 263 ~CharClass(); // not implemented 264 static CharClass* New(size_t maxranges); 265 266 friend class CharClassBuilder; 267 268 bool folds_ascii_; 269 int nrunes_; 270 RuneRange *ranges_; 271 int nranges_; 272 273 CharClass(const CharClass&) = delete; 274 CharClass& operator=(const CharClass&) = delete; 275 }; 276 277 class Regexp { 278 public: 279 280 // Flags for parsing. Can be ORed together. 281 enum ParseFlags { 282 NoParseFlags = 0, 283 FoldCase = 1<<0, // Fold case during matching (case-insensitive). 284 Literal = 1<<1, // Treat s as literal string instead of a regexp. 285 ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s 286 // and [[:space:]] to match newline. 287 DotNL = 1<<3, // Allow . to match newline. 288 MatchNL = ClassNL | DotNL, 289 OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and 290 // end of text, not around embedded newlines. 291 // (Perl's default) 292 Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. 293 NonGreedy = 1<<6, // Repetition operators are non-greedy by default. 294 PerlClasses = 1<<7, // Allow Perl character classes like \d. 295 PerlB = 1<<8, // Allow Perl's \b and \B. 296 PerlX = 1<<9, // Perl extensions: 297 // non-capturing parens - (?: ) 298 // non-greedy operators - *? +? ?? {}? 299 // flag edits - (?i) (?-i) (?i: ) 300 // i - FoldCase 301 // m - !OneLine 302 // s - DotNL 303 // U - NonGreedy 304 // line ends: \A \z 305 // \Q and \E to disable/enable metacharacters 306 // (?P<name>expr) for named captures 307 // \C to match any single byte 308 UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group 309 // and \P{Han} for its negation. 310 NeverNL = 1<<11, // Never match NL, even if the regexp mentions 311 // it explicitly. 312 NeverCapture = 1<<12, // Parse all parens as non-capturing. 313 314 // As close to Perl as we can get. 315 LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | 316 UnicodeGroups, 317 318 // Internal use only. 319 WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text 320 AllParseFlags = (1<<14)-1, 321 }; 322 323 // Get. No set, Regexps are logically immutable once created. op()324 RegexpOp op() { return static_cast<RegexpOp>(op_); } nsub()325 int nsub() { return nsub_; } simple()326 bool simple() { return simple_ != 0; } parse_flags()327 ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); } 328 int Ref(); // For testing. 329 sub()330 Regexp** sub() { 331 if(nsub_ <= 1) 332 return &subone_; 333 else 334 return submany_; 335 } 336 min()337 int min() { 338 ABSL_DCHECK_EQ(op_, kRegexpRepeat); 339 return min_; 340 } max()341 int max() { 342 ABSL_DCHECK_EQ(op_, kRegexpRepeat); 343 return max_; 344 } rune()345 Rune rune() { 346 ABSL_DCHECK_EQ(op_, kRegexpLiteral); 347 return rune_; 348 } cc()349 CharClass* cc() { 350 ABSL_DCHECK_EQ(op_, kRegexpCharClass); 351 return cc_; 352 } cap()353 int cap() { 354 ABSL_DCHECK_EQ(op_, kRegexpCapture); 355 return cap_; 356 } name()357 const std::string* name() { 358 ABSL_DCHECK_EQ(op_, kRegexpCapture); 359 return name_; 360 } runes()361 Rune* runes() { 362 ABSL_DCHECK_EQ(op_, kRegexpLiteralString); 363 return runes_; 364 } nrunes()365 int nrunes() { 366 ABSL_DCHECK_EQ(op_, kRegexpLiteralString); 367 return nrunes_; 368 } match_id()369 int match_id() { 370 ABSL_DCHECK_EQ(op_, kRegexpHaveMatch); 371 return match_id_; 372 } 373 374 // Increments reference count, returns object as convenience. 375 Regexp* Incref(); 376 377 // Decrements reference count and deletes this object if count reaches 0. 378 void Decref(); 379 380 // Parses string s to produce regular expression, returned. 381 // Caller must release return value with re->Decref(). 382 // On failure, sets *status (if status != NULL) and returns NULL. 383 static Regexp* Parse(absl::string_view s, ParseFlags flags, 384 RegexpStatus* status); 385 386 // Returns a _new_ simplified version of the current regexp. 387 // Does not edit the current regexp. 388 // Caller must release return value with re->Decref(). 389 // Simplified means that counted repetition has been rewritten 390 // into simpler terms and all Perl/POSIX features have been 391 // removed. The result will capture exactly the same 392 // subexpressions the original did, unless formatted with ToString. 393 Regexp* Simplify(); 394 friend class CoalesceWalker; 395 friend class SimplifyWalker; 396 397 // Parses the regexp src and then simplifies it and sets *dst to the 398 // string representation of the simplified form. Returns true on success. 399 // Returns false and sets *status (if status != NULL) on parse error. 400 static bool SimplifyRegexp(absl::string_view src, ParseFlags flags, 401 std::string* dst, RegexpStatus* status); 402 403 // Returns the number of capturing groups in the regexp. 404 int NumCaptures(); 405 friend class NumCapturesWalker; 406 407 // Returns a map from names to capturing group indices, 408 // or NULL if the regexp contains no named capture groups. 409 // The caller is responsible for deleting the map. 410 std::map<std::string, int>* NamedCaptures(); 411 412 // Returns a map from capturing group indices to capturing group 413 // names or NULL if the regexp contains no named capture groups. The 414 // caller is responsible for deleting the map. 415 std::map<int, std::string>* CaptureNames(); 416 417 // Returns a string representation of the current regexp, 418 // using as few parentheses as possible. 419 std::string ToString(); 420 421 // Convenience functions. They consume the passed reference, 422 // so in many cases you should use, e.g., Plus(re->Incref(), flags). 423 // They do not consume allocated arrays like subs or runes. 424 static Regexp* Plus(Regexp* sub, ParseFlags flags); 425 static Regexp* Star(Regexp* sub, ParseFlags flags); 426 static Regexp* Quest(Regexp* sub, ParseFlags flags); 427 static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags); 428 static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags); 429 static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap); 430 static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max); 431 static Regexp* NewLiteral(Rune rune, ParseFlags flags); 432 static Regexp* NewCharClass(CharClass* cc, ParseFlags flags); 433 static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags); 434 static Regexp* HaveMatch(int match_id, ParseFlags flags); 435 436 // Like Alternate but does not factor out common prefixes. 437 static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags); 438 439 // Debugging function. Returns string format for regexp 440 // that makes structure clear. Does NOT use regexp syntax. 441 std::string Dump(); 442 443 // Helper traversal class, defined fully in walker-inl.h. 444 template<typename T> class Walker; 445 446 // Compile to Prog. See prog.h 447 // Reverse prog expects to be run over text backward. 448 // Construction and execution of prog will 449 // stay within approximately max_mem bytes of memory. 450 // If max_mem <= 0, a reasonable default is used. 451 Prog* CompileToProg(int64_t max_mem); 452 Prog* CompileToReverseProg(int64_t max_mem); 453 454 // Whether to expect this library to find exactly the same answer as PCRE 455 // when running this regexp. Most regexps do mimic PCRE exactly, but a few 456 // obscure cases behave differently. Technically this is more a property 457 // of the Prog than the Regexp, but the computation is much easier to do 458 // on the Regexp. See mimics_pcre.cc for the exact conditions. 459 bool MimicsPCRE(); 460 461 // Benchmarking function. 462 void NullWalk(); 463 464 // Whether every match of this regexp must be anchored and 465 // begin with a non-empty fixed string (perhaps after ASCII 466 // case-folding). If so, returns the prefix and the sub-regexp that 467 // follows it. 468 // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" 469 // regardless of the return value. 470 bool RequiredPrefix(std::string* prefix, bool* foldcase, 471 Regexp** suffix); 472 473 // Whether every match of this regexp must be unanchored and 474 // begin with a non-empty fixed string (perhaps after ASCII 475 // case-folding). If so, returns the prefix. 476 // Callers should expect *prefix and *foldcase to be "zeroed" 477 // regardless of the return value. 478 bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); 479 480 // Controls the maximum repeat count permitted by the parser. 481 // FOR FUZZING ONLY. 482 static void FUZZING_ONLY_set_maximum_repeat_count(int i); 483 484 private: 485 // Constructor allocates vectors as appropriate for operator. 486 explicit Regexp(RegexpOp op, ParseFlags parse_flags); 487 488 // Use Decref() instead of delete to release Regexps. 489 // This is private to catch deletes at compile time. 490 ~Regexp(); 491 void Destroy(); 492 bool QuickDestroy(); 493 494 // Helpers for Parse. Listed here so they can edit Regexps. 495 class ParseState; 496 497 friend class ParseState; 498 friend bool ParseCharClass(absl::string_view* s, Regexp** out_re, 499 RegexpStatus* status); 500 501 // Helper for testing [sic]. 502 friend bool RegexpEqualTestingOnly(Regexp*, Regexp*); 503 504 // Computes whether Regexp is already simple. 505 bool ComputeSimple(); 506 507 // Constructor that generates a Star, Plus or Quest, 508 // squashing the pair if sub is also a Star, Plus or Quest. 509 static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags); 510 511 // Constructor that generates a concatenation or alternation, 512 // enforcing the limit on the number of subexpressions for 513 // a particular Regexp. 514 static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs, 515 ParseFlags flags, bool can_factor); 516 517 // Returns the leading string that re starts with. 518 // The returned Rune* points into a piece of re, 519 // so it must not be used after the caller calls re->Decref(). 520 static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags); 521 522 // Removes the first n leading runes from the beginning of re. 523 // Edits re in place. 524 static void RemoveLeadingString(Regexp* re, int n); 525 526 // Returns the leading regexp in re's top-level concatenation. 527 // The returned Regexp* points at re or a sub-expression of re, 528 // so it must not be used after the caller calls re->Decref(). 529 static Regexp* LeadingRegexp(Regexp* re); 530 531 // Removes LeadingRegexp(re) from re and returns the remainder. 532 // Might edit re in place. 533 static Regexp* RemoveLeadingRegexp(Regexp* re); 534 535 // Simplifies an alternation of literal strings by factoring out 536 // common prefixes. 537 static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); 538 friend class FactorAlternationImpl; 539 540 // Is a == b? Only efficient on regexps that have not been through 541 // Simplify yet - the expansion of a kRegexpRepeat will make this 542 // take a long time. Do not call on such regexps, hence private. 543 static bool Equal(Regexp* a, Regexp* b); 544 545 // Allocate space for n sub-regexps. AllocSub(int n)546 void AllocSub(int n) { 547 ABSL_DCHECK(n >= 0 && static_cast<uint16_t>(n) == n); 548 if (n > 1) 549 submany_ = new Regexp*[n]; 550 nsub_ = static_cast<uint16_t>(n); 551 } 552 553 // Add Rune to LiteralString 554 void AddRuneToString(Rune r); 555 556 // Swaps this with that, in place. 557 void Swap(Regexp *that); 558 559 // Operator. See description of operators above. 560 // uint8_t instead of RegexpOp to control space usage. 561 uint8_t op_; 562 563 // Is this regexp structure already simple 564 // (has it been returned by Simplify)? 565 // uint8_t instead of bool to control space usage. 566 uint8_t simple_; 567 568 // Flags saved from parsing and used during execution. 569 // (Only FoldCase is used.) 570 // uint16_t instead of ParseFlags to control space usage. 571 uint16_t parse_flags_; 572 573 // Reference count. Exists so that SimplifyRegexp can build 574 // regexp structures that are dags rather than trees to avoid 575 // exponential blowup in space requirements. 576 // uint16_t to control space usage. 577 // The standard regexp routines will never generate a 578 // ref greater than the maximum repeat count (kMaxRepeat), 579 // but even so, Incref and Decref consult an overflow map 580 // when ref_ reaches kMaxRef. 581 uint16_t ref_; 582 static const uint16_t kMaxRef = 0xffff; 583 584 // Subexpressions. 585 // uint16_t to control space usage. 586 // Concat and Alternate handle larger numbers of subexpressions 587 // by building concatenation or alternation trees. 588 // Other routines should call Concat or Alternate instead of 589 // filling in sub() by hand. 590 uint16_t nsub_; 591 static const uint16_t kMaxNsub = 0xffff; 592 union { 593 Regexp** submany_; // if nsub_ > 1 594 Regexp* subone_; // if nsub_ == 1 595 }; 596 597 // Extra space for parse and teardown stacks. 598 Regexp* down_; 599 600 // Arguments to operator. See description of operators above. 601 union { 602 struct { // Repeat 603 int max_; 604 int min_; 605 }; 606 struct { // Capture 607 int cap_; 608 std::string* name_; 609 }; 610 struct { // LiteralString 611 int nrunes_; 612 Rune* runes_; 613 }; 614 struct { // CharClass 615 // These two could be in separate union members, 616 // but it wouldn't save any space (there are other two-word structs) 617 // and keeping them separate avoids confusion during parsing. 618 CharClass* cc_; 619 CharClassBuilder* ccb_; 620 }; 621 Rune rune_; // Literal 622 int match_id_; // HaveMatch 623 void *the_union_[2]; // as big as any other element, for memset 624 }; 625 626 Regexp(const Regexp&) = delete; 627 Regexp& operator=(const Regexp&) = delete; 628 }; 629 630 // Character class set: contains non-overlapping, non-abutting RuneRanges. 631 typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet; 632 633 class CharClassBuilder { 634 public: 635 CharClassBuilder(); 636 637 typedef RuneRangeSet::iterator iterator; begin()638 iterator begin() { return ranges_.begin(); } end()639 iterator end() { return ranges_.end(); } 640 size()641 int size() { return nrunes_; } empty()642 bool empty() { return nrunes_ == 0; } full()643 bool full() { return nrunes_ == Runemax+1; } 644 645 bool Contains(Rune r); 646 bool FoldsASCII(); 647 bool AddRange(Rune lo, Rune hi); // returns whether class changed 648 CharClassBuilder* Copy(); 649 void AddCharClass(CharClassBuilder* cc); 650 void Negate(); 651 void RemoveAbove(Rune r); 652 CharClass* GetCharClass(); 653 void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); 654 655 private: 656 static const uint32_t AlphaMask = (1<<26) - 1; 657 uint32_t upper_; // bitmap of A-Z 658 uint32_t lower_; // bitmap of a-z 659 int nrunes_; 660 RuneRangeSet ranges_; 661 662 CharClassBuilder(const CharClassBuilder&) = delete; 663 CharClassBuilder& operator=(const CharClassBuilder&) = delete; 664 }; 665 666 // Bitwise ops on ParseFlags produce ParseFlags. 667 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, 668 Regexp::ParseFlags b) { 669 return static_cast<Regexp::ParseFlags>( 670 static_cast<int>(a) | static_cast<int>(b)); 671 } 672 673 inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, 674 Regexp::ParseFlags b) { 675 return static_cast<Regexp::ParseFlags>( 676 static_cast<int>(a) ^ static_cast<int>(b)); 677 } 678 679 inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, 680 Regexp::ParseFlags b) { 681 return static_cast<Regexp::ParseFlags>( 682 static_cast<int>(a) & static_cast<int>(b)); 683 } 684 685 inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { 686 // Attempting to produce a value out of enum's range has undefined behaviour. 687 return static_cast<Regexp::ParseFlags>( 688 ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags)); 689 } 690 691 } // namespace re2 692 693 #endif // RE2_REGEXP_H_ 694