Lines Matching +full:escape +full:- +full:string +full:- +full:regexp
2 // Use of this source code is governed by a BSD-style
8 // --- SPONSORED LINK --------------------------------------------------
12 // This header describes the low-level interface used to implement RE2
13 // and may change in backwards-incompatible ways from time to time.
15 // ---------------------------------------------------------------------
20 // Any operation that traverses the Regexp structures should be written
21 // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
30 // Regexp::Parse parses regular expressions encoded in UTF-8.
35 // (Supporting them precludes the use of DFA-based
42 // Regexp::Parse. In particular, many of the basic Perl additions
45 // If parsed with the flag Regexp::Latin1, both the regular expression
47 // Latin-1, not UTF-8.
51 // Once Regexp has parsed a regular expression, it provides methods
56 // To call a sublibrary, Regexp does not simply prepare a
57 // string version of the regular expression and hand it to the
58 // sublibrary. Instead, Regexp prepares, from its own parsed form, the
64 // to be that used by Regexp's parser, not the syntax expected
65 // by the sublibrary. Regexp might accept a restricted or
67 // the sublibrary. As long as Regexp can translate from its
76 // in the past. Security-team requires sandboxing of sublibrary
85 // Unlike other regular expression libraries, Regexp makes its parsed
92 #include <string>
101 // Keep in sync with string list kOpcodeNames[] in testing/dump.cc
106 // Matches empty string.
115 // Matches concatenation of sub_[0..nsub-1].
117 // Matches union of sub_[0..nsub-1].
128 // max_ == -1 means no upper limit.
141 // Matches empty string at beginning of line.
143 // Matches empty string at end of line.
148 // Matches not-a-word boundary "\B".
151 // Matches empty string at beginning of text.
153 // Matches empty string at end of text.
166 // Keep in sync with string list in regexp.cc
175 kRegexpBadEscape, // bad escape sequence
180 kRegexpTrailingBackslash, // at end of regexp
185 kRegexpBadUTF8, // invalid UTF-8 in regexp
197 void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } in set_tmp()
207 static string CodeText(RegexpStatusCode code);
210 // "Bad character class: [z-a]"
211 string Text() const;
215 StringPiece error_arg_; // Piece of regexp containing syntax error.
216 string* tmp_; // Temporary storage, possibly where error_arg_ is.
232 // Less-than on RuneRanges treats a == b if they overlap at all.
274 class Regexp {
280 FoldCase = 1<<0, // Fold case during matching (case-insensitive).
281 Literal = 1<<1, // Treat s as literal string instead of a regexp.
282 ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
289 Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
290 NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
294 // non-capturing parens - (?: )
295 // non-greedy operators - *? +? ?? {}?
296 // flag edits - (?i) (?-i) (?i: )
297 // i - FoldCase
298 // m - !OneLine
299 // s - DotNL
300 // U - NonGreedy
307 NeverNL = 1<<11, // Never match NL, even if the regexp mentions
309 NeverCapture = 1<<12, // Parse all parens as non-capturing.
316 WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
317 AllParseFlags = (1<<14)-1,
327 Regexp** sub() { in sub()
339 const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; } in name()
345 Regexp* Incref();
350 // Parses string s to produce regular expression, returned.
351 // Caller must release return value with re->Decref().
353 static Regexp* Parse(const StringPiece& s, ParseFlags flags,
356 // Returns a _new_ simplified version of the current regexp.
357 // Does not edit the current regexp.
358 // Caller must release return value with re->Decref().
363 Regexp* Simplify();
367 // Parses the regexp src and then simplifies it and sets *dst to the
368 // string representation of the simplified form. Returns true on success.
371 string* dst,
374 // Returns the number of capturing groups in the regexp.
379 // or NULL if the regexp contains no named capture groups.
381 std::map<string, int>* NamedCaptures();
384 // names or NULL if the regexp contains no named capture groups. The
386 std::map<int, string>* CaptureNames();
388 // Returns a string representation of the current regexp,
390 string ToString();
393 // so in many cases you should use, e.g., Plus(re->Incref(), flags).
395 static Regexp* Plus(Regexp* sub, ParseFlags flags);
396 static Regexp* Star(Regexp* sub, ParseFlags flags);
397 static Regexp* Quest(Regexp* sub, ParseFlags flags);
398 static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
399 static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
400 static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
401 static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
402 static Regexp* NewLiteral(Rune rune, ParseFlags flags);
403 static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
404 static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
405 static Regexp* HaveMatch(int match_id, ParseFlags flags);
408 static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
410 // Debugging function. Returns string format for regexp
411 // that makes structure clear. Does NOT use regexp syntax.
412 string Dump();
414 // Helper traversal class, defined fully in walker-inl.h.
426 // when running this regexp. Most regexps do mimic PCRE exactly, but a few
428 // of the Prog than the Regexp, but the computation is much easier to do
429 // on the Regexp. See mimics_pcre.cc for the exact conditions.
435 // Whether every match of this regexp must be anchored and
436 // begin with a non-empty fixed string (perhaps after ASCII
437 // case-folding). If so, returns the prefix and the sub-regexp that
441 bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
445 explicit Regexp(RegexpOp op, ParseFlags parse_flags);
449 ~Regexp();
457 friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
461 friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
463 // Computes whether Regexp is already simple.
468 static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
472 // a particular Regexp.
473 static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
476 // Returns the leading string that re starts with.
478 // so it must not be used after the caller calls re->Decref().
479 static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
483 static void RemoveLeadingString(Regexp* re, int n);
485 // Returns the leading regexp in re's top-level concatenation.
486 // The returned Regexp* points at re or a sub-expression of re,
487 // so it must not be used after the caller calls re->Decref().
488 static Regexp* LeadingRegexp(Regexp* re);
492 static Regexp* RemoveLeadingRegexp(Regexp* re);
496 static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
500 // Simplify yet - the expansion of a kRegexpRepeat will make this
502 static bool Equal(Regexp* a, Regexp* b);
504 // Allocate space for n sub-regexps.
508 submany_ = new Regexp*[n]; in AllocSub()
516 void Swap(Regexp *that);
522 // Is this regexp structure already simple
533 // regexp structures that are dags rather than trees to avoid
536 // The standard regexp routines will never generate a
552 Regexp** submany_; // if nsub_ > 1
553 Regexp* subone_; // if nsub_ == 1
557 Regexp* down_;
567 string* name_;
575 // but it wouldn't save any space (there are other two-word structs)
585 Regexp(const Regexp&) = delete;
586 Regexp& operator=(const Regexp&) = delete;
589 // Character class set: contains non-overlapping, non-abutting RuneRanges.
612 void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
615 static const uint32_t AlphaMask = (1<<26) - 1;
616 uint32_t upper_; // bitmap of A-Z
617 uint32_t lower_; // bitmap of a-z
626 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
627 Regexp::ParseFlags b) {
628 return static_cast<Regexp::ParseFlags>(
632 inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
633 Regexp::ParseFlags b) {
634 return static_cast<Regexp::ParseFlags>(
638 inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
639 Regexp::ParseFlags b) {
640 return static_cast<Regexp::ParseFlags>(
644 inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
646 return static_cast<Regexp::ParseFlags>(
647 ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));