• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2 Defines an abstract syntax for regular expressions.
3 */
4 
5 use std::cmp::Ordering;
6 use std::error;
7 use std::fmt;
8 
9 pub use crate::ast::visitor::{visit, Visitor};
10 
11 pub mod parse;
12 pub mod print;
13 mod visitor;
14 
15 /// An error that occurred while parsing a regular expression into an abstract
16 /// syntax tree.
17 ///
18 /// Note that not all ASTs represents a valid regular expression. For example,
19 /// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20 /// valid Unicode property name. That particular error is reported when
21 /// translating an AST to the high-level intermediate representation (`HIR`).
22 #[derive(Clone, Debug, Eq, PartialEq)]
23 pub struct Error {
24     /// The kind of error.
25     kind: ErrorKind,
26     /// The original pattern that the parser generated the error from. Every
27     /// span in an error is a valid range into this string.
28     pattern: String,
29     /// The span of this error.
30     span: Span,
31 }
32 
33 impl Error {
34     /// Return the type of this error.
kind(&self) -> &ErrorKind35     pub fn kind(&self) -> &ErrorKind {
36         &self.kind
37     }
38 
39     /// The original pattern string in which this error occurred.
40     ///
41     /// Every span reported by this error is reported in terms of this string.
pattern(&self) -> &str42     pub fn pattern(&self) -> &str {
43         &self.pattern
44     }
45 
46     /// Return the span at which this error occurred.
span(&self) -> &Span47     pub fn span(&self) -> &Span {
48         &self.span
49     }
50 
51     /// Return an auxiliary span. This span exists only for some errors that
52     /// benefit from being able to point to two locations in the original
53     /// regular expression. For example, "duplicate" errors will have the
54     /// main error position set to the duplicate occurrence while its
55     /// auxiliary span will be set to the initial occurrence.
auxiliary_span(&self) -> Option<&Span>56     pub fn auxiliary_span(&self) -> Option<&Span> {
57         use self::ErrorKind::*;
58         match self.kind {
59             FlagDuplicate { ref original } => Some(original),
60             FlagRepeatedNegation { ref original, .. } => Some(original),
61             GroupNameDuplicate { ref original, .. } => Some(original),
62             _ => None,
63         }
64     }
65 }
66 
67 /// The type of an error that occurred while building an AST.
68 #[derive(Clone, Debug, Eq, PartialEq)]
69 pub enum ErrorKind {
70     /// The capturing group limit was exceeded.
71     ///
72     /// Note that this represents a limit on the total number of capturing
73     /// groups in a regex and not necessarily the number of nested capturing
74     /// groups. That is, the nest limit can be low and it is still possible for
75     /// this error to occur.
76     CaptureLimitExceeded,
77     /// An invalid escape sequence was found in a character class set.
78     ClassEscapeInvalid,
79     /// An invalid character class range was found. An invalid range is any
80     /// range where the start is greater than the end.
81     ClassRangeInvalid,
82     /// An invalid range boundary was found in a character class. Range
83     /// boundaries must be a single literal codepoint, but this error indicates
84     /// that something else was found, such as a nested class.
85     ClassRangeLiteral,
86     /// An opening `[` was found with no corresponding closing `]`.
87     ClassUnclosed,
88     /// Note that this error variant is no longer used. Namely, a decimal
89     /// number can only appear as a repetition quantifier. When the number
90     /// in a repetition quantifier is empty, then it gets its own specialized
91     /// error, `RepetitionCountDecimalEmpty`.
92     DecimalEmpty,
93     /// An invalid decimal number was given where one was expected.
94     DecimalInvalid,
95     /// A bracketed hex literal was empty.
96     EscapeHexEmpty,
97     /// A bracketed hex literal did not correspond to a Unicode scalar value.
98     EscapeHexInvalid,
99     /// An invalid hexadecimal digit was found.
100     EscapeHexInvalidDigit,
101     /// EOF was found before an escape sequence was completed.
102     EscapeUnexpectedEof,
103     /// An unrecognized escape sequence.
104     EscapeUnrecognized,
105     /// A dangling negation was used when setting flags, e.g., `i-`.
106     FlagDanglingNegation,
107     /// A flag was used twice, e.g., `i-i`.
108     FlagDuplicate {
109         /// The position of the original flag. The error position
110         /// points to the duplicate flag.
111         original: Span,
112     },
113     /// The negation operator was used twice, e.g., `-i-s`.
114     FlagRepeatedNegation {
115         /// The position of the original negation operator. The error position
116         /// points to the duplicate negation operator.
117         original: Span,
118     },
119     /// Expected a flag but got EOF, e.g., `(?`.
120     FlagUnexpectedEof,
121     /// Unrecognized flag, e.g., `a`.
122     FlagUnrecognized,
123     /// A duplicate capture name was found.
124     GroupNameDuplicate {
125         /// The position of the initial occurrence of the capture name. The
126         /// error position itself points to the duplicate occurrence.
127         original: Span,
128     },
129     /// A capture group name is empty, e.g., `(?P<>abc)`.
130     GroupNameEmpty,
131     /// An invalid character was seen for a capture group name. This includes
132     /// errors where the first character is a digit (even though subsequent
133     /// characters are allowed to be digits).
134     GroupNameInvalid,
135     /// A closing `>` could not be found for a capture group name.
136     GroupNameUnexpectedEof,
137     /// An unclosed group, e.g., `(ab`.
138     ///
139     /// The span of this error corresponds to the unclosed parenthesis.
140     GroupUnclosed,
141     /// An unopened group, e.g., `ab)`.
142     GroupUnopened,
143     /// The nest limit was exceeded. The limit stored here is the limit
144     /// configured in the parser.
145     NestLimitExceeded(u32),
146     /// The range provided in a counted repetition operator is invalid. The
147     /// range is invalid if the start is greater than the end.
148     RepetitionCountInvalid,
149     /// An opening `{` was not followed by a valid decimal value.
150     /// For example, `x{}` or `x{]}` would fail.
151     RepetitionCountDecimalEmpty,
152     /// An opening `{` was found with no corresponding closing `}`.
153     RepetitionCountUnclosed,
154     /// A repetition operator was applied to a missing sub-expression. This
155     /// occurs, for example, in the regex consisting of just a `*` or even
156     /// `(?i)*`. It is, however, possible to create a repetition operating on
157     /// an empty sub-expression. For example, `()*` is still considered valid.
158     RepetitionMissing,
159     /// The Unicode class is not valid. This typically occurs when a `\p` is
160     /// followed by something other than a `{`.
161     UnicodeClassInvalid,
162     /// When octal support is disabled, this error is produced when an octal
163     /// escape is used. The octal escape is assumed to be an invocation of
164     /// a backreference, which is the common case.
165     UnsupportedBackreference,
166     /// When syntax similar to PCRE's look-around is used, this error is
167     /// returned. Some example syntaxes that are rejected include, but are
168     /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
169     /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
170     /// error is used to improve the user experience.
171     UnsupportedLookAround,
172     /// Hints that destructuring should not be exhaustive.
173     ///
174     /// This enum may grow additional variants, so this makes sure clients
175     /// don't count on exhaustive matching. (Otherwise, adding a new variant
176     /// could break existing code.)
177     #[doc(hidden)]
178     __Nonexhaustive,
179 }
180 
181 impl error::Error for Error {
182     // TODO: Remove this method entirely on the next breaking semver release.
183     #[allow(deprecated)]
description(&self) -> &str184     fn description(&self) -> &str {
185         use self::ErrorKind::*;
186         match self.kind {
187             CaptureLimitExceeded => "capture group limit exceeded",
188             ClassEscapeInvalid => "invalid escape sequence in character class",
189             ClassRangeInvalid => "invalid character class range",
190             ClassRangeLiteral => "invalid range boundary, must be a literal",
191             ClassUnclosed => "unclosed character class",
192             DecimalEmpty => "empty decimal literal",
193             DecimalInvalid => "invalid decimal literal",
194             EscapeHexEmpty => "empty hexadecimal literal",
195             EscapeHexInvalid => "invalid hexadecimal literal",
196             EscapeHexInvalidDigit => "invalid hexadecimal digit",
197             EscapeUnexpectedEof => "unexpected eof (escape sequence)",
198             EscapeUnrecognized => "unrecognized escape sequence",
199             FlagDanglingNegation => "dangling flag negation operator",
200             FlagDuplicate { .. } => "duplicate flag",
201             FlagRepeatedNegation { .. } => "repeated negation",
202             FlagUnexpectedEof => "unexpected eof (flag)",
203             FlagUnrecognized => "unrecognized flag",
204             GroupNameDuplicate { .. } => "duplicate capture group name",
205             GroupNameEmpty => "empty capture group name",
206             GroupNameInvalid => "invalid capture group name",
207             GroupNameUnexpectedEof => "unclosed capture group name",
208             GroupUnclosed => "unclosed group",
209             GroupUnopened => "unopened group",
210             NestLimitExceeded(_) => "nest limit exceeded",
211             RepetitionCountInvalid => "invalid repetition count range",
212             RepetitionCountUnclosed => "unclosed counted repetition",
213             RepetitionMissing => "repetition operator missing expression",
214             UnicodeClassInvalid => "invalid Unicode character class",
215             UnsupportedBackreference => "backreferences are not supported",
216             UnsupportedLookAround => "look-around is not supported",
217             _ => unreachable!(),
218         }
219     }
220 }
221 
222 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result223     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224         crate::error::Formatter::from(self).fmt(f)
225     }
226 }
227 
228 impl fmt::Display for ErrorKind {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result229     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
230         use self::ErrorKind::*;
231         match *self {
232             CaptureLimitExceeded => write!(
233                 f,
234                 "exceeded the maximum number of \
235                  capturing groups ({})",
236                 ::std::u32::MAX
237             ),
238             ClassEscapeInvalid => {
239                 write!(f, "invalid escape sequence found in character class")
240             }
241             ClassRangeInvalid => write!(
242                 f,
243                 "invalid character class range, \
244                  the start must be <= the end"
245             ),
246             ClassRangeLiteral => {
247                 write!(f, "invalid range boundary, must be a literal")
248             }
249             ClassUnclosed => write!(f, "unclosed character class"),
250             DecimalEmpty => write!(f, "decimal literal empty"),
251             DecimalInvalid => write!(f, "decimal literal invalid"),
252             EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
253             EscapeHexInvalid => {
254                 write!(f, "hexadecimal literal is not a Unicode scalar value")
255             }
256             EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
257             EscapeUnexpectedEof => write!(
258                 f,
259                 "incomplete escape sequence, \
260                  reached end of pattern prematurely"
261             ),
262             EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
263             FlagDanglingNegation => {
264                 write!(f, "dangling flag negation operator")
265             }
266             FlagDuplicate { .. } => write!(f, "duplicate flag"),
267             FlagRepeatedNegation { .. } => {
268                 write!(f, "flag negation operator repeated")
269             }
270             FlagUnexpectedEof => {
271                 write!(f, "expected flag but got end of regex")
272             }
273             FlagUnrecognized => write!(f, "unrecognized flag"),
274             GroupNameDuplicate { .. } => {
275                 write!(f, "duplicate capture group name")
276             }
277             GroupNameEmpty => write!(f, "empty capture group name"),
278             GroupNameInvalid => write!(f, "invalid capture group character"),
279             GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
280             GroupUnclosed => write!(f, "unclosed group"),
281             GroupUnopened => write!(f, "unopened group"),
282             NestLimitExceeded(limit) => write!(
283                 f,
284                 "exceed the maximum number of \
285                  nested parentheses/brackets ({})",
286                 limit
287             ),
288             RepetitionCountInvalid => write!(
289                 f,
290                 "invalid repetition count range, \
291                  the start must be <= the end"
292             ),
293             RepetitionCountDecimalEmpty => {
294                 write!(f, "repetition quantifier expects a valid decimal")
295             }
296             RepetitionCountUnclosed => {
297                 write!(f, "unclosed counted repetition")
298             }
299             RepetitionMissing => {
300                 write!(f, "repetition operator missing expression")
301             }
302             UnicodeClassInvalid => {
303                 write!(f, "invalid Unicode character class")
304             }
305             UnsupportedBackreference => {
306                 write!(f, "backreferences are not supported")
307             }
308             UnsupportedLookAround => write!(
309                 f,
310                 "look-around, including look-ahead and look-behind, \
311                  is not supported"
312             ),
313             _ => unreachable!(),
314         }
315     }
316 }
317 
318 /// Span represents the position information of a single AST item.
319 ///
320 /// All span positions are absolute byte offsets that can be used on the
321 /// original regular expression that was parsed.
322 #[derive(Clone, Copy, Eq, PartialEq)]
323 pub struct Span {
324     /// The start byte offset.
325     pub start: Position,
326     /// The end byte offset.
327     pub end: Position,
328 }
329 
330 impl fmt::Debug for Span {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result331     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332         write!(f, "Span({:?}, {:?})", self.start, self.end)
333     }
334 }
335 
336 impl Ord for Span {
cmp(&self, other: &Span) -> Ordering337     fn cmp(&self, other: &Span) -> Ordering {
338         (&self.start, &self.end).cmp(&(&other.start, &other.end))
339     }
340 }
341 
342 impl PartialOrd for Span {
partial_cmp(&self, other: &Span) -> Option<Ordering>343     fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
344         Some(self.cmp(other))
345     }
346 }
347 
348 /// A single position in a regular expression.
349 ///
350 /// A position encodes one half of a span, and include the byte offset, line
351 /// number and column number.
352 #[derive(Clone, Copy, Eq, PartialEq)]
353 pub struct Position {
354     /// The absolute offset of this position, starting at `0` from the
355     /// beginning of the regular expression pattern string.
356     pub offset: usize,
357     /// The line number, starting at `1`.
358     pub line: usize,
359     /// The approximate column number, starting at `1`.
360     pub column: usize,
361 }
362 
363 impl fmt::Debug for Position {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result364     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
365         write!(
366             f,
367             "Position(o: {:?}, l: {:?}, c: {:?})",
368             self.offset, self.line, self.column
369         )
370     }
371 }
372 
373 impl Ord for Position {
cmp(&self, other: &Position) -> Ordering374     fn cmp(&self, other: &Position) -> Ordering {
375         self.offset.cmp(&other.offset)
376     }
377 }
378 
379 impl PartialOrd for Position {
partial_cmp(&self, other: &Position) -> Option<Ordering>380     fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
381         Some(self.cmp(other))
382     }
383 }
384 
385 impl Span {
386     /// Create a new span with the given positions.
new(start: Position, end: Position) -> Span387     pub fn new(start: Position, end: Position) -> Span {
388         Span { start, end }
389     }
390 
391     /// Create a new span using the given position as the start and end.
splat(pos: Position) -> Span392     pub fn splat(pos: Position) -> Span {
393         Span::new(pos, pos)
394     }
395 
396     /// Create a new span by replacing the starting the position with the one
397     /// given.
with_start(self, pos: Position) -> Span398     pub fn with_start(self, pos: Position) -> Span {
399         Span { start: pos, ..self }
400     }
401 
402     /// Create a new span by replacing the ending the position with the one
403     /// given.
with_end(self, pos: Position) -> Span404     pub fn with_end(self, pos: Position) -> Span {
405         Span { end: pos, ..self }
406     }
407 
408     /// Returns true if and only if this span occurs on a single line.
is_one_line(&self) -> bool409     pub fn is_one_line(&self) -> bool {
410         self.start.line == self.end.line
411     }
412 
413     /// Returns true if and only if this span is empty. That is, it points to
414     /// a single position in the concrete syntax of a regular expression.
is_empty(&self) -> bool415     pub fn is_empty(&self) -> bool {
416         self.start.offset == self.end.offset
417     }
418 }
419 
420 impl Position {
421     /// Create a new position with the given information.
422     ///
423     /// `offset` is the absolute offset of the position, starting at `0` from
424     /// the beginning of the regular expression pattern string.
425     ///
426     /// `line` is the line number, starting at `1`.
427     ///
428     /// `column` is the approximate column number, starting at `1`.
new(offset: usize, line: usize, column: usize) -> Position429     pub fn new(offset: usize, line: usize, column: usize) -> Position {
430         Position { offset, line, column }
431     }
432 }
433 
434 /// An abstract syntax tree for a singular expression along with comments
435 /// found.
436 ///
437 /// Comments are not stored in the tree itself to avoid complexity. Each
438 /// comment contains a span of precisely where it occurred in the original
439 /// regular expression.
440 #[derive(Clone, Debug, Eq, PartialEq)]
441 pub struct WithComments {
442     /// The actual ast.
443     pub ast: Ast,
444     /// All comments found in the original regular expression.
445     pub comments: Vec<Comment>,
446 }
447 
448 /// A comment from a regular expression with an associated span.
449 ///
450 /// A regular expression can only contain comments when the `x` flag is
451 /// enabled.
452 #[derive(Clone, Debug, Eq, PartialEq)]
453 pub struct Comment {
454     /// The span of this comment, including the beginning `#` and ending `\n`.
455     pub span: Span,
456     /// The comment text, starting with the first character following the `#`
457     /// and ending with the last character preceding the `\n`.
458     pub comment: String,
459 }
460 
461 /// An abstract syntax tree for a single regular expression.
462 ///
463 /// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
464 /// space proportional to the size of the `Ast`.
465 ///
466 /// This type defines its own destructor that uses constant stack space and
467 /// heap space proportional to the size of the `Ast`.
468 #[derive(Clone, Debug, Eq, PartialEq)]
469 pub enum Ast {
470     /// An empty regex that matches everything.
471     Empty(Span),
472     /// A set of flags, e.g., `(?is)`.
473     Flags(SetFlags),
474     /// A single character literal, which includes escape sequences.
475     Literal(Literal),
476     /// The "any character" class.
477     Dot(Span),
478     /// A single zero-width assertion.
479     Assertion(Assertion),
480     /// A single character class. This includes all forms of character classes
481     /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
482     Class(Class),
483     /// A repetition operator applied to an arbitrary regular expression.
484     Repetition(Repetition),
485     /// A grouped regular expression.
486     Group(Group),
487     /// An alternation of regular expressions.
488     Alternation(Alternation),
489     /// A concatenation of regular expressions.
490     Concat(Concat),
491 }
492 
493 impl Ast {
494     /// Return the span of this abstract syntax tree.
span(&self) -> &Span495     pub fn span(&self) -> &Span {
496         match *self {
497             Ast::Empty(ref span) => span,
498             Ast::Flags(ref x) => &x.span,
499             Ast::Literal(ref x) => &x.span,
500             Ast::Dot(ref span) => span,
501             Ast::Assertion(ref x) => &x.span,
502             Ast::Class(ref x) => x.span(),
503             Ast::Repetition(ref x) => &x.span,
504             Ast::Group(ref x) => &x.span,
505             Ast::Alternation(ref x) => &x.span,
506             Ast::Concat(ref x) => &x.span,
507         }
508     }
509 
510     /// Return true if and only if this Ast is empty.
is_empty(&self) -> bool511     pub fn is_empty(&self) -> bool {
512         match *self {
513             Ast::Empty(_) => true,
514             _ => false,
515         }
516     }
517 
518     /// Returns true if and only if this AST has any (including possibly empty)
519     /// subexpressions.
has_subexprs(&self) -> bool520     fn has_subexprs(&self) -> bool {
521         match *self {
522             Ast::Empty(_)
523             | Ast::Flags(_)
524             | Ast::Literal(_)
525             | Ast::Dot(_)
526             | Ast::Assertion(_) => false,
527             Ast::Class(_)
528             | Ast::Repetition(_)
529             | Ast::Group(_)
530             | Ast::Alternation(_)
531             | Ast::Concat(_) => true,
532         }
533     }
534 }
535 
536 /// Print a display representation of this Ast.
537 ///
538 /// This does not preserve any of the original whitespace formatting that may
539 /// have originally been present in the concrete syntax from which this Ast
540 /// was generated.
541 ///
542 /// This implementation uses constant stack space and heap space proportional
543 /// to the size of the `Ast`.
544 impl fmt::Display for Ast {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result545     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
546         use crate::ast::print::Printer;
547         Printer::new().print(self, f)
548     }
549 }
550 
551 /// An alternation of regular expressions.
552 #[derive(Clone, Debug, Eq, PartialEq)]
553 pub struct Alternation {
554     /// The span of this alternation.
555     pub span: Span,
556     /// The alternate regular expressions.
557     pub asts: Vec<Ast>,
558 }
559 
560 impl Alternation {
561     /// Return this alternation as an AST.
562     ///
563     /// If this alternation contains zero ASTs, then Ast::Empty is
564     /// returned. If this alternation contains exactly 1 AST, then the
565     /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
into_ast(mut self) -> Ast566     pub fn into_ast(mut self) -> Ast {
567         match self.asts.len() {
568             0 => Ast::Empty(self.span),
569             1 => self.asts.pop().unwrap(),
570             _ => Ast::Alternation(self),
571         }
572     }
573 }
574 
575 /// A concatenation of regular expressions.
576 #[derive(Clone, Debug, Eq, PartialEq)]
577 pub struct Concat {
578     /// The span of this concatenation.
579     pub span: Span,
580     /// The concatenation regular expressions.
581     pub asts: Vec<Ast>,
582 }
583 
584 impl Concat {
585     /// Return this concatenation as an AST.
586     ///
587     /// If this concatenation contains zero ASTs, then Ast::Empty is
588     /// returned. If this concatenation contains exactly 1 AST, then the
589     /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
into_ast(mut self) -> Ast590     pub fn into_ast(mut self) -> Ast {
591         match self.asts.len() {
592             0 => Ast::Empty(self.span),
593             1 => self.asts.pop().unwrap(),
594             _ => Ast::Concat(self),
595         }
596     }
597 }
598 
599 /// A single literal expression.
600 ///
601 /// A literal corresponds to a single Unicode scalar value. Literals may be
602 /// represented in their literal form, e.g., `a` or in their escaped form,
603 /// e.g., `\x61`.
604 #[derive(Clone, Debug, Eq, PartialEq)]
605 pub struct Literal {
606     /// The span of this literal.
607     pub span: Span,
608     /// The kind of this literal.
609     pub kind: LiteralKind,
610     /// The Unicode scalar value corresponding to this literal.
611     pub c: char,
612 }
613 
614 impl Literal {
615     /// If this literal was written as a `\x` hex escape, then this returns
616     /// the corresponding byte value. Otherwise, this returns `None`.
byte(&self) -> Option<u8>617     pub fn byte(&self) -> Option<u8> {
618         let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
619         if self.c as u32 <= 255 && self.kind == short_hex {
620             Some(self.c as u8)
621         } else {
622             None
623         }
624     }
625 }
626 
627 /// The kind of a single literal expression.
628 #[derive(Clone, Debug, Eq, PartialEq)]
629 pub enum LiteralKind {
630     /// The literal is written verbatim, e.g., `a` or `☃`.
631     Verbatim,
632     /// The literal is written as an escape because it is punctuation, e.g.,
633     /// `\*` or `\[`.
634     Punctuation,
635     /// The literal is written as an octal escape, e.g., `\141`.
636     Octal,
637     /// The literal is written as a hex code with a fixed number of digits
638     /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
639     /// `\U00000061`.
640     HexFixed(HexLiteralKind),
641     /// The literal is written as a hex code with a bracketed number of
642     /// digits. The only restriction is that the bracketed hex code must refer
643     /// to a valid Unicode scalar value.
644     HexBrace(HexLiteralKind),
645     /// The literal is written as a specially recognized escape, e.g., `\f`
646     /// or `\n`.
647     Special(SpecialLiteralKind),
648 }
649 
650 /// The type of a special literal.
651 ///
652 /// A special literal is a special escape sequence recognized by the regex
653 /// parser, e.g., `\f` or `\n`.
654 #[derive(Clone, Debug, Eq, PartialEq)]
655 pub enum SpecialLiteralKind {
656     /// Bell, spelled `\a` (`\x07`).
657     Bell,
658     /// Form feed, spelled `\f` (`\x0C`).
659     FormFeed,
660     /// Tab, spelled `\t` (`\x09`).
661     Tab,
662     /// Line feed, spelled `\n` (`\x0A`).
663     LineFeed,
664     /// Carriage return, spelled `\r` (`\x0D`).
665     CarriageReturn,
666     /// Vertical tab, spelled `\v` (`\x0B`).
667     VerticalTab,
668     /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
669     /// parsing in verbose mode.
670     Space,
671 }
672 
673 /// The type of a Unicode hex literal.
674 ///
675 /// Note that all variants behave the same when used with brackets. They only
676 /// differ when used without brackets in the number of hex digits that must
677 /// follow.
678 #[derive(Clone, Debug, Eq, PartialEq)]
679 pub enum HexLiteralKind {
680     /// A `\x` prefix. When used without brackets, this form is limited to
681     /// two digits.
682     X,
683     /// A `\u` prefix. When used without brackets, this form is limited to
684     /// four digits.
685     UnicodeShort,
686     /// A `\U` prefix. When used without brackets, this form is limited to
687     /// eight digits.
688     UnicodeLong,
689 }
690 
691 impl HexLiteralKind {
692     /// The number of digits that must be used with this literal form when
693     /// used without brackets. When used with brackets, there is no
694     /// restriction on the number of digits.
digits(&self) -> u32695     pub fn digits(&self) -> u32 {
696         match *self {
697             HexLiteralKind::X => 2,
698             HexLiteralKind::UnicodeShort => 4,
699             HexLiteralKind::UnicodeLong => 8,
700         }
701     }
702 }
703 
704 /// A single character class expression.
705 #[derive(Clone, Debug, Eq, PartialEq)]
706 pub enum Class {
707     /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
708     Unicode(ClassUnicode),
709     /// A perl character class, e.g., `\d` or `\W`.
710     Perl(ClassPerl),
711     /// A bracketed character class set, which may contain zero or more
712     /// character ranges and/or zero or more nested classes. e.g.,
713     /// `[a-zA-Z\pL]`.
714     Bracketed(ClassBracketed),
715 }
716 
717 impl Class {
718     /// Return the span of this character class.
span(&self) -> &Span719     pub fn span(&self) -> &Span {
720         match *self {
721             Class::Perl(ref x) => &x.span,
722             Class::Unicode(ref x) => &x.span,
723             Class::Bracketed(ref x) => &x.span,
724         }
725     }
726 }
727 
728 /// A Perl character class.
729 #[derive(Clone, Debug, Eq, PartialEq)]
730 pub struct ClassPerl {
731     /// The span of this class.
732     pub span: Span,
733     /// The kind of Perl class.
734     pub kind: ClassPerlKind,
735     /// Whether the class is negated or not. e.g., `\d` is not negated but
736     /// `\D` is.
737     pub negated: bool,
738 }
739 
740 /// The available Perl character classes.
741 #[derive(Clone, Debug, Eq, PartialEq)]
742 pub enum ClassPerlKind {
743     /// Decimal numbers.
744     Digit,
745     /// Whitespace.
746     Space,
747     /// Word characters.
748     Word,
749 }
750 
751 /// An ASCII character class.
752 #[derive(Clone, Debug, Eq, PartialEq)]
753 pub struct ClassAscii {
754     /// The span of this class.
755     pub span: Span,
756     /// The kind of ASCII class.
757     pub kind: ClassAsciiKind,
758     /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
759     /// but `[[:^alpha:]]` is.
760     pub negated: bool,
761 }
762 
763 /// The available ASCII character classes.
764 #[derive(Clone, Debug, Eq, PartialEq)]
765 pub enum ClassAsciiKind {
766     /// `[0-9A-Za-z]`
767     Alnum,
768     /// `[A-Za-z]`
769     Alpha,
770     /// `[\x00-\x7F]`
771     Ascii,
772     /// `[ \t]`
773     Blank,
774     /// `[\x00-\x1F\x7F]`
775     Cntrl,
776     /// `[0-9]`
777     Digit,
778     /// `[!-~]`
779     Graph,
780     /// `[a-z]`
781     Lower,
782     /// `[ -~]`
783     Print,
784     /// `[!-/:-@\[-`{-~]`
785     Punct,
786     /// `[\t\n\v\f\r ]`
787     Space,
788     /// `[A-Z]`
789     Upper,
790     /// `[0-9A-Za-z_]`
791     Word,
792     /// `[0-9A-Fa-f]`
793     Xdigit,
794 }
795 
796 impl ClassAsciiKind {
797     /// Return the corresponding ClassAsciiKind variant for the given name.
798     ///
799     /// The name given should correspond to the lowercase version of the
800     /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
801     ///
802     /// If no variant with the corresponding name exists, then `None` is
803     /// returned.
from_name(name: &str) -> Option<ClassAsciiKind>804     pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
805         use self::ClassAsciiKind::*;
806         match name {
807             "alnum" => Some(Alnum),
808             "alpha" => Some(Alpha),
809             "ascii" => Some(Ascii),
810             "blank" => Some(Blank),
811             "cntrl" => Some(Cntrl),
812             "digit" => Some(Digit),
813             "graph" => Some(Graph),
814             "lower" => Some(Lower),
815             "print" => Some(Print),
816             "punct" => Some(Punct),
817             "space" => Some(Space),
818             "upper" => Some(Upper),
819             "word" => Some(Word),
820             "xdigit" => Some(Xdigit),
821             _ => None,
822         }
823     }
824 }
825 
826 /// A Unicode character class.
827 #[derive(Clone, Debug, Eq, PartialEq)]
828 pub struct ClassUnicode {
829     /// The span of this class.
830     pub span: Span,
831     /// Whether this class is negated or not.
832     ///
833     /// Note: be careful when using this attribute. This specifically refers
834     /// to whether the class is written as `\p` or `\P`, where the latter
835     /// is `negated = true`. However, it also possible to write something like
836     /// `\P{scx!=Katakana}` which is actually equivalent to
837     /// `\p{scx=Katakana}` and is therefore not actually negated even though
838     /// `negated = true` here. To test whether this class is truly negated
839     /// or not, use the `is_negated` method.
840     pub negated: bool,
841     /// The kind of Unicode class.
842     pub kind: ClassUnicodeKind,
843 }
844 
845 impl ClassUnicode {
846     /// Returns true if this class has been negated.
847     ///
848     /// Note that this takes the Unicode op into account, if it's present.
849     /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
is_negated(&self) -> bool850     pub fn is_negated(&self) -> bool {
851         match self.kind {
852             ClassUnicodeKind::NamedValue {
853                 op: ClassUnicodeOpKind::NotEqual,
854                 ..
855             } => !self.negated,
856             _ => self.negated,
857         }
858     }
859 }
860 
861 /// The available forms of Unicode character classes.
862 #[derive(Clone, Debug, Eq, PartialEq)]
863 pub enum ClassUnicodeKind {
864     /// A one letter abbreviated class, e.g., `\pN`.
865     OneLetter(char),
866     /// A binary property, general category or script. The string may be
867     /// empty.
868     Named(String),
869     /// A property name and an associated value.
870     NamedValue {
871         /// The type of Unicode op used to associate `name` with `value`.
872         op: ClassUnicodeOpKind,
873         /// The property name (which may be empty).
874         name: String,
875         /// The property value (which may be empty).
876         value: String,
877     },
878 }
879 
880 /// The type of op used in a Unicode character class.
881 #[derive(Clone, Debug, Eq, PartialEq)]
882 pub enum ClassUnicodeOpKind {
883     /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
884     Equal,
885     /// A property set to a specific value using a colon, e.g.,
886     /// `\p{scx:Katakana}`.
887     Colon,
888     /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
889     NotEqual,
890 }
891 
892 impl ClassUnicodeOpKind {
893     /// Whether the op is an equality op or not.
is_equal(&self) -> bool894     pub fn is_equal(&self) -> bool {
895         match *self {
896             ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
897             _ => false,
898         }
899     }
900 }
901 
902 /// A bracketed character class, e.g., `[a-z0-9]`.
903 #[derive(Clone, Debug, Eq, PartialEq)]
904 pub struct ClassBracketed {
905     /// The span of this class.
906     pub span: Span,
907     /// Whether this class is negated or not. e.g., `[a]` is not negated but
908     /// `[^a]` is.
909     pub negated: bool,
910     /// The type of this set. A set is either a normal union of things, e.g.,
911     /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
912     pub kind: ClassSet,
913 }
914 
915 /// A character class set.
916 ///
917 /// This type corresponds to the internal structure of a bracketed character
918 /// class. That is, every bracketed character is one of two types: a union of
919 /// items (literals, ranges, other bracketed classes) or a tree of binary set
920 /// operations.
921 #[derive(Clone, Debug, Eq, PartialEq)]
922 pub enum ClassSet {
923     /// An item, which can be a single literal, range, nested character class
924     /// or a union of items.
925     Item(ClassSetItem),
926     /// A single binary operation (i.e., &&, -- or ~~).
927     BinaryOp(ClassSetBinaryOp),
928 }
929 
930 impl ClassSet {
931     /// Build a set from a union.
union(ast: ClassSetUnion) -> ClassSet932     pub fn union(ast: ClassSetUnion) -> ClassSet {
933         ClassSet::Item(ClassSetItem::Union(ast))
934     }
935 
936     /// Return the span of this character class set.
span(&self) -> &Span937     pub fn span(&self) -> &Span {
938         match *self {
939             ClassSet::Item(ref x) => x.span(),
940             ClassSet::BinaryOp(ref x) => &x.span,
941         }
942     }
943 
944     /// Return true if and only if this class set is empty.
is_empty(&self) -> bool945     fn is_empty(&self) -> bool {
946         match *self {
947             ClassSet::Item(ClassSetItem::Empty(_)) => true,
948             _ => false,
949         }
950     }
951 }
952 
953 /// A single component of a character class set.
954 #[derive(Clone, Debug, Eq, PartialEq)]
955 pub enum ClassSetItem {
956     /// An empty item.
957     ///
958     /// Note that a bracketed character class cannot contain a single empty
959     /// item. Empty items can appear when using one of the binary operators.
960     /// For example, `[&&]` is the intersection of two empty classes.
961     Empty(Span),
962     /// A single literal.
963     Literal(Literal),
964     /// A range between two literals.
965     Range(ClassSetRange),
966     /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
967     Ascii(ClassAscii),
968     /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
969     Unicode(ClassUnicode),
970     /// A perl character class, e.g., `\d` or `\W`.
971     Perl(ClassPerl),
972     /// A bracketed character class set, which may contain zero or more
973     /// character ranges and/or zero or more nested classes. e.g.,
974     /// `[a-zA-Z\pL]`.
975     Bracketed(Box<ClassBracketed>),
976     /// A union of items.
977     Union(ClassSetUnion),
978 }
979 
980 impl ClassSetItem {
981     /// Return the span of this character class set item.
span(&self) -> &Span982     pub fn span(&self) -> &Span {
983         match *self {
984             ClassSetItem::Empty(ref span) => span,
985             ClassSetItem::Literal(ref x) => &x.span,
986             ClassSetItem::Range(ref x) => &x.span,
987             ClassSetItem::Ascii(ref x) => &x.span,
988             ClassSetItem::Perl(ref x) => &x.span,
989             ClassSetItem::Unicode(ref x) => &x.span,
990             ClassSetItem::Bracketed(ref x) => &x.span,
991             ClassSetItem::Union(ref x) => &x.span,
992         }
993     }
994 }
995 
996 /// A single character class range in a set.
997 #[derive(Clone, Debug, Eq, PartialEq)]
998 pub struct ClassSetRange {
999     /// The span of this range.
1000     pub span: Span,
1001     /// The start of this range.
1002     pub start: Literal,
1003     /// The end of this range.
1004     pub end: Literal,
1005 }
1006 
1007 impl ClassSetRange {
1008     /// Returns true if and only if this character class range is valid.
1009     ///
1010     /// The only case where a range is invalid is if its start is greater than
1011     /// its end.
is_valid(&self) -> bool1012     pub fn is_valid(&self) -> bool {
1013         self.start.c <= self.end.c
1014     }
1015 }
1016 
1017 /// A union of items inside a character class set.
1018 #[derive(Clone, Debug, Eq, PartialEq)]
1019 pub struct ClassSetUnion {
1020     /// The span of the items in this operation. e.g., the `a-z0-9` in
1021     /// `[^a-z0-9]`
1022     pub span: Span,
1023     /// The sequence of items that make up this union.
1024     pub items: Vec<ClassSetItem>,
1025 }
1026 
1027 impl ClassSetUnion {
1028     /// Push a new item in this union.
1029     ///
1030     /// The ending position of this union's span is updated to the ending
1031     /// position of the span of the item given. If the union is empty, then
1032     /// the starting position of this union is set to the starting position
1033     /// of this item.
1034     ///
1035     /// In other words, if you only use this method to add items to a union
1036     /// and you set the spans on each item correctly, then you should never
1037     /// need to adjust the span of the union directly.
push(&mut self, item: ClassSetItem)1038     pub fn push(&mut self, item: ClassSetItem) {
1039         if self.items.is_empty() {
1040             self.span.start = item.span().start;
1041         }
1042         self.span.end = item.span().end;
1043         self.items.push(item);
1044     }
1045 
1046     /// Return this union as a character class set item.
1047     ///
1048     /// If this union contains zero items, then an empty union is
1049     /// returned. If this concatenation contains exactly 1 item, then the
1050     /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1051     /// returned.
into_item(mut self) -> ClassSetItem1052     pub fn into_item(mut self) -> ClassSetItem {
1053         match self.items.len() {
1054             0 => ClassSetItem::Empty(self.span),
1055             1 => self.items.pop().unwrap(),
1056             _ => ClassSetItem::Union(self),
1057         }
1058     }
1059 }
1060 
1061 /// A Unicode character class set operation.
1062 #[derive(Clone, Debug, Eq, PartialEq)]
1063 pub struct ClassSetBinaryOp {
1064     /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1065     pub span: Span,
1066     /// The type of this set operation.
1067     pub kind: ClassSetBinaryOpKind,
1068     /// The left hand side of the operation.
1069     pub lhs: Box<ClassSet>,
1070     /// The right hand side of the operation.
1071     pub rhs: Box<ClassSet>,
1072 }
1073 
1074 /// The type of a Unicode character class set operation.
1075 ///
1076 /// Note that this doesn't explicitly represent union since there is no
1077 /// explicit union operator. Concatenation inside a character class corresponds
1078 /// to the union operation.
1079 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1080 pub enum ClassSetBinaryOpKind {
1081     /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1082     Intersection,
1083     /// The difference of two sets, e.g., `\pN--[0-9]`.
1084     Difference,
1085     /// The symmetric difference of two sets. The symmetric difference is the
1086     /// set of elements belonging to one but not both sets.
1087     /// e.g., `[\pL~~[:ascii:]]`.
1088     SymmetricDifference,
1089 }
1090 
1091 /// A single zero-width assertion.
1092 #[derive(Clone, Debug, Eq, PartialEq)]
1093 pub struct Assertion {
1094     /// The span of this assertion.
1095     pub span: Span,
1096     /// The assertion kind, e.g., `\b` or `^`.
1097     pub kind: AssertionKind,
1098 }
1099 
1100 /// An assertion kind.
1101 #[derive(Clone, Debug, Eq, PartialEq)]
1102 pub enum AssertionKind {
1103     /// `^`
1104     StartLine,
1105     /// `$`
1106     EndLine,
1107     /// `\A`
1108     StartText,
1109     /// `\z`
1110     EndText,
1111     /// `\b`
1112     WordBoundary,
1113     /// `\B`
1114     NotWordBoundary,
1115 }
1116 
1117 /// A repetition operation applied to a regular expression.
1118 #[derive(Clone, Debug, Eq, PartialEq)]
1119 pub struct Repetition {
1120     /// The span of this operation.
1121     pub span: Span,
1122     /// The actual operation.
1123     pub op: RepetitionOp,
1124     /// Whether this operation was applied greedily or not.
1125     pub greedy: bool,
1126     /// The regular expression under repetition.
1127     pub ast: Box<Ast>,
1128 }
1129 
1130 /// The repetition operator itself.
1131 #[derive(Clone, Debug, Eq, PartialEq)]
1132 pub struct RepetitionOp {
1133     /// The span of this operator. This includes things like `+`, `*?` and
1134     /// `{m,n}`.
1135     pub span: Span,
1136     /// The type of operation.
1137     pub kind: RepetitionKind,
1138 }
1139 
1140 /// The kind of a repetition operator.
1141 #[derive(Clone, Debug, Eq, PartialEq)]
1142 pub enum RepetitionKind {
1143     /// `?`
1144     ZeroOrOne,
1145     /// `*`
1146     ZeroOrMore,
1147     /// `+`
1148     OneOrMore,
1149     /// `{m,n}`
1150     Range(RepetitionRange),
1151 }
1152 
1153 /// A range repetition operator.
1154 #[derive(Clone, Debug, Eq, PartialEq)]
1155 pub enum RepetitionRange {
1156     /// `{m}`
1157     Exactly(u32),
1158     /// `{m,}`
1159     AtLeast(u32),
1160     /// `{m,n}`
1161     Bounded(u32, u32),
1162 }
1163 
1164 impl RepetitionRange {
1165     /// Returns true if and only if this repetition range is valid.
1166     ///
1167     /// The only case where a repetition range is invalid is if it is bounded
1168     /// and its start is greater than its end.
is_valid(&self) -> bool1169     pub fn is_valid(&self) -> bool {
1170         match *self {
1171             RepetitionRange::Bounded(s, e) if s > e => false,
1172             _ => true,
1173         }
1174     }
1175 }
1176 
1177 /// A grouped regular expression.
1178 ///
1179 /// This includes both capturing and non-capturing groups. This does **not**
1180 /// include flag-only groups like `(?is)`, but does contain any group that
1181 /// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1182 /// `(?is:a)`.
1183 #[derive(Clone, Debug, Eq, PartialEq)]
1184 pub struct Group {
1185     /// The span of this group.
1186     pub span: Span,
1187     /// The kind of this group.
1188     pub kind: GroupKind,
1189     /// The regular expression in this group.
1190     pub ast: Box<Ast>,
1191 }
1192 
1193 impl Group {
1194     /// If this group is non-capturing, then this returns the (possibly empty)
1195     /// set of flags. Otherwise, `None` is returned.
flags(&self) -> Option<&Flags>1196     pub fn flags(&self) -> Option<&Flags> {
1197         match self.kind {
1198             GroupKind::NonCapturing(ref flags) => Some(flags),
1199             _ => None,
1200         }
1201     }
1202 
1203     /// Returns true if and only if this group is capturing.
is_capturing(&self) -> bool1204     pub fn is_capturing(&self) -> bool {
1205         match self.kind {
1206             GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
1207             GroupKind::NonCapturing(_) => false,
1208         }
1209     }
1210 
1211     /// Returns the capture index of this group, if this is a capturing group.
1212     ///
1213     /// This returns a capture index precisely when `is_capturing` is `true`.
capture_index(&self) -> Option<u32>1214     pub fn capture_index(&self) -> Option<u32> {
1215         match self.kind {
1216             GroupKind::CaptureIndex(i) => Some(i),
1217             GroupKind::CaptureName(ref x) => Some(x.index),
1218             GroupKind::NonCapturing(_) => None,
1219         }
1220     }
1221 }
1222 
1223 /// The kind of a group.
1224 #[derive(Clone, Debug, Eq, PartialEq)]
1225 pub enum GroupKind {
1226     /// `(a)`
1227     CaptureIndex(u32),
1228     /// `(?P<name>a)`
1229     CaptureName(CaptureName),
1230     /// `(?:a)` and `(?i:a)`
1231     NonCapturing(Flags),
1232 }
1233 
1234 /// A capture name.
1235 ///
1236 /// This corresponds to the name itself between the angle brackets in, e.g.,
1237 /// `(?P<foo>expr)`.
1238 #[derive(Clone, Debug, Eq, PartialEq)]
1239 pub struct CaptureName {
1240     /// The span of this capture name.
1241     pub span: Span,
1242     /// The capture name.
1243     pub name: String,
1244     /// The capture index.
1245     pub index: u32,
1246 }
1247 
1248 /// A group of flags that is not applied to a particular regular expression.
1249 #[derive(Clone, Debug, Eq, PartialEq)]
1250 pub struct SetFlags {
1251     /// The span of these flags, including the grouping parentheses.
1252     pub span: Span,
1253     /// The actual sequence of flags.
1254     pub flags: Flags,
1255 }
1256 
1257 /// A group of flags.
1258 ///
1259 /// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1260 #[derive(Clone, Debug, Eq, PartialEq)]
1261 pub struct Flags {
1262     /// The span of this group of flags.
1263     pub span: Span,
1264     /// A sequence of flag items. Each item is either a flag or a negation
1265     /// operator.
1266     pub items: Vec<FlagsItem>,
1267 }
1268 
1269 impl Flags {
1270     /// Add the given item to this sequence of flags.
1271     ///
1272     /// If the item was added successfully, then `None` is returned. If the
1273     /// given item is a duplicate, then `Some(i)` is returned, where
1274     /// `items[i].kind == item.kind`.
add_item(&mut self, item: FlagsItem) -> Option<usize>1275     pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1276         for (i, x) in self.items.iter().enumerate() {
1277             if x.kind == item.kind {
1278                 return Some(i);
1279             }
1280         }
1281         self.items.push(item);
1282         None
1283     }
1284 
1285     /// Returns the state of the given flag in this set.
1286     ///
1287     /// If the given flag is in the set but is negated, then `Some(false)` is
1288     /// returned.
1289     ///
1290     /// If the given flag is in the set and is not negated, then `Some(true)`
1291     /// is returned.
1292     ///
1293     /// Otherwise, `None` is returned.
flag_state(&self, flag: Flag) -> Option<bool>1294     pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1295         let mut negated = false;
1296         for x in &self.items {
1297             match x.kind {
1298                 FlagsItemKind::Negation => {
1299                     negated = true;
1300                 }
1301                 FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1302                     return Some(!negated);
1303                 }
1304                 _ => {}
1305             }
1306         }
1307         None
1308     }
1309 }
1310 
1311 /// A single item in a group of flags.
1312 #[derive(Clone, Debug, Eq, PartialEq)]
1313 pub struct FlagsItem {
1314     /// The span of this item.
1315     pub span: Span,
1316     /// The kind of this item.
1317     pub kind: FlagsItemKind,
1318 }
1319 
1320 /// The kind of an item in a group of flags.
1321 #[derive(Clone, Debug, Eq, PartialEq)]
1322 pub enum FlagsItemKind {
1323     /// A negation operator applied to all subsequent flags in the enclosing
1324     /// group.
1325     Negation,
1326     /// A single flag in a group.
1327     Flag(Flag),
1328 }
1329 
1330 impl FlagsItemKind {
1331     /// Returns true if and only if this item is a negation operator.
is_negation(&self) -> bool1332     pub fn is_negation(&self) -> bool {
1333         match *self {
1334             FlagsItemKind::Negation => true,
1335             _ => false,
1336         }
1337     }
1338 }
1339 
1340 /// A single flag.
1341 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1342 pub enum Flag {
1343     /// `i`
1344     CaseInsensitive,
1345     /// `m`
1346     MultiLine,
1347     /// `s`
1348     DotMatchesNewLine,
1349     /// `U`
1350     SwapGreed,
1351     /// `u`
1352     Unicode,
1353     /// `x`
1354     IgnoreWhitespace,
1355 }
1356 
1357 /// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1358 /// space but heap space proportional to the depth of the `Ast`.
1359 impl Drop for Ast {
drop(&mut self)1360     fn drop(&mut self) {
1361         use std::mem;
1362 
1363         match *self {
1364             Ast::Empty(_)
1365             | Ast::Flags(_)
1366             | Ast::Literal(_)
1367             | Ast::Dot(_)
1368             | Ast::Assertion(_)
1369             // Classes are recursive, so they get their own Drop impl.
1370             | Ast::Class(_) => return,
1371             Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1372             Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1373             Ast::Alternation(ref x) if x.asts.is_empty() => return,
1374             Ast::Concat(ref x) if x.asts.is_empty() => return,
1375             _ => {}
1376         }
1377 
1378         let empty_span = || Span::splat(Position::new(0, 0, 0));
1379         let empty_ast = || Ast::Empty(empty_span());
1380         let mut stack = vec![mem::replace(self, empty_ast())];
1381         while let Some(mut ast) = stack.pop() {
1382             match ast {
1383                 Ast::Empty(_)
1384                 | Ast::Flags(_)
1385                 | Ast::Literal(_)
1386                 | Ast::Dot(_)
1387                 | Ast::Assertion(_)
1388                 // Classes are recursive, so they get their own Drop impl.
1389                 | Ast::Class(_) => {}
1390                 Ast::Repetition(ref mut x) => {
1391                     stack.push(mem::replace(&mut x.ast, empty_ast()));
1392                 }
1393                 Ast::Group(ref mut x) => {
1394                     stack.push(mem::replace(&mut x.ast, empty_ast()));
1395                 }
1396                 Ast::Alternation(ref mut x) => {
1397                     stack.extend(x.asts.drain(..));
1398                 }
1399                 Ast::Concat(ref mut x) => {
1400                     stack.extend(x.asts.drain(..));
1401                 }
1402             }
1403         }
1404     }
1405 }
1406 
1407 /// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1408 /// stack space but heap space proportional to the depth of the `ClassSet`.
1409 impl Drop for ClassSet {
drop(&mut self)1410     fn drop(&mut self) {
1411         use std::mem;
1412 
1413         match *self {
1414             ClassSet::Item(ref item) => match *item {
1415                 ClassSetItem::Empty(_)
1416                 | ClassSetItem::Literal(_)
1417                 | ClassSetItem::Range(_)
1418                 | ClassSetItem::Ascii(_)
1419                 | ClassSetItem::Unicode(_)
1420                 | ClassSetItem::Perl(_) => return,
1421                 ClassSetItem::Bracketed(ref x) => {
1422                     if x.kind.is_empty() {
1423                         return;
1424                     }
1425                 }
1426                 ClassSetItem::Union(ref x) => {
1427                     if x.items.is_empty() {
1428                         return;
1429                     }
1430                 }
1431             },
1432             ClassSet::BinaryOp(ref op) => {
1433                 if op.lhs.is_empty() && op.rhs.is_empty() {
1434                     return;
1435                 }
1436             }
1437         }
1438 
1439         let empty_span = || Span::splat(Position::new(0, 0, 0));
1440         let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1441         let mut stack = vec![mem::replace(self, empty_set())];
1442         while let Some(mut set) = stack.pop() {
1443             match set {
1444                 ClassSet::Item(ref mut item) => match *item {
1445                     ClassSetItem::Empty(_)
1446                     | ClassSetItem::Literal(_)
1447                     | ClassSetItem::Range(_)
1448                     | ClassSetItem::Ascii(_)
1449                     | ClassSetItem::Unicode(_)
1450                     | ClassSetItem::Perl(_) => {}
1451                     ClassSetItem::Bracketed(ref mut x) => {
1452                         stack.push(mem::replace(&mut x.kind, empty_set()));
1453                     }
1454                     ClassSetItem::Union(ref mut x) => {
1455                         stack.extend(x.items.drain(..).map(ClassSet::Item));
1456                     }
1457                 },
1458                 ClassSet::BinaryOp(ref mut op) => {
1459                     stack.push(mem::replace(&mut op.lhs, empty_set()));
1460                     stack.push(mem::replace(&mut op.rhs, empty_set()));
1461                 }
1462             }
1463         }
1464     }
1465 }
1466 
1467 #[cfg(test)]
1468 mod tests {
1469     use super::*;
1470 
1471     // We use a thread with an explicit stack size to test that our destructor
1472     // for Ast can handle arbitrarily sized expressions in constant stack
1473     // space. In case we run on a platform without threads (WASM?), we limit
1474     // this test to Windows/Unix.
1475     #[test]
1476     #[cfg(any(unix, windows))]
no_stack_overflow_on_drop()1477     fn no_stack_overflow_on_drop() {
1478         use std::thread;
1479 
1480         let run = || {
1481             let span = || Span::splat(Position::new(0, 0, 0));
1482             let mut ast = Ast::Empty(span());
1483             for i in 0..200 {
1484                 ast = Ast::Group(Group {
1485                     span: span(),
1486                     kind: GroupKind::CaptureIndex(i),
1487                     ast: Box::new(ast),
1488                 });
1489             }
1490             assert!(!ast.is_empty());
1491         };
1492 
1493         // We run our test on a thread with a small stack size so we can
1494         // force the issue more easily.
1495         //
1496         // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
1497         // will just barf with very small stack sizes. So we bump this up a bit
1498         // to give more room to breath. When I did this, I confirmed that if
1499         // I remove the custom `Drop` impl for `Ast`, then this test does
1500         // indeed still fail with a stack overflow. (At the time of writing, I
1501         // had to bump it all the way up to 32K before the test would pass even
1502         // without the custom `Drop` impl. So 16K seems like a safe number
1503         // here.)
1504         //
1505         // See: https://github.com/rust-lang/regex/issues/967
1506         thread::Builder::new()
1507             .stack_size(16 << 10)
1508             .spawn(run)
1509             .unwrap()
1510             .join()
1511             .unwrap();
1512     }
1513 }
1514