1 /*! 2 Defines an abstract syntax for regular expressions. 3 */ 4 5 use std::cmp::Ordering; 6 use std::error; 7 use std::fmt; 8 9 pub use crate::ast::visitor::{visit, Visitor}; 10 11 pub mod parse; 12 pub mod print; 13 mod visitor; 14 15 /// An error that occurred while parsing a regular expression into an abstract 16 /// syntax tree. 17 /// 18 /// Note that not all ASTs represents a valid regular expression. For example, 19 /// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a 20 /// valid Unicode property name. That particular error is reported when 21 /// translating an AST to the high-level intermediate representation (`HIR`). 22 #[derive(Clone, Debug, Eq, PartialEq)] 23 pub struct Error { 24 /// The kind of error. 25 kind: ErrorKind, 26 /// The original pattern that the parser generated the error from. Every 27 /// span in an error is a valid range into this string. 28 pattern: String, 29 /// The span of this error. 30 span: Span, 31 } 32 33 impl Error { 34 /// Return the type of this error. kind(&self) -> &ErrorKind35 pub fn kind(&self) -> &ErrorKind { 36 &self.kind 37 } 38 39 /// The original pattern string in which this error occurred. 40 /// 41 /// Every span reported by this error is reported in terms of this string. pattern(&self) -> &str42 pub fn pattern(&self) -> &str { 43 &self.pattern 44 } 45 46 /// Return the span at which this error occurred. span(&self) -> &Span47 pub fn span(&self) -> &Span { 48 &self.span 49 } 50 51 /// Return an auxiliary span. This span exists only for some errors that 52 /// benefit from being able to point to two locations in the original 53 /// regular expression. For example, "duplicate" errors will have the 54 /// main error position set to the duplicate occurrence while its 55 /// auxiliary span will be set to the initial occurrence. auxiliary_span(&self) -> Option<&Span>56 pub fn auxiliary_span(&self) -> Option<&Span> { 57 use self::ErrorKind::*; 58 match self.kind { 59 FlagDuplicate { ref original } => Some(original), 60 FlagRepeatedNegation { ref original, .. } => Some(original), 61 GroupNameDuplicate { ref original, .. } => Some(original), 62 _ => None, 63 } 64 } 65 } 66 67 /// The type of an error that occurred while building an AST. 68 #[derive(Clone, Debug, Eq, PartialEq)] 69 pub enum ErrorKind { 70 /// The capturing group limit was exceeded. 71 /// 72 /// Note that this represents a limit on the total number of capturing 73 /// groups in a regex and not necessarily the number of nested capturing 74 /// groups. That is, the nest limit can be low and it is still possible for 75 /// this error to occur. 76 CaptureLimitExceeded, 77 /// An invalid escape sequence was found in a character class set. 78 ClassEscapeInvalid, 79 /// An invalid character class range was found. An invalid range is any 80 /// range where the start is greater than the end. 81 ClassRangeInvalid, 82 /// An invalid range boundary was found in a character class. Range 83 /// boundaries must be a single literal codepoint, but this error indicates 84 /// that something else was found, such as a nested class. 85 ClassRangeLiteral, 86 /// An opening `[` was found with no corresponding closing `]`. 87 ClassUnclosed, 88 /// Note that this error variant is no longer used. Namely, a decimal 89 /// number can only appear as a repetition quantifier. When the number 90 /// in a repetition quantifier is empty, then it gets its own specialized 91 /// error, `RepetitionCountDecimalEmpty`. 92 DecimalEmpty, 93 /// An invalid decimal number was given where one was expected. 94 DecimalInvalid, 95 /// A bracketed hex literal was empty. 96 EscapeHexEmpty, 97 /// A bracketed hex literal did not correspond to a Unicode scalar value. 98 EscapeHexInvalid, 99 /// An invalid hexadecimal digit was found. 100 EscapeHexInvalidDigit, 101 /// EOF was found before an escape sequence was completed. 102 EscapeUnexpectedEof, 103 /// An unrecognized escape sequence. 104 EscapeUnrecognized, 105 /// A dangling negation was used when setting flags, e.g., `i-`. 106 FlagDanglingNegation, 107 /// A flag was used twice, e.g., `i-i`. 108 FlagDuplicate { 109 /// The position of the original flag. The error position 110 /// points to the duplicate flag. 111 original: Span, 112 }, 113 /// The negation operator was used twice, e.g., `-i-s`. 114 FlagRepeatedNegation { 115 /// The position of the original negation operator. The error position 116 /// points to the duplicate negation operator. 117 original: Span, 118 }, 119 /// Expected a flag but got EOF, e.g., `(?`. 120 FlagUnexpectedEof, 121 /// Unrecognized flag, e.g., `a`. 122 FlagUnrecognized, 123 /// A duplicate capture name was found. 124 GroupNameDuplicate { 125 /// The position of the initial occurrence of the capture name. The 126 /// error position itself points to the duplicate occurrence. 127 original: Span, 128 }, 129 /// A capture group name is empty, e.g., `(?P<>abc)`. 130 GroupNameEmpty, 131 /// An invalid character was seen for a capture group name. This includes 132 /// errors where the first character is a digit (even though subsequent 133 /// characters are allowed to be digits). 134 GroupNameInvalid, 135 /// A closing `>` could not be found for a capture group name. 136 GroupNameUnexpectedEof, 137 /// An unclosed group, e.g., `(ab`. 138 /// 139 /// The span of this error corresponds to the unclosed parenthesis. 140 GroupUnclosed, 141 /// An unopened group, e.g., `ab)`. 142 GroupUnopened, 143 /// The nest limit was exceeded. The limit stored here is the limit 144 /// configured in the parser. 145 NestLimitExceeded(u32), 146 /// The range provided in a counted repetition operator is invalid. The 147 /// range is invalid if the start is greater than the end. 148 RepetitionCountInvalid, 149 /// An opening `{` was not followed by a valid decimal value. 150 /// For example, `x{}` or `x{]}` would fail. 151 RepetitionCountDecimalEmpty, 152 /// An opening `{` was found with no corresponding closing `}`. 153 RepetitionCountUnclosed, 154 /// A repetition operator was applied to a missing sub-expression. This 155 /// occurs, for example, in the regex consisting of just a `*` or even 156 /// `(?i)*`. It is, however, possible to create a repetition operating on 157 /// an empty sub-expression. For example, `()*` is still considered valid. 158 RepetitionMissing, 159 /// The Unicode class is not valid. This typically occurs when a `\p` is 160 /// followed by something other than a `{`. 161 UnicodeClassInvalid, 162 /// When octal support is disabled, this error is produced when an octal 163 /// escape is used. The octal escape is assumed to be an invocation of 164 /// a backreference, which is the common case. 165 UnsupportedBackreference, 166 /// When syntax similar to PCRE's look-around is used, this error is 167 /// returned. Some example syntaxes that are rejected include, but are 168 /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and 169 /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this 170 /// error is used to improve the user experience. 171 UnsupportedLookAround, 172 /// Hints that destructuring should not be exhaustive. 173 /// 174 /// This enum may grow additional variants, so this makes sure clients 175 /// don't count on exhaustive matching. (Otherwise, adding a new variant 176 /// could break existing code.) 177 #[doc(hidden)] 178 __Nonexhaustive, 179 } 180 181 impl error::Error for Error { 182 // TODO: Remove this method entirely on the next breaking semver release. 183 #[allow(deprecated)] description(&self) -> &str184 fn description(&self) -> &str { 185 use self::ErrorKind::*; 186 match self.kind { 187 CaptureLimitExceeded => "capture group limit exceeded", 188 ClassEscapeInvalid => "invalid escape sequence in character class", 189 ClassRangeInvalid => "invalid character class range", 190 ClassRangeLiteral => "invalid range boundary, must be a literal", 191 ClassUnclosed => "unclosed character class", 192 DecimalEmpty => "empty decimal literal", 193 DecimalInvalid => "invalid decimal literal", 194 EscapeHexEmpty => "empty hexadecimal literal", 195 EscapeHexInvalid => "invalid hexadecimal literal", 196 EscapeHexInvalidDigit => "invalid hexadecimal digit", 197 EscapeUnexpectedEof => "unexpected eof (escape sequence)", 198 EscapeUnrecognized => "unrecognized escape sequence", 199 FlagDanglingNegation => "dangling flag negation operator", 200 FlagDuplicate { .. } => "duplicate flag", 201 FlagRepeatedNegation { .. } => "repeated negation", 202 FlagUnexpectedEof => "unexpected eof (flag)", 203 FlagUnrecognized => "unrecognized flag", 204 GroupNameDuplicate { .. } => "duplicate capture group name", 205 GroupNameEmpty => "empty capture group name", 206 GroupNameInvalid => "invalid capture group name", 207 GroupNameUnexpectedEof => "unclosed capture group name", 208 GroupUnclosed => "unclosed group", 209 GroupUnopened => "unopened group", 210 NestLimitExceeded(_) => "nest limit exceeded", 211 RepetitionCountInvalid => "invalid repetition count range", 212 RepetitionCountUnclosed => "unclosed counted repetition", 213 RepetitionMissing => "repetition operator missing expression", 214 UnicodeClassInvalid => "invalid Unicode character class", 215 UnsupportedBackreference => "backreferences are not supported", 216 UnsupportedLookAround => "look-around is not supported", 217 _ => unreachable!(), 218 } 219 } 220 } 221 222 impl fmt::Display for Error { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result223 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 224 crate::error::Formatter::from(self).fmt(f) 225 } 226 } 227 228 impl fmt::Display for ErrorKind { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result229 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 230 use self::ErrorKind::*; 231 match *self { 232 CaptureLimitExceeded => write!( 233 f, 234 "exceeded the maximum number of \ 235 capturing groups ({})", 236 ::std::u32::MAX 237 ), 238 ClassEscapeInvalid => { 239 write!(f, "invalid escape sequence found in character class") 240 } 241 ClassRangeInvalid => write!( 242 f, 243 "invalid character class range, \ 244 the start must be <= the end" 245 ), 246 ClassRangeLiteral => { 247 write!(f, "invalid range boundary, must be a literal") 248 } 249 ClassUnclosed => write!(f, "unclosed character class"), 250 DecimalEmpty => write!(f, "decimal literal empty"), 251 DecimalInvalid => write!(f, "decimal literal invalid"), 252 EscapeHexEmpty => write!(f, "hexadecimal literal empty"), 253 EscapeHexInvalid => { 254 write!(f, "hexadecimal literal is not a Unicode scalar value") 255 } 256 EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"), 257 EscapeUnexpectedEof => write!( 258 f, 259 "incomplete escape sequence, \ 260 reached end of pattern prematurely" 261 ), 262 EscapeUnrecognized => write!(f, "unrecognized escape sequence"), 263 FlagDanglingNegation => { 264 write!(f, "dangling flag negation operator") 265 } 266 FlagDuplicate { .. } => write!(f, "duplicate flag"), 267 FlagRepeatedNegation { .. } => { 268 write!(f, "flag negation operator repeated") 269 } 270 FlagUnexpectedEof => { 271 write!(f, "expected flag but got end of regex") 272 } 273 FlagUnrecognized => write!(f, "unrecognized flag"), 274 GroupNameDuplicate { .. } => { 275 write!(f, "duplicate capture group name") 276 } 277 GroupNameEmpty => write!(f, "empty capture group name"), 278 GroupNameInvalid => write!(f, "invalid capture group character"), 279 GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), 280 GroupUnclosed => write!(f, "unclosed group"), 281 GroupUnopened => write!(f, "unopened group"), 282 NestLimitExceeded(limit) => write!( 283 f, 284 "exceed the maximum number of \ 285 nested parentheses/brackets ({})", 286 limit 287 ), 288 RepetitionCountInvalid => write!( 289 f, 290 "invalid repetition count range, \ 291 the start must be <= the end" 292 ), 293 RepetitionCountDecimalEmpty => { 294 write!(f, "repetition quantifier expects a valid decimal") 295 } 296 RepetitionCountUnclosed => { 297 write!(f, "unclosed counted repetition") 298 } 299 RepetitionMissing => { 300 write!(f, "repetition operator missing expression") 301 } 302 UnicodeClassInvalid => { 303 write!(f, "invalid Unicode character class") 304 } 305 UnsupportedBackreference => { 306 write!(f, "backreferences are not supported") 307 } 308 UnsupportedLookAround => write!( 309 f, 310 "look-around, including look-ahead and look-behind, \ 311 is not supported" 312 ), 313 _ => unreachable!(), 314 } 315 } 316 } 317 318 /// Span represents the position information of a single AST item. 319 /// 320 /// All span positions are absolute byte offsets that can be used on the 321 /// original regular expression that was parsed. 322 #[derive(Clone, Copy, Eq, PartialEq)] 323 pub struct Span { 324 /// The start byte offset. 325 pub start: Position, 326 /// The end byte offset. 327 pub end: Position, 328 } 329 330 impl fmt::Debug for Span { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result331 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 332 write!(f, "Span({:?}, {:?})", self.start, self.end) 333 } 334 } 335 336 impl Ord for Span { cmp(&self, other: &Span) -> Ordering337 fn cmp(&self, other: &Span) -> Ordering { 338 (&self.start, &self.end).cmp(&(&other.start, &other.end)) 339 } 340 } 341 342 impl PartialOrd for Span { partial_cmp(&self, other: &Span) -> Option<Ordering>343 fn partial_cmp(&self, other: &Span) -> Option<Ordering> { 344 Some(self.cmp(other)) 345 } 346 } 347 348 /// A single position in a regular expression. 349 /// 350 /// A position encodes one half of a span, and include the byte offset, line 351 /// number and column number. 352 #[derive(Clone, Copy, Eq, PartialEq)] 353 pub struct Position { 354 /// The absolute offset of this position, starting at `0` from the 355 /// beginning of the regular expression pattern string. 356 pub offset: usize, 357 /// The line number, starting at `1`. 358 pub line: usize, 359 /// The approximate column number, starting at `1`. 360 pub column: usize, 361 } 362 363 impl fmt::Debug for Position { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result364 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 365 write!( 366 f, 367 "Position(o: {:?}, l: {:?}, c: {:?})", 368 self.offset, self.line, self.column 369 ) 370 } 371 } 372 373 impl Ord for Position { cmp(&self, other: &Position) -> Ordering374 fn cmp(&self, other: &Position) -> Ordering { 375 self.offset.cmp(&other.offset) 376 } 377 } 378 379 impl PartialOrd for Position { partial_cmp(&self, other: &Position) -> Option<Ordering>380 fn partial_cmp(&self, other: &Position) -> Option<Ordering> { 381 Some(self.cmp(other)) 382 } 383 } 384 385 impl Span { 386 /// Create a new span with the given positions. new(start: Position, end: Position) -> Span387 pub fn new(start: Position, end: Position) -> Span { 388 Span { start, end } 389 } 390 391 /// Create a new span using the given position as the start and end. splat(pos: Position) -> Span392 pub fn splat(pos: Position) -> Span { 393 Span::new(pos, pos) 394 } 395 396 /// Create a new span by replacing the starting the position with the one 397 /// given. with_start(self, pos: Position) -> Span398 pub fn with_start(self, pos: Position) -> Span { 399 Span { start: pos, ..self } 400 } 401 402 /// Create a new span by replacing the ending the position with the one 403 /// given. with_end(self, pos: Position) -> Span404 pub fn with_end(self, pos: Position) -> Span { 405 Span { end: pos, ..self } 406 } 407 408 /// Returns true if and only if this span occurs on a single line. is_one_line(&self) -> bool409 pub fn is_one_line(&self) -> bool { 410 self.start.line == self.end.line 411 } 412 413 /// Returns true if and only if this span is empty. That is, it points to 414 /// a single position in the concrete syntax of a regular expression. is_empty(&self) -> bool415 pub fn is_empty(&self) -> bool { 416 self.start.offset == self.end.offset 417 } 418 } 419 420 impl Position { 421 /// Create a new position with the given information. 422 /// 423 /// `offset` is the absolute offset of the position, starting at `0` from 424 /// the beginning of the regular expression pattern string. 425 /// 426 /// `line` is the line number, starting at `1`. 427 /// 428 /// `column` is the approximate column number, starting at `1`. new(offset: usize, line: usize, column: usize) -> Position429 pub fn new(offset: usize, line: usize, column: usize) -> Position { 430 Position { offset, line, column } 431 } 432 } 433 434 /// An abstract syntax tree for a singular expression along with comments 435 /// found. 436 /// 437 /// Comments are not stored in the tree itself to avoid complexity. Each 438 /// comment contains a span of precisely where it occurred in the original 439 /// regular expression. 440 #[derive(Clone, Debug, Eq, PartialEq)] 441 pub struct WithComments { 442 /// The actual ast. 443 pub ast: Ast, 444 /// All comments found in the original regular expression. 445 pub comments: Vec<Comment>, 446 } 447 448 /// A comment from a regular expression with an associated span. 449 /// 450 /// A regular expression can only contain comments when the `x` flag is 451 /// enabled. 452 #[derive(Clone, Debug, Eq, PartialEq)] 453 pub struct Comment { 454 /// The span of this comment, including the beginning `#` and ending `\n`. 455 pub span: Span, 456 /// The comment text, starting with the first character following the `#` 457 /// and ending with the last character preceding the `\n`. 458 pub comment: String, 459 } 460 461 /// An abstract syntax tree for a single regular expression. 462 /// 463 /// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap 464 /// space proportional to the size of the `Ast`. 465 /// 466 /// This type defines its own destructor that uses constant stack space and 467 /// heap space proportional to the size of the `Ast`. 468 #[derive(Clone, Debug, Eq, PartialEq)] 469 pub enum Ast { 470 /// An empty regex that matches everything. 471 Empty(Span), 472 /// A set of flags, e.g., `(?is)`. 473 Flags(SetFlags), 474 /// A single character literal, which includes escape sequences. 475 Literal(Literal), 476 /// The "any character" class. 477 Dot(Span), 478 /// A single zero-width assertion. 479 Assertion(Assertion), 480 /// A single character class. This includes all forms of character classes 481 /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. 482 Class(Class), 483 /// A repetition operator applied to an arbitrary regular expression. 484 Repetition(Repetition), 485 /// A grouped regular expression. 486 Group(Group), 487 /// An alternation of regular expressions. 488 Alternation(Alternation), 489 /// A concatenation of regular expressions. 490 Concat(Concat), 491 } 492 493 impl Ast { 494 /// Return the span of this abstract syntax tree. span(&self) -> &Span495 pub fn span(&self) -> &Span { 496 match *self { 497 Ast::Empty(ref span) => span, 498 Ast::Flags(ref x) => &x.span, 499 Ast::Literal(ref x) => &x.span, 500 Ast::Dot(ref span) => span, 501 Ast::Assertion(ref x) => &x.span, 502 Ast::Class(ref x) => x.span(), 503 Ast::Repetition(ref x) => &x.span, 504 Ast::Group(ref x) => &x.span, 505 Ast::Alternation(ref x) => &x.span, 506 Ast::Concat(ref x) => &x.span, 507 } 508 } 509 510 /// Return true if and only if this Ast is empty. is_empty(&self) -> bool511 pub fn is_empty(&self) -> bool { 512 match *self { 513 Ast::Empty(_) => true, 514 _ => false, 515 } 516 } 517 518 /// Returns true if and only if this AST has any (including possibly empty) 519 /// subexpressions. has_subexprs(&self) -> bool520 fn has_subexprs(&self) -> bool { 521 match *self { 522 Ast::Empty(_) 523 | Ast::Flags(_) 524 | Ast::Literal(_) 525 | Ast::Dot(_) 526 | Ast::Assertion(_) => false, 527 Ast::Class(_) 528 | Ast::Repetition(_) 529 | Ast::Group(_) 530 | Ast::Alternation(_) 531 | Ast::Concat(_) => true, 532 } 533 } 534 } 535 536 /// Print a display representation of this Ast. 537 /// 538 /// This does not preserve any of the original whitespace formatting that may 539 /// have originally been present in the concrete syntax from which this Ast 540 /// was generated. 541 /// 542 /// This implementation uses constant stack space and heap space proportional 543 /// to the size of the `Ast`. 544 impl fmt::Display for Ast { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result545 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 546 use crate::ast::print::Printer; 547 Printer::new().print(self, f) 548 } 549 } 550 551 /// An alternation of regular expressions. 552 #[derive(Clone, Debug, Eq, PartialEq)] 553 pub struct Alternation { 554 /// The span of this alternation. 555 pub span: Span, 556 /// The alternate regular expressions. 557 pub asts: Vec<Ast>, 558 } 559 560 impl Alternation { 561 /// Return this alternation as an AST. 562 /// 563 /// If this alternation contains zero ASTs, then Ast::Empty is 564 /// returned. If this alternation contains exactly 1 AST, then the 565 /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. into_ast(mut self) -> Ast566 pub fn into_ast(mut self) -> Ast { 567 match self.asts.len() { 568 0 => Ast::Empty(self.span), 569 1 => self.asts.pop().unwrap(), 570 _ => Ast::Alternation(self), 571 } 572 } 573 } 574 575 /// A concatenation of regular expressions. 576 #[derive(Clone, Debug, Eq, PartialEq)] 577 pub struct Concat { 578 /// The span of this concatenation. 579 pub span: Span, 580 /// The concatenation regular expressions. 581 pub asts: Vec<Ast>, 582 } 583 584 impl Concat { 585 /// Return this concatenation as an AST. 586 /// 587 /// If this concatenation contains zero ASTs, then Ast::Empty is 588 /// returned. If this concatenation contains exactly 1 AST, then the 589 /// corresponding AST is returned. Otherwise, Ast::Concat is returned. into_ast(mut self) -> Ast590 pub fn into_ast(mut self) -> Ast { 591 match self.asts.len() { 592 0 => Ast::Empty(self.span), 593 1 => self.asts.pop().unwrap(), 594 _ => Ast::Concat(self), 595 } 596 } 597 } 598 599 /// A single literal expression. 600 /// 601 /// A literal corresponds to a single Unicode scalar value. Literals may be 602 /// represented in their literal form, e.g., `a` or in their escaped form, 603 /// e.g., `\x61`. 604 #[derive(Clone, Debug, Eq, PartialEq)] 605 pub struct Literal { 606 /// The span of this literal. 607 pub span: Span, 608 /// The kind of this literal. 609 pub kind: LiteralKind, 610 /// The Unicode scalar value corresponding to this literal. 611 pub c: char, 612 } 613 614 impl Literal { 615 /// If this literal was written as a `\x` hex escape, then this returns 616 /// the corresponding byte value. Otherwise, this returns `None`. byte(&self) -> Option<u8>617 pub fn byte(&self) -> Option<u8> { 618 let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); 619 if self.c as u32 <= 255 && self.kind == short_hex { 620 Some(self.c as u8) 621 } else { 622 None 623 } 624 } 625 } 626 627 /// The kind of a single literal expression. 628 #[derive(Clone, Debug, Eq, PartialEq)] 629 pub enum LiteralKind { 630 /// The literal is written verbatim, e.g., `a` or `☃`. 631 Verbatim, 632 /// The literal is written as an escape because it is punctuation, e.g., 633 /// `\*` or `\[`. 634 Punctuation, 635 /// The literal is written as an octal escape, e.g., `\141`. 636 Octal, 637 /// The literal is written as a hex code with a fixed number of digits 638 /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or 639 /// `\U00000061`. 640 HexFixed(HexLiteralKind), 641 /// The literal is written as a hex code with a bracketed number of 642 /// digits. The only restriction is that the bracketed hex code must refer 643 /// to a valid Unicode scalar value. 644 HexBrace(HexLiteralKind), 645 /// The literal is written as a specially recognized escape, e.g., `\f` 646 /// or `\n`. 647 Special(SpecialLiteralKind), 648 } 649 650 /// The type of a special literal. 651 /// 652 /// A special literal is a special escape sequence recognized by the regex 653 /// parser, e.g., `\f` or `\n`. 654 #[derive(Clone, Debug, Eq, PartialEq)] 655 pub enum SpecialLiteralKind { 656 /// Bell, spelled `\a` (`\x07`). 657 Bell, 658 /// Form feed, spelled `\f` (`\x0C`). 659 FormFeed, 660 /// Tab, spelled `\t` (`\x09`). 661 Tab, 662 /// Line feed, spelled `\n` (`\x0A`). 663 LineFeed, 664 /// Carriage return, spelled `\r` (`\x0D`). 665 CarriageReturn, 666 /// Vertical tab, spelled `\v` (`\x0B`). 667 VerticalTab, 668 /// Space, spelled `\ ` (`\x20`). Note that this can only appear when 669 /// parsing in verbose mode. 670 Space, 671 } 672 673 /// The type of a Unicode hex literal. 674 /// 675 /// Note that all variants behave the same when used with brackets. They only 676 /// differ when used without brackets in the number of hex digits that must 677 /// follow. 678 #[derive(Clone, Debug, Eq, PartialEq)] 679 pub enum HexLiteralKind { 680 /// A `\x` prefix. When used without brackets, this form is limited to 681 /// two digits. 682 X, 683 /// A `\u` prefix. When used without brackets, this form is limited to 684 /// four digits. 685 UnicodeShort, 686 /// A `\U` prefix. When used without brackets, this form is limited to 687 /// eight digits. 688 UnicodeLong, 689 } 690 691 impl HexLiteralKind { 692 /// The number of digits that must be used with this literal form when 693 /// used without brackets. When used with brackets, there is no 694 /// restriction on the number of digits. digits(&self) -> u32695 pub fn digits(&self) -> u32 { 696 match *self { 697 HexLiteralKind::X => 2, 698 HexLiteralKind::UnicodeShort => 4, 699 HexLiteralKind::UnicodeLong => 8, 700 } 701 } 702 } 703 704 /// A single character class expression. 705 #[derive(Clone, Debug, Eq, PartialEq)] 706 pub enum Class { 707 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. 708 Unicode(ClassUnicode), 709 /// A perl character class, e.g., `\d` or `\W`. 710 Perl(ClassPerl), 711 /// A bracketed character class set, which may contain zero or more 712 /// character ranges and/or zero or more nested classes. e.g., 713 /// `[a-zA-Z\pL]`. 714 Bracketed(ClassBracketed), 715 } 716 717 impl Class { 718 /// Return the span of this character class. span(&self) -> &Span719 pub fn span(&self) -> &Span { 720 match *self { 721 Class::Perl(ref x) => &x.span, 722 Class::Unicode(ref x) => &x.span, 723 Class::Bracketed(ref x) => &x.span, 724 } 725 } 726 } 727 728 /// A Perl character class. 729 #[derive(Clone, Debug, Eq, PartialEq)] 730 pub struct ClassPerl { 731 /// The span of this class. 732 pub span: Span, 733 /// The kind of Perl class. 734 pub kind: ClassPerlKind, 735 /// Whether the class is negated or not. e.g., `\d` is not negated but 736 /// `\D` is. 737 pub negated: bool, 738 } 739 740 /// The available Perl character classes. 741 #[derive(Clone, Debug, Eq, PartialEq)] 742 pub enum ClassPerlKind { 743 /// Decimal numbers. 744 Digit, 745 /// Whitespace. 746 Space, 747 /// Word characters. 748 Word, 749 } 750 751 /// An ASCII character class. 752 #[derive(Clone, Debug, Eq, PartialEq)] 753 pub struct ClassAscii { 754 /// The span of this class. 755 pub span: Span, 756 /// The kind of ASCII class. 757 pub kind: ClassAsciiKind, 758 /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated 759 /// but `[[:^alpha:]]` is. 760 pub negated: bool, 761 } 762 763 /// The available ASCII character classes. 764 #[derive(Clone, Debug, Eq, PartialEq)] 765 pub enum ClassAsciiKind { 766 /// `[0-9A-Za-z]` 767 Alnum, 768 /// `[A-Za-z]` 769 Alpha, 770 /// `[\x00-\x7F]` 771 Ascii, 772 /// `[ \t]` 773 Blank, 774 /// `[\x00-\x1F\x7F]` 775 Cntrl, 776 /// `[0-9]` 777 Digit, 778 /// `[!-~]` 779 Graph, 780 /// `[a-z]` 781 Lower, 782 /// `[ -~]` 783 Print, 784 /// `[!-/:-@\[-`{-~]` 785 Punct, 786 /// `[\t\n\v\f\r ]` 787 Space, 788 /// `[A-Z]` 789 Upper, 790 /// `[0-9A-Za-z_]` 791 Word, 792 /// `[0-9A-Fa-f]` 793 Xdigit, 794 } 795 796 impl ClassAsciiKind { 797 /// Return the corresponding ClassAsciiKind variant for the given name. 798 /// 799 /// The name given should correspond to the lowercase version of the 800 /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. 801 /// 802 /// If no variant with the corresponding name exists, then `None` is 803 /// returned. from_name(name: &str) -> Option<ClassAsciiKind>804 pub fn from_name(name: &str) -> Option<ClassAsciiKind> { 805 use self::ClassAsciiKind::*; 806 match name { 807 "alnum" => Some(Alnum), 808 "alpha" => Some(Alpha), 809 "ascii" => Some(Ascii), 810 "blank" => Some(Blank), 811 "cntrl" => Some(Cntrl), 812 "digit" => Some(Digit), 813 "graph" => Some(Graph), 814 "lower" => Some(Lower), 815 "print" => Some(Print), 816 "punct" => Some(Punct), 817 "space" => Some(Space), 818 "upper" => Some(Upper), 819 "word" => Some(Word), 820 "xdigit" => Some(Xdigit), 821 _ => None, 822 } 823 } 824 } 825 826 /// A Unicode character class. 827 #[derive(Clone, Debug, Eq, PartialEq)] 828 pub struct ClassUnicode { 829 /// The span of this class. 830 pub span: Span, 831 /// Whether this class is negated or not. 832 /// 833 /// Note: be careful when using this attribute. This specifically refers 834 /// to whether the class is written as `\p` or `\P`, where the latter 835 /// is `negated = true`. However, it also possible to write something like 836 /// `\P{scx!=Katakana}` which is actually equivalent to 837 /// `\p{scx=Katakana}` and is therefore not actually negated even though 838 /// `negated = true` here. To test whether this class is truly negated 839 /// or not, use the `is_negated` method. 840 pub negated: bool, 841 /// The kind of Unicode class. 842 pub kind: ClassUnicodeKind, 843 } 844 845 impl ClassUnicode { 846 /// Returns true if this class has been negated. 847 /// 848 /// Note that this takes the Unicode op into account, if it's present. 849 /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. is_negated(&self) -> bool850 pub fn is_negated(&self) -> bool { 851 match self.kind { 852 ClassUnicodeKind::NamedValue { 853 op: ClassUnicodeOpKind::NotEqual, 854 .. 855 } => !self.negated, 856 _ => self.negated, 857 } 858 } 859 } 860 861 /// The available forms of Unicode character classes. 862 #[derive(Clone, Debug, Eq, PartialEq)] 863 pub enum ClassUnicodeKind { 864 /// A one letter abbreviated class, e.g., `\pN`. 865 OneLetter(char), 866 /// A binary property, general category or script. The string may be 867 /// empty. 868 Named(String), 869 /// A property name and an associated value. 870 NamedValue { 871 /// The type of Unicode op used to associate `name` with `value`. 872 op: ClassUnicodeOpKind, 873 /// The property name (which may be empty). 874 name: String, 875 /// The property value (which may be empty). 876 value: String, 877 }, 878 } 879 880 /// The type of op used in a Unicode character class. 881 #[derive(Clone, Debug, Eq, PartialEq)] 882 pub enum ClassUnicodeOpKind { 883 /// A property set to a specific value, e.g., `\p{scx=Katakana}`. 884 Equal, 885 /// A property set to a specific value using a colon, e.g., 886 /// `\p{scx:Katakana}`. 887 Colon, 888 /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. 889 NotEqual, 890 } 891 892 impl ClassUnicodeOpKind { 893 /// Whether the op is an equality op or not. is_equal(&self) -> bool894 pub fn is_equal(&self) -> bool { 895 match *self { 896 ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true, 897 _ => false, 898 } 899 } 900 } 901 902 /// A bracketed character class, e.g., `[a-z0-9]`. 903 #[derive(Clone, Debug, Eq, PartialEq)] 904 pub struct ClassBracketed { 905 /// The span of this class. 906 pub span: Span, 907 /// Whether this class is negated or not. e.g., `[a]` is not negated but 908 /// `[^a]` is. 909 pub negated: bool, 910 /// The type of this set. A set is either a normal union of things, e.g., 911 /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. 912 pub kind: ClassSet, 913 } 914 915 /// A character class set. 916 /// 917 /// This type corresponds to the internal structure of a bracketed character 918 /// class. That is, every bracketed character is one of two types: a union of 919 /// items (literals, ranges, other bracketed classes) or a tree of binary set 920 /// operations. 921 #[derive(Clone, Debug, Eq, PartialEq)] 922 pub enum ClassSet { 923 /// An item, which can be a single literal, range, nested character class 924 /// or a union of items. 925 Item(ClassSetItem), 926 /// A single binary operation (i.e., &&, -- or ~~). 927 BinaryOp(ClassSetBinaryOp), 928 } 929 930 impl ClassSet { 931 /// Build a set from a union. union(ast: ClassSetUnion) -> ClassSet932 pub fn union(ast: ClassSetUnion) -> ClassSet { 933 ClassSet::Item(ClassSetItem::Union(ast)) 934 } 935 936 /// Return the span of this character class set. span(&self) -> &Span937 pub fn span(&self) -> &Span { 938 match *self { 939 ClassSet::Item(ref x) => x.span(), 940 ClassSet::BinaryOp(ref x) => &x.span, 941 } 942 } 943 944 /// Return true if and only if this class set is empty. is_empty(&self) -> bool945 fn is_empty(&self) -> bool { 946 match *self { 947 ClassSet::Item(ClassSetItem::Empty(_)) => true, 948 _ => false, 949 } 950 } 951 } 952 953 /// A single component of a character class set. 954 #[derive(Clone, Debug, Eq, PartialEq)] 955 pub enum ClassSetItem { 956 /// An empty item. 957 /// 958 /// Note that a bracketed character class cannot contain a single empty 959 /// item. Empty items can appear when using one of the binary operators. 960 /// For example, `[&&]` is the intersection of two empty classes. 961 Empty(Span), 962 /// A single literal. 963 Literal(Literal), 964 /// A range between two literals. 965 Range(ClassSetRange), 966 /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. 967 Ascii(ClassAscii), 968 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. 969 Unicode(ClassUnicode), 970 /// A perl character class, e.g., `\d` or `\W`. 971 Perl(ClassPerl), 972 /// A bracketed character class set, which may contain zero or more 973 /// character ranges and/or zero or more nested classes. e.g., 974 /// `[a-zA-Z\pL]`. 975 Bracketed(Box<ClassBracketed>), 976 /// A union of items. 977 Union(ClassSetUnion), 978 } 979 980 impl ClassSetItem { 981 /// Return the span of this character class set item. span(&self) -> &Span982 pub fn span(&self) -> &Span { 983 match *self { 984 ClassSetItem::Empty(ref span) => span, 985 ClassSetItem::Literal(ref x) => &x.span, 986 ClassSetItem::Range(ref x) => &x.span, 987 ClassSetItem::Ascii(ref x) => &x.span, 988 ClassSetItem::Perl(ref x) => &x.span, 989 ClassSetItem::Unicode(ref x) => &x.span, 990 ClassSetItem::Bracketed(ref x) => &x.span, 991 ClassSetItem::Union(ref x) => &x.span, 992 } 993 } 994 } 995 996 /// A single character class range in a set. 997 #[derive(Clone, Debug, Eq, PartialEq)] 998 pub struct ClassSetRange { 999 /// The span of this range. 1000 pub span: Span, 1001 /// The start of this range. 1002 pub start: Literal, 1003 /// The end of this range. 1004 pub end: Literal, 1005 } 1006 1007 impl ClassSetRange { 1008 /// Returns true if and only if this character class range is valid. 1009 /// 1010 /// The only case where a range is invalid is if its start is greater than 1011 /// its end. is_valid(&self) -> bool1012 pub fn is_valid(&self) -> bool { 1013 self.start.c <= self.end.c 1014 } 1015 } 1016 1017 /// A union of items inside a character class set. 1018 #[derive(Clone, Debug, Eq, PartialEq)] 1019 pub struct ClassSetUnion { 1020 /// The span of the items in this operation. e.g., the `a-z0-9` in 1021 /// `[^a-z0-9]` 1022 pub span: Span, 1023 /// The sequence of items that make up this union. 1024 pub items: Vec<ClassSetItem>, 1025 } 1026 1027 impl ClassSetUnion { 1028 /// Push a new item in this union. 1029 /// 1030 /// The ending position of this union's span is updated to the ending 1031 /// position of the span of the item given. If the union is empty, then 1032 /// the starting position of this union is set to the starting position 1033 /// of this item. 1034 /// 1035 /// In other words, if you only use this method to add items to a union 1036 /// and you set the spans on each item correctly, then you should never 1037 /// need to adjust the span of the union directly. push(&mut self, item: ClassSetItem)1038 pub fn push(&mut self, item: ClassSetItem) { 1039 if self.items.is_empty() { 1040 self.span.start = item.span().start; 1041 } 1042 self.span.end = item.span().end; 1043 self.items.push(item); 1044 } 1045 1046 /// Return this union as a character class set item. 1047 /// 1048 /// If this union contains zero items, then an empty union is 1049 /// returned. If this concatenation contains exactly 1 item, then the 1050 /// corresponding item is returned. Otherwise, ClassSetItem::Union is 1051 /// returned. into_item(mut self) -> ClassSetItem1052 pub fn into_item(mut self) -> ClassSetItem { 1053 match self.items.len() { 1054 0 => ClassSetItem::Empty(self.span), 1055 1 => self.items.pop().unwrap(), 1056 _ => ClassSetItem::Union(self), 1057 } 1058 } 1059 } 1060 1061 /// A Unicode character class set operation. 1062 #[derive(Clone, Debug, Eq, PartialEq)] 1063 pub struct ClassSetBinaryOp { 1064 /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. 1065 pub span: Span, 1066 /// The type of this set operation. 1067 pub kind: ClassSetBinaryOpKind, 1068 /// The left hand side of the operation. 1069 pub lhs: Box<ClassSet>, 1070 /// The right hand side of the operation. 1071 pub rhs: Box<ClassSet>, 1072 } 1073 1074 /// The type of a Unicode character class set operation. 1075 /// 1076 /// Note that this doesn't explicitly represent union since there is no 1077 /// explicit union operator. Concatenation inside a character class corresponds 1078 /// to the union operation. 1079 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 1080 pub enum ClassSetBinaryOpKind { 1081 /// The intersection of two sets, e.g., `\pN&&[a-z]`. 1082 Intersection, 1083 /// The difference of two sets, e.g., `\pN--[0-9]`. 1084 Difference, 1085 /// The symmetric difference of two sets. The symmetric difference is the 1086 /// set of elements belonging to one but not both sets. 1087 /// e.g., `[\pL~~[:ascii:]]`. 1088 SymmetricDifference, 1089 } 1090 1091 /// A single zero-width assertion. 1092 #[derive(Clone, Debug, Eq, PartialEq)] 1093 pub struct Assertion { 1094 /// The span of this assertion. 1095 pub span: Span, 1096 /// The assertion kind, e.g., `\b` or `^`. 1097 pub kind: AssertionKind, 1098 } 1099 1100 /// An assertion kind. 1101 #[derive(Clone, Debug, Eq, PartialEq)] 1102 pub enum AssertionKind { 1103 /// `^` 1104 StartLine, 1105 /// `$` 1106 EndLine, 1107 /// `\A` 1108 StartText, 1109 /// `\z` 1110 EndText, 1111 /// `\b` 1112 WordBoundary, 1113 /// `\B` 1114 NotWordBoundary, 1115 } 1116 1117 /// A repetition operation applied to a regular expression. 1118 #[derive(Clone, Debug, Eq, PartialEq)] 1119 pub struct Repetition { 1120 /// The span of this operation. 1121 pub span: Span, 1122 /// The actual operation. 1123 pub op: RepetitionOp, 1124 /// Whether this operation was applied greedily or not. 1125 pub greedy: bool, 1126 /// The regular expression under repetition. 1127 pub ast: Box<Ast>, 1128 } 1129 1130 /// The repetition operator itself. 1131 #[derive(Clone, Debug, Eq, PartialEq)] 1132 pub struct RepetitionOp { 1133 /// The span of this operator. This includes things like `+`, `*?` and 1134 /// `{m,n}`. 1135 pub span: Span, 1136 /// The type of operation. 1137 pub kind: RepetitionKind, 1138 } 1139 1140 /// The kind of a repetition operator. 1141 #[derive(Clone, Debug, Eq, PartialEq)] 1142 pub enum RepetitionKind { 1143 /// `?` 1144 ZeroOrOne, 1145 /// `*` 1146 ZeroOrMore, 1147 /// `+` 1148 OneOrMore, 1149 /// `{m,n}` 1150 Range(RepetitionRange), 1151 } 1152 1153 /// A range repetition operator. 1154 #[derive(Clone, Debug, Eq, PartialEq)] 1155 pub enum RepetitionRange { 1156 /// `{m}` 1157 Exactly(u32), 1158 /// `{m,}` 1159 AtLeast(u32), 1160 /// `{m,n}` 1161 Bounded(u32, u32), 1162 } 1163 1164 impl RepetitionRange { 1165 /// Returns true if and only if this repetition range is valid. 1166 /// 1167 /// The only case where a repetition range is invalid is if it is bounded 1168 /// and its start is greater than its end. is_valid(&self) -> bool1169 pub fn is_valid(&self) -> bool { 1170 match *self { 1171 RepetitionRange::Bounded(s, e) if s > e => false, 1172 _ => true, 1173 } 1174 } 1175 } 1176 1177 /// A grouped regular expression. 1178 /// 1179 /// This includes both capturing and non-capturing groups. This does **not** 1180 /// include flag-only groups like `(?is)`, but does contain any group that 1181 /// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and 1182 /// `(?is:a)`. 1183 #[derive(Clone, Debug, Eq, PartialEq)] 1184 pub struct Group { 1185 /// The span of this group. 1186 pub span: Span, 1187 /// The kind of this group. 1188 pub kind: GroupKind, 1189 /// The regular expression in this group. 1190 pub ast: Box<Ast>, 1191 } 1192 1193 impl Group { 1194 /// If this group is non-capturing, then this returns the (possibly empty) 1195 /// set of flags. Otherwise, `None` is returned. flags(&self) -> Option<&Flags>1196 pub fn flags(&self) -> Option<&Flags> { 1197 match self.kind { 1198 GroupKind::NonCapturing(ref flags) => Some(flags), 1199 _ => None, 1200 } 1201 } 1202 1203 /// Returns true if and only if this group is capturing. is_capturing(&self) -> bool1204 pub fn is_capturing(&self) -> bool { 1205 match self.kind { 1206 GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, 1207 GroupKind::NonCapturing(_) => false, 1208 } 1209 } 1210 1211 /// Returns the capture index of this group, if this is a capturing group. 1212 /// 1213 /// This returns a capture index precisely when `is_capturing` is `true`. capture_index(&self) -> Option<u32>1214 pub fn capture_index(&self) -> Option<u32> { 1215 match self.kind { 1216 GroupKind::CaptureIndex(i) => Some(i), 1217 GroupKind::CaptureName(ref x) => Some(x.index), 1218 GroupKind::NonCapturing(_) => None, 1219 } 1220 } 1221 } 1222 1223 /// The kind of a group. 1224 #[derive(Clone, Debug, Eq, PartialEq)] 1225 pub enum GroupKind { 1226 /// `(a)` 1227 CaptureIndex(u32), 1228 /// `(?P<name>a)` 1229 CaptureName(CaptureName), 1230 /// `(?:a)` and `(?i:a)` 1231 NonCapturing(Flags), 1232 } 1233 1234 /// A capture name. 1235 /// 1236 /// This corresponds to the name itself between the angle brackets in, e.g., 1237 /// `(?P<foo>expr)`. 1238 #[derive(Clone, Debug, Eq, PartialEq)] 1239 pub struct CaptureName { 1240 /// The span of this capture name. 1241 pub span: Span, 1242 /// The capture name. 1243 pub name: String, 1244 /// The capture index. 1245 pub index: u32, 1246 } 1247 1248 /// A group of flags that is not applied to a particular regular expression. 1249 #[derive(Clone, Debug, Eq, PartialEq)] 1250 pub struct SetFlags { 1251 /// The span of these flags, including the grouping parentheses. 1252 pub span: Span, 1253 /// The actual sequence of flags. 1254 pub flags: Flags, 1255 } 1256 1257 /// A group of flags. 1258 /// 1259 /// This corresponds only to the sequence of flags themselves, e.g., `is-u`. 1260 #[derive(Clone, Debug, Eq, PartialEq)] 1261 pub struct Flags { 1262 /// The span of this group of flags. 1263 pub span: Span, 1264 /// A sequence of flag items. Each item is either a flag or a negation 1265 /// operator. 1266 pub items: Vec<FlagsItem>, 1267 } 1268 1269 impl Flags { 1270 /// Add the given item to this sequence of flags. 1271 /// 1272 /// If the item was added successfully, then `None` is returned. If the 1273 /// given item is a duplicate, then `Some(i)` is returned, where 1274 /// `items[i].kind == item.kind`. add_item(&mut self, item: FlagsItem) -> Option<usize>1275 pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> { 1276 for (i, x) in self.items.iter().enumerate() { 1277 if x.kind == item.kind { 1278 return Some(i); 1279 } 1280 } 1281 self.items.push(item); 1282 None 1283 } 1284 1285 /// Returns the state of the given flag in this set. 1286 /// 1287 /// If the given flag is in the set but is negated, then `Some(false)` is 1288 /// returned. 1289 /// 1290 /// If the given flag is in the set and is not negated, then `Some(true)` 1291 /// is returned. 1292 /// 1293 /// Otherwise, `None` is returned. flag_state(&self, flag: Flag) -> Option<bool>1294 pub fn flag_state(&self, flag: Flag) -> Option<bool> { 1295 let mut negated = false; 1296 for x in &self.items { 1297 match x.kind { 1298 FlagsItemKind::Negation => { 1299 negated = true; 1300 } 1301 FlagsItemKind::Flag(ref xflag) if xflag == &flag => { 1302 return Some(!negated); 1303 } 1304 _ => {} 1305 } 1306 } 1307 None 1308 } 1309 } 1310 1311 /// A single item in a group of flags. 1312 #[derive(Clone, Debug, Eq, PartialEq)] 1313 pub struct FlagsItem { 1314 /// The span of this item. 1315 pub span: Span, 1316 /// The kind of this item. 1317 pub kind: FlagsItemKind, 1318 } 1319 1320 /// The kind of an item in a group of flags. 1321 #[derive(Clone, Debug, Eq, PartialEq)] 1322 pub enum FlagsItemKind { 1323 /// A negation operator applied to all subsequent flags in the enclosing 1324 /// group. 1325 Negation, 1326 /// A single flag in a group. 1327 Flag(Flag), 1328 } 1329 1330 impl FlagsItemKind { 1331 /// Returns true if and only if this item is a negation operator. is_negation(&self) -> bool1332 pub fn is_negation(&self) -> bool { 1333 match *self { 1334 FlagsItemKind::Negation => true, 1335 _ => false, 1336 } 1337 } 1338 } 1339 1340 /// A single flag. 1341 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 1342 pub enum Flag { 1343 /// `i` 1344 CaseInsensitive, 1345 /// `m` 1346 MultiLine, 1347 /// `s` 1348 DotMatchesNewLine, 1349 /// `U` 1350 SwapGreed, 1351 /// `u` 1352 Unicode, 1353 /// `x` 1354 IgnoreWhitespace, 1355 } 1356 1357 /// A custom `Drop` impl is used for `Ast` such that it uses constant stack 1358 /// space but heap space proportional to the depth of the `Ast`. 1359 impl Drop for Ast { drop(&mut self)1360 fn drop(&mut self) { 1361 use std::mem; 1362 1363 match *self { 1364 Ast::Empty(_) 1365 | Ast::Flags(_) 1366 | Ast::Literal(_) 1367 | Ast::Dot(_) 1368 | Ast::Assertion(_) 1369 // Classes are recursive, so they get their own Drop impl. 1370 | Ast::Class(_) => return, 1371 Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, 1372 Ast::Group(ref x) if !x.ast.has_subexprs() => return, 1373 Ast::Alternation(ref x) if x.asts.is_empty() => return, 1374 Ast::Concat(ref x) if x.asts.is_empty() => return, 1375 _ => {} 1376 } 1377 1378 let empty_span = || Span::splat(Position::new(0, 0, 0)); 1379 let empty_ast = || Ast::Empty(empty_span()); 1380 let mut stack = vec![mem::replace(self, empty_ast())]; 1381 while let Some(mut ast) = stack.pop() { 1382 match ast { 1383 Ast::Empty(_) 1384 | Ast::Flags(_) 1385 | Ast::Literal(_) 1386 | Ast::Dot(_) 1387 | Ast::Assertion(_) 1388 // Classes are recursive, so they get their own Drop impl. 1389 | Ast::Class(_) => {} 1390 Ast::Repetition(ref mut x) => { 1391 stack.push(mem::replace(&mut x.ast, empty_ast())); 1392 } 1393 Ast::Group(ref mut x) => { 1394 stack.push(mem::replace(&mut x.ast, empty_ast())); 1395 } 1396 Ast::Alternation(ref mut x) => { 1397 stack.extend(x.asts.drain(..)); 1398 } 1399 Ast::Concat(ref mut x) => { 1400 stack.extend(x.asts.drain(..)); 1401 } 1402 } 1403 } 1404 } 1405 } 1406 1407 /// A custom `Drop` impl is used for `ClassSet` such that it uses constant 1408 /// stack space but heap space proportional to the depth of the `ClassSet`. 1409 impl Drop for ClassSet { drop(&mut self)1410 fn drop(&mut self) { 1411 use std::mem; 1412 1413 match *self { 1414 ClassSet::Item(ref item) => match *item { 1415 ClassSetItem::Empty(_) 1416 | ClassSetItem::Literal(_) 1417 | ClassSetItem::Range(_) 1418 | ClassSetItem::Ascii(_) 1419 | ClassSetItem::Unicode(_) 1420 | ClassSetItem::Perl(_) => return, 1421 ClassSetItem::Bracketed(ref x) => { 1422 if x.kind.is_empty() { 1423 return; 1424 } 1425 } 1426 ClassSetItem::Union(ref x) => { 1427 if x.items.is_empty() { 1428 return; 1429 } 1430 } 1431 }, 1432 ClassSet::BinaryOp(ref op) => { 1433 if op.lhs.is_empty() && op.rhs.is_empty() { 1434 return; 1435 } 1436 } 1437 } 1438 1439 let empty_span = || Span::splat(Position::new(0, 0, 0)); 1440 let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); 1441 let mut stack = vec![mem::replace(self, empty_set())]; 1442 while let Some(mut set) = stack.pop() { 1443 match set { 1444 ClassSet::Item(ref mut item) => match *item { 1445 ClassSetItem::Empty(_) 1446 | ClassSetItem::Literal(_) 1447 | ClassSetItem::Range(_) 1448 | ClassSetItem::Ascii(_) 1449 | ClassSetItem::Unicode(_) 1450 | ClassSetItem::Perl(_) => {} 1451 ClassSetItem::Bracketed(ref mut x) => { 1452 stack.push(mem::replace(&mut x.kind, empty_set())); 1453 } 1454 ClassSetItem::Union(ref mut x) => { 1455 stack.extend(x.items.drain(..).map(ClassSet::Item)); 1456 } 1457 }, 1458 ClassSet::BinaryOp(ref mut op) => { 1459 stack.push(mem::replace(&mut op.lhs, empty_set())); 1460 stack.push(mem::replace(&mut op.rhs, empty_set())); 1461 } 1462 } 1463 } 1464 } 1465 } 1466 1467 #[cfg(test)] 1468 mod tests { 1469 use super::*; 1470 1471 // We use a thread with an explicit stack size to test that our destructor 1472 // for Ast can handle arbitrarily sized expressions in constant stack 1473 // space. In case we run on a platform without threads (WASM?), we limit 1474 // this test to Windows/Unix. 1475 #[test] 1476 #[cfg(any(unix, windows))] no_stack_overflow_on_drop()1477 fn no_stack_overflow_on_drop() { 1478 use std::thread; 1479 1480 let run = || { 1481 let span = || Span::splat(Position::new(0, 0, 0)); 1482 let mut ast = Ast::Empty(span()); 1483 for i in 0..200 { 1484 ast = Ast::Group(Group { 1485 span: span(), 1486 kind: GroupKind::CaptureIndex(i), 1487 ast: Box::new(ast), 1488 }); 1489 } 1490 assert!(!ast.is_empty()); 1491 }; 1492 1493 // We run our test on a thread with a small stack size so we can 1494 // force the issue more easily. 1495 // 1496 // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD) 1497 // will just barf with very small stack sizes. So we bump this up a bit 1498 // to give more room to breath. When I did this, I confirmed that if 1499 // I remove the custom `Drop` impl for `Ast`, then this test does 1500 // indeed still fail with a stack overflow. (At the time of writing, I 1501 // had to bump it all the way up to 32K before the test would pass even 1502 // without the custom `Drop` impl. So 16K seems like a safe number 1503 // here.) 1504 // 1505 // See: https://github.com/rust-lang/regex/issues/967 1506 thread::Builder::new() 1507 .stack_size(16 << 10) 1508 .spawn(run) 1509 .unwrap() 1510 .join() 1511 .unwrap(); 1512 } 1513 } 1514