1 /*!
2 This module provides a regular expression parser.
3 */
4
5 use std::borrow::Borrow;
6 use std::cell::{Cell, RefCell};
7 use std::mem;
8 use std::result;
9
10 use ast::{self, Ast, Position, Span};
11 use either::Either;
12
13 use is_meta_character;
14
15 type Result<T> = result::Result<T, ast::Error>;
16
17 /// A primitive is an expression with no sub-expressions. This includes
18 /// literals, assertions and non-set character classes. This representation
19 /// is used as intermediate state in the parser.
20 ///
21 /// This does not include ASCII character classes, since they can only appear
22 /// within a set character class.
23 #[derive(Clone, Debug, Eq, PartialEq)]
24 enum Primitive {
25 Literal(ast::Literal),
26 Assertion(ast::Assertion),
27 Dot(Span),
28 Perl(ast::ClassPerl),
29 Unicode(ast::ClassUnicode),
30 }
31
32 impl Primitive {
33 /// Return the span of this primitive.
span(&self) -> &Span34 fn span(&self) -> &Span {
35 match *self {
36 Primitive::Literal(ref x) => &x.span,
37 Primitive::Assertion(ref x) => &x.span,
38 Primitive::Dot(ref span) => span,
39 Primitive::Perl(ref x) => &x.span,
40 Primitive::Unicode(ref x) => &x.span,
41 }
42 }
43
44 /// Convert this primitive into a proper AST.
into_ast(self) -> Ast45 fn into_ast(self) -> Ast {
46 match self {
47 Primitive::Literal(lit) => Ast::Literal(lit),
48 Primitive::Assertion(assert) => Ast::Assertion(assert),
49 Primitive::Dot(span) => Ast::Dot(span),
50 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
51 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
52 }
53 }
54
55 /// Convert this primitive into an item in a character class.
56 ///
57 /// If this primitive is not a legal item (i.e., an assertion or a dot),
58 /// then return an error.
into_class_set_item<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::ClassSetItem>59 fn into_class_set_item<P: Borrow<Parser>>(
60 self,
61 p: &ParserI<P>,
62 ) -> Result<ast::ClassSetItem> {
63 use self::Primitive::*;
64 use ast::ClassSetItem;
65
66 match self {
67 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
68 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
69 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
70 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
71 }
72 }
73
74 /// Convert this primitive into a literal in a character class. In
75 /// particular, literals are the only valid items that can appear in
76 /// ranges.
77 ///
78 /// If this primitive is not a legal item (i.e., a class, assertion or a
79 /// dot), then return an error.
into_class_literal<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::Literal>80 fn into_class_literal<P: Borrow<Parser>>(
81 self,
82 p: &ParserI<P>,
83 ) -> Result<ast::Literal> {
84 use self::Primitive::*;
85
86 match self {
87 Literal(lit) => Ok(lit),
88 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
89 }
90 }
91 }
92
93 /// Returns true if the given character is a hexadecimal digit.
is_hex(c: char) -> bool94 fn is_hex(c: char) -> bool {
95 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
96 }
97
98 /// Returns true if the given character is a valid in a capture group name.
99 ///
100 /// If `first` is true, then `c` is treated as the first character in the
101 /// group name (which must be alphabetic or underscore).
is_capture_char(c: char, first: bool) -> bool102 fn is_capture_char(c: char, first: bool) -> bool {
103 c == '_'
104 || (!first
105 && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
106 || ('A' <= c && c <= 'Z')
107 || ('a' <= c && c <= 'z')
108 }
109
110 /// A builder for a regular expression parser.
111 ///
112 /// This builder permits modifying configuration options for the parser.
113 #[derive(Clone, Debug)]
114 pub struct ParserBuilder {
115 ignore_whitespace: bool,
116 nest_limit: u32,
117 octal: bool,
118 }
119
120 impl Default for ParserBuilder {
default() -> ParserBuilder121 fn default() -> ParserBuilder {
122 ParserBuilder::new()
123 }
124 }
125
126 impl ParserBuilder {
127 /// Create a new parser builder with a default configuration.
new() -> ParserBuilder128 pub fn new() -> ParserBuilder {
129 ParserBuilder {
130 ignore_whitespace: false,
131 nest_limit: 250,
132 octal: false,
133 }
134 }
135
136 /// Build a parser from this configuration with the given pattern.
build(&self) -> Parser137 pub fn build(&self) -> Parser {
138 Parser {
139 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
140 capture_index: Cell::new(0),
141 nest_limit: self.nest_limit,
142 octal: self.octal,
143 initial_ignore_whitespace: self.ignore_whitespace,
144 ignore_whitespace: Cell::new(self.ignore_whitespace),
145 comments: RefCell::new(vec![]),
146 stack_group: RefCell::new(vec![]),
147 stack_class: RefCell::new(vec![]),
148 capture_names: RefCell::new(vec![]),
149 scratch: RefCell::new(String::new()),
150 }
151 }
152
153 /// Set the nesting limit for this parser.
154 ///
155 /// The nesting limit controls how deep the abstract syntax tree is allowed
156 /// to be. If the AST exceeds the given limit (e.g., with too many nested
157 /// groups), then an error is returned by the parser.
158 ///
159 /// The purpose of this limit is to act as a heuristic to prevent stack
160 /// overflow for consumers that do structural induction on an `Ast` using
161 /// explicit recursion. While this crate never does this (instead using
162 /// constant stack space and moving the call stack to the heap), other
163 /// crates may.
164 ///
165 /// This limit is not checked until the entire Ast is parsed. Therefore,
166 /// if callers want to put a limit on the amount of heap space used, then
167 /// they should impose a limit on the length, in bytes, of the concrete
168 /// pattern string. In particular, this is viable since this parser
169 /// implementation will limit itself to heap space proportional to the
170 /// lenth of the pattern string.
171 ///
172 /// Note that a nest limit of `0` will return a nest limit error for most
173 /// patterns but not all. For example, a nest limit of `0` permits `a` but
174 /// not `ab`, since `ab` requires a concatenation, which results in a nest
175 /// depth of `1`. In general, a nest limit is not something that manifests
176 /// in an obvious way in the concrete syntax, therefore, it should not be
177 /// used in a granular way.
nest_limit(&mut self, limit: u32) -> &mut ParserBuilder178 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
179 self.nest_limit = limit;
180 self
181 }
182
183 /// Whether to support octal syntax or not.
184 ///
185 /// Octal syntax is a little-known way of uttering Unicode codepoints in
186 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
187 /// `\141` are all equivalent regular expressions, where the last example
188 /// shows octal syntax.
189 ///
190 /// While supporting octal syntax isn't in and of itself a problem, it does
191 /// make good error messages harder. That is, in PCRE based regex engines,
192 /// syntax like `\0` invokes a backreference, which is explicitly
193 /// unsupported in Rust's regex engine. However, many users expect it to
194 /// be supported. Therefore, when octal support is disabled, the error
195 /// message will explicitly mention that backreferences aren't supported.
196 ///
197 /// Octal syntax is disabled by default.
octal(&mut self, yes: bool) -> &mut ParserBuilder198 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
199 self.octal = yes;
200 self
201 }
202
203 /// Enable verbose mode in the regular expression.
204 ///
205 /// When enabled, verbose mode permits insigificant whitespace in many
206 /// places in the regular expression, as well as comments. Comments are
207 /// started using `#` and continue until the end of the line.
208 ///
209 /// By default, this is disabled. It may be selectively enabled in the
210 /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder211 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
212 self.ignore_whitespace = yes;
213 self
214 }
215 }
216
217 /// A regular expression parser.
218 ///
219 /// This parses a string representation of a regular expression into an
220 /// abstract syntax tree. The size of the tree is proportional to the length
221 /// of the regular expression pattern.
222 ///
223 /// A `Parser` can be configured in more detail via a
224 /// [`ParserBuilder`](struct.ParserBuilder.html).
225 #[derive(Clone, Debug)]
226 pub struct Parser {
227 /// The current position of the parser.
228 pos: Cell<Position>,
229 /// The current capture index.
230 capture_index: Cell<u32>,
231 /// The maximum number of open parens/brackets allowed. If the parser
232 /// exceeds this number, then an error is returned.
233 nest_limit: u32,
234 /// Whether to support octal syntax or not. When `false`, the parser will
235 /// return an error helpfully pointing out that backreferences are not
236 /// supported.
237 octal: bool,
238 /// The initial setting for `ignore_whitespace` as provided by
239 /// Th`ParserBuilder`. is is used when reseting the parser's state.
240 initial_ignore_whitespace: bool,
241 /// Whether whitespace should be ignored. When enabled, comments are
242 /// also permitted.
243 ignore_whitespace: Cell<bool>,
244 /// A list of comments, in order of appearance.
245 comments: RefCell<Vec<ast::Comment>>,
246 /// A stack of grouped sub-expressions, including alternations.
247 stack_group: RefCell<Vec<GroupState>>,
248 /// A stack of nested character classes. This is only non-empty when
249 /// parsing a class.
250 stack_class: RefCell<Vec<ClassState>>,
251 /// A sorted sequence of capture names. This is used to detect duplicate
252 /// capture names and report an error if one is detected.
253 capture_names: RefCell<Vec<ast::CaptureName>>,
254 /// A scratch buffer used in various places. Mostly this is used to
255 /// accumulate relevant characters from parts of a pattern.
256 scratch: RefCell<String>,
257 }
258
259 /// ParserI is the internal parser implementation.
260 ///
261 /// We use this separate type so that we can carry the provided pattern string
262 /// along with us. In particular, a `Parser` internal state is not tied to any
263 /// one pattern, but `ParserI` is.
264 ///
265 /// This type also lets us use `ParserI<&Parser>` in production code while
266 /// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
267 /// work against the internal interface of the parser.
268 #[derive(Clone, Debug)]
269 struct ParserI<'s, P> {
270 /// The parser state/configuration.
271 parser: P,
272 /// The full regular expression provided by the user.
273 pattern: &'s str,
274 }
275
276 /// GroupState represents a single stack frame while parsing nested groups
277 /// and alternations. Each frame records the state up to an opening parenthesis
278 /// or a alternating bracket `|`.
279 #[derive(Clone, Debug)]
280 enum GroupState {
281 /// This state is pushed whenever an opening group is found.
282 Group {
283 /// The concatenation immediately preceding the opening group.
284 concat: ast::Concat,
285 /// The group that has been opened. Its sub-AST is always empty.
286 group: ast::Group,
287 /// Whether this group has the `x` flag enabled or not.
288 ignore_whitespace: bool,
289 },
290 /// This state is pushed whenever a new alternation branch is found. If
291 /// an alternation branch is found and this state is at the top of the
292 /// stack, then this state should be modified to include the new
293 /// alternation.
294 Alternation(ast::Alternation),
295 }
296
297 /// ClassState represents a single stack frame while parsing character classes.
298 /// Each frame records the state up to an intersection, difference, symmetric
299 /// difference or nested class.
300 ///
301 /// Note that a parser's character class stack is only non-empty when parsing
302 /// a character class. In all other cases, it is empty.
303 #[derive(Clone, Debug)]
304 enum ClassState {
305 /// This state is pushed whenever an opening bracket is found.
306 Open {
307 /// The union of class items immediately preceding this class.
308 union: ast::ClassSetUnion,
309 /// The class that has been opened. Typically this just corresponds
310 /// to the `[`, but it can also include `[^` since `^` indicates
311 /// negation of the class.
312 set: ast::ClassBracketed,
313 },
314 /// This state is pushed when a operator is seen. When popped, the stored
315 /// set becomes the left hand side of the operator.
316 Op {
317 /// The type of the operation, i.e., &&, -- or ~~.
318 kind: ast::ClassSetBinaryOpKind,
319 /// The left-hand side of the operator.
320 lhs: ast::ClassSet,
321 },
322 }
323
324 impl Parser {
325 /// Create a new parser with a default configuration.
326 ///
327 /// The parser can be run with either the `parse` or `parse_with_comments`
328 /// methods. The parse methods return an abstract syntax tree.
329 ///
330 /// To set configuration options on the parser, use
331 /// [`ParserBuilder`](struct.ParserBuilder.html).
new() -> Parser332 pub fn new() -> Parser {
333 ParserBuilder::new().build()
334 }
335
336 /// Parse the regular expression into an abstract syntax tree.
parse(&mut self, pattern: &str) -> Result<Ast>337 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
338 ParserI::new(self, pattern).parse()
339 }
340
341 /// Parse the regular expression and return an abstract syntax tree with
342 /// all of the comments found in the pattern.
parse_with_comments( &mut self, pattern: &str, ) -> Result<ast::WithComments>343 pub fn parse_with_comments(
344 &mut self,
345 pattern: &str,
346 ) -> Result<ast::WithComments> {
347 ParserI::new(self, pattern).parse_with_comments()
348 }
349
350 /// Reset the internal state of a parser.
351 ///
352 /// This is called at the beginning of every parse. This prevents the
353 /// parser from running with inconsistent state (say, if a previous
354 /// invocation returned an error and the parser is reused).
reset(&self)355 fn reset(&self) {
356 // These settings should be in line with the construction
357 // in `ParserBuilder::build`.
358 self.pos.set(Position { offset: 0, line: 1, column: 1 });
359 self.ignore_whitespace.set(self.initial_ignore_whitespace);
360 self.comments.borrow_mut().clear();
361 self.stack_group.borrow_mut().clear();
362 self.stack_class.borrow_mut().clear();
363 }
364 }
365
366 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
367 /// Build an internal parser from a parser configuration and a pattern.
new(parser: P, pattern: &'s str) -> ParserI<'s, P>368 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
369 ParserI { parser: parser, pattern: pattern }
370 }
371
372 /// Return a reference to the parser state.
parser(&self) -> &Parser373 fn parser(&self) -> &Parser {
374 self.parser.borrow()
375 }
376
377 /// Return a reference to the pattern being parsed.
pattern(&self) -> &str378 fn pattern(&self) -> &str {
379 self.pattern.borrow()
380 }
381
382 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error383 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
384 ast::Error {
385 kind: kind,
386 pattern: self.pattern().to_string(),
387 span: span,
388 }
389 }
390
391 /// Return the current offset of the parser.
392 ///
393 /// The offset starts at `0` from the beginning of the regular expression
394 /// pattern string.
offset(&self) -> usize395 fn offset(&self) -> usize {
396 self.parser().pos.get().offset
397 }
398
399 /// Return the current line number of the parser.
400 ///
401 /// The line number starts at `1`.
line(&self) -> usize402 fn line(&self) -> usize {
403 self.parser().pos.get().line
404 }
405
406 /// Return the current column of the parser.
407 ///
408 /// The column number starts at `1` and is reset whenever a `\n` is seen.
column(&self) -> usize409 fn column(&self) -> usize {
410 self.parser().pos.get().column
411 }
412
413 /// Return the next capturing index. Each subsequent call increments the
414 /// internal index.
415 ///
416 /// The span given should correspond to the location of the opening
417 /// parenthesis.
418 ///
419 /// If the capture limit is exceeded, then an error is returned.
next_capture_index(&self, span: Span) -> Result<u32>420 fn next_capture_index(&self, span: Span) -> Result<u32> {
421 let current = self.parser().capture_index.get();
422 let i = current.checked_add(1).ok_or_else(|| {
423 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
424 })?;
425 self.parser().capture_index.set(i);
426 Ok(i)
427 }
428
429 /// Adds the given capture name to this parser. If this capture name has
430 /// already been used, then an error is returned.
add_capture_name(&self, cap: &ast::CaptureName) -> Result<()>431 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
432 let mut names = self.parser().capture_names.borrow_mut();
433 match names
434 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
435 {
436 Err(i) => {
437 names.insert(i, cap.clone());
438 Ok(())
439 }
440 Ok(i) => Err(self.error(
441 cap.span,
442 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
443 )),
444 }
445 }
446
447 /// Return whether the parser should ignore whitespace or not.
ignore_whitespace(&self) -> bool448 fn ignore_whitespace(&self) -> bool {
449 self.parser().ignore_whitespace.get()
450 }
451
452 /// Return the character at the current position of the parser.
453 ///
454 /// This panics if the current position does not point to a valid char.
char(&self) -> char455 fn char(&self) -> char {
456 self.char_at(self.offset())
457 }
458
459 /// Return the character at the given position.
460 ///
461 /// This panics if the given position does not point to a valid char.
char_at(&self, i: usize) -> char462 fn char_at(&self, i: usize) -> char {
463 self.pattern()[i..]
464 .chars()
465 .next()
466 .unwrap_or_else(|| panic!("expected char at offset {}", i))
467 }
468
469 /// Bump the parser to the next Unicode scalar value.
470 ///
471 /// If the end of the input has been reached, then `false` is returned.
bump(&self) -> bool472 fn bump(&self) -> bool {
473 if self.is_eof() {
474 return false;
475 }
476 let Position { mut offset, mut line, mut column } = self.pos();
477 if self.char() == '\n' {
478 line = line.checked_add(1).unwrap();
479 column = 1;
480 } else {
481 column = column.checked_add(1).unwrap();
482 }
483 offset += self.char().len_utf8();
484 self.parser().pos.set(Position {
485 offset: offset,
486 line: line,
487 column: column,
488 });
489 self.pattern()[self.offset()..].chars().next().is_some()
490 }
491
492 /// If the substring starting at the current position of the parser has
493 /// the given prefix, then bump the parser to the character immediately
494 /// following the prefix and return true. Otherwise, don't bump the parser
495 /// and return false.
bump_if(&self, prefix: &str) -> bool496 fn bump_if(&self, prefix: &str) -> bool {
497 if self.pattern()[self.offset()..].starts_with(prefix) {
498 for _ in 0..prefix.chars().count() {
499 self.bump();
500 }
501 true
502 } else {
503 false
504 }
505 }
506
507 /// Returns true if and only if the parser is positioned at a look-around
508 /// prefix. The conditions under which this returns true must always
509 /// correspond to a regular expression that would otherwise be consider
510 /// invalid.
511 ///
512 /// This should only be called immediately after parsing the opening of
513 /// a group or a set of flags.
is_lookaround_prefix(&self) -> bool514 fn is_lookaround_prefix(&self) -> bool {
515 self.bump_if("?=")
516 || self.bump_if("?!")
517 || self.bump_if("?<=")
518 || self.bump_if("?<!")
519 }
520
521 /// Bump the parser, and if the `x` flag is enabled, bump through any
522 /// subsequent spaces. Return true if and only if the parser is not at
523 /// EOF.
bump_and_bump_space(&self) -> bool524 fn bump_and_bump_space(&self) -> bool {
525 if !self.bump() {
526 return false;
527 }
528 self.bump_space();
529 !self.is_eof()
530 }
531
532 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
533 /// comments), then this will advance the parser through all whitespace
534 /// and comments to the next non-whitespace non-comment byte.
535 ///
536 /// If the `x` flag is disabled, then this is a no-op.
537 ///
538 /// This should be used selectively throughout the parser where
539 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
540 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
bump_space(&self)541 fn bump_space(&self) {
542 if !self.ignore_whitespace() {
543 return;
544 }
545 while !self.is_eof() {
546 if self.char().is_whitespace() {
547 self.bump();
548 } else if self.char() == '#' {
549 let start = self.pos();
550 let mut comment_text = String::new();
551 self.bump();
552 while !self.is_eof() {
553 let c = self.char();
554 self.bump();
555 if c == '\n' {
556 break;
557 }
558 comment_text.push(c);
559 }
560 let comment = ast::Comment {
561 span: Span::new(start, self.pos()),
562 comment: comment_text,
563 };
564 self.parser().comments.borrow_mut().push(comment);
565 } else {
566 break;
567 }
568 }
569 }
570
571 /// Peek at the next character in the input without advancing the parser.
572 ///
573 /// If the input has been exhausted, then this returns `None`.
peek(&self) -> Option<char>574 fn peek(&self) -> Option<char> {
575 if self.is_eof() {
576 return None;
577 }
578 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
579 }
580
581 /// Like peek, but will ignore spaces when the parser is in whitespace
582 /// insensitive mode.
peek_space(&self) -> Option<char>583 fn peek_space(&self) -> Option<char> {
584 if !self.ignore_whitespace() {
585 return self.peek();
586 }
587 if self.is_eof() {
588 return None;
589 }
590 let mut start = self.offset() + self.char().len_utf8();
591 let mut in_comment = false;
592 for (i, c) in self.pattern()[start..].char_indices() {
593 if c.is_whitespace() {
594 continue;
595 } else if !in_comment && c == '#' {
596 in_comment = true;
597 } else if in_comment && c == '\n' {
598 in_comment = false;
599 } else {
600 start += i;
601 break;
602 }
603 }
604 self.pattern()[start..].chars().next()
605 }
606
607 /// Returns true if the next call to `bump` would return false.
is_eof(&self) -> bool608 fn is_eof(&self) -> bool {
609 self.offset() == self.pattern().len()
610 }
611
612 /// Return the current position of the parser, which includes the offset,
613 /// line and column.
pos(&self) -> Position614 fn pos(&self) -> Position {
615 self.parser().pos.get()
616 }
617
618 /// Create a span at the current position of the parser. Both the start
619 /// and end of the span are set.
span(&self) -> Span620 fn span(&self) -> Span {
621 Span::splat(self.pos())
622 }
623
624 /// Create a span that covers the current character.
span_char(&self) -> Span625 fn span_char(&self) -> Span {
626 let mut next = Position {
627 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
628 line: self.line(),
629 column: self.column().checked_add(1).unwrap(),
630 };
631 if self.char() == '\n' {
632 next.line += 1;
633 next.column = 1;
634 }
635 Span::new(self.pos(), next)
636 }
637
638 /// Parse and push a single alternation on to the parser's internal stack.
639 /// If the top of the stack already has an alternation, then add to that
640 /// instead of pushing a new one.
641 ///
642 /// The concatenation given corresponds to a single alternation branch.
643 /// The concatenation returned starts the next branch and is empty.
644 ///
645 /// This assumes the parser is currently positioned at `|` and will advance
646 /// the parser to the character following `|`.
647 #[inline(never)]
push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat>648 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
649 assert_eq!(self.char(), '|');
650 concat.span.end = self.pos();
651 self.push_or_add_alternation(concat);
652 self.bump();
653 Ok(ast::Concat { span: self.span(), asts: vec![] })
654 }
655
656 /// Pushes or adds the given branch of an alternation to the parser's
657 /// internal stack of state.
push_or_add_alternation(&self, concat: ast::Concat)658 fn push_or_add_alternation(&self, concat: ast::Concat) {
659 use self::GroupState::*;
660
661 let mut stack = self.parser().stack_group.borrow_mut();
662 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
663 alts.asts.push(concat.into_ast());
664 return;
665 }
666 stack.push(Alternation(ast::Alternation {
667 span: Span::new(concat.span.start, self.pos()),
668 asts: vec![concat.into_ast()],
669 }));
670 }
671
672 /// Parse and push a group AST (and its parent concatenation) on to the
673 /// parser's internal stack. Return a fresh concatenation corresponding
674 /// to the group's sub-AST.
675 ///
676 /// If a set of flags was found (with no group), then the concatenation
677 /// is returned with that set of flags added.
678 ///
679 /// This assumes that the parser is currently positioned on the opening
680 /// parenthesis. It advances the parser to the character at the start
681 /// of the sub-expression (or adjoining expression).
682 ///
683 /// If there was a problem parsing the start of the group, then an error
684 /// is returned.
685 #[inline(never)]
push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat>686 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
687 assert_eq!(self.char(), '(');
688 match self.parse_group()? {
689 Either::Left(set) => {
690 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
691 if let Some(v) = ignore {
692 self.parser().ignore_whitespace.set(v);
693 }
694
695 concat.asts.push(Ast::Flags(set));
696 Ok(concat)
697 }
698 Either::Right(group) => {
699 let old_ignore_whitespace = self.ignore_whitespace();
700 let new_ignore_whitespace = group
701 .flags()
702 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
703 .unwrap_or(old_ignore_whitespace);
704 self.parser().stack_group.borrow_mut().push(
705 GroupState::Group {
706 concat: concat,
707 group: group,
708 ignore_whitespace: old_ignore_whitespace,
709 },
710 );
711 self.parser().ignore_whitespace.set(new_ignore_whitespace);
712 Ok(ast::Concat { span: self.span(), asts: vec![] })
713 }
714 }
715 }
716
717 /// Pop a group AST from the parser's internal stack and set the group's
718 /// AST to the given concatenation. Return the concatenation containing
719 /// the group.
720 ///
721 /// This assumes that the parser is currently positioned on the closing
722 /// parenthesis and advances the parser to the character following the `)`.
723 ///
724 /// If no such group could be popped, then an unopened group error is
725 /// returned.
726 #[inline(never)]
pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat>727 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
728 use self::GroupState::*;
729
730 assert_eq!(self.char(), ')');
731 let mut stack = self.parser().stack_group.borrow_mut();
732 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
733 .pop()
734 {
735 Some(Group { concat, group, ignore_whitespace }) => {
736 (concat, group, ignore_whitespace, None)
737 }
738 Some(Alternation(alt)) => match stack.pop() {
739 Some(Group { concat, group, ignore_whitespace }) => {
740 (concat, group, ignore_whitespace, Some(alt))
741 }
742 None | Some(Alternation(_)) => {
743 return Err(self.error(
744 self.span_char(),
745 ast::ErrorKind::GroupUnopened,
746 ));
747 }
748 },
749 None => {
750 return Err(self
751 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
752 }
753 };
754 self.parser().ignore_whitespace.set(ignore_whitespace);
755 group_concat.span.end = self.pos();
756 self.bump();
757 group.span.end = self.pos();
758 match alt {
759 Some(mut alt) => {
760 alt.span.end = group_concat.span.end;
761 alt.asts.push(group_concat.into_ast());
762 group.ast = Box::new(alt.into_ast());
763 }
764 None => {
765 group.ast = Box::new(group_concat.into_ast());
766 }
767 }
768 prior_concat.asts.push(Ast::Group(group));
769 Ok(prior_concat)
770 }
771
772 /// Pop the last state from the parser's internal stack, if it exists, and
773 /// add the given concatenation to it. There either must be no state or a
774 /// single alternation item on the stack. Any other scenario produces an
775 /// error.
776 ///
777 /// This assumes that the parser has advanced to the end.
778 #[inline(never)]
pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast>779 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
780 concat.span.end = self.pos();
781 let mut stack = self.parser().stack_group.borrow_mut();
782 let ast = match stack.pop() {
783 None => Ok(concat.into_ast()),
784 Some(GroupState::Alternation(mut alt)) => {
785 alt.span.end = self.pos();
786 alt.asts.push(concat.into_ast());
787 Ok(Ast::Alternation(alt))
788 }
789 Some(GroupState::Group { group, .. }) => {
790 return Err(
791 self.error(group.span, ast::ErrorKind::GroupUnclosed)
792 );
793 }
794 };
795 // If we try to pop again, there should be nothing.
796 match stack.pop() {
797 None => ast,
798 Some(GroupState::Alternation(_)) => {
799 // This unreachable is unfortunate. This case can't happen
800 // because the only way we can be here is if there were two
801 // `GroupState::Alternation`s adjacent in the parser's stack,
802 // which we guarantee to never happen because we never push a
803 // `GroupState::Alternation` if one is already at the top of
804 // the stack.
805 unreachable!()
806 }
807 Some(GroupState::Group { group, .. }) => {
808 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
809 }
810 }
811 }
812
813 /// Parse the opening of a character class and push the current class
814 /// parsing context onto the parser's stack. This assumes that the parser
815 /// is positioned at an opening `[`. The given union should correspond to
816 /// the union of set items built up before seeing the `[`.
817 ///
818 /// If there was a problem parsing the opening of the class, then an error
819 /// is returned. Otherwise, a new union of set items for the class is
820 /// returned (which may be populated with either a `]` or a `-`).
821 #[inline(never)]
push_class_open( &self, parent_union: ast::ClassSetUnion, ) -> Result<ast::ClassSetUnion>822 fn push_class_open(
823 &self,
824 parent_union: ast::ClassSetUnion,
825 ) -> Result<ast::ClassSetUnion> {
826 assert_eq!(self.char(), '[');
827
828 let (nested_set, nested_union) = self.parse_set_class_open()?;
829 self.parser()
830 .stack_class
831 .borrow_mut()
832 .push(ClassState::Open { union: parent_union, set: nested_set });
833 Ok(nested_union)
834 }
835
836 /// Parse the end of a character class set and pop the character class
837 /// parser stack. The union given corresponds to the last union built
838 /// before seeing the closing `]`. The union returned corresponds to the
839 /// parent character class set with the nested class added to it.
840 ///
841 /// This assumes that the parser is positioned at a `]` and will advance
842 /// the parser to the byte immediately following the `]`.
843 ///
844 /// If the stack is empty after popping, then this returns the final
845 /// "top-level" character class AST (where a "top-level" character class
846 /// is one that is not nested inside any other character class).
847 ///
848 /// If there is no corresponding opening bracket on the parser's stack,
849 /// then an error is returned.
850 #[inline(never)]
pop_class( &self, nested_union: ast::ClassSetUnion, ) -> Result<Either<ast::ClassSetUnion, ast::Class>>851 fn pop_class(
852 &self,
853 nested_union: ast::ClassSetUnion,
854 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
855 assert_eq!(self.char(), ']');
856
857 let item = ast::ClassSet::Item(nested_union.into_item());
858 let prevset = self.pop_class_op(item);
859 let mut stack = self.parser().stack_class.borrow_mut();
860 match stack.pop() {
861 None => {
862 // We can never observe an empty stack:
863 //
864 // 1) We are guaranteed to start with a non-empty stack since
865 // the character class parser is only initiated when it sees
866 // a `[`.
867 // 2) If we ever observe an empty stack while popping after
868 // seeing a `]`, then we signal the character class parser
869 // to terminate.
870 panic!("unexpected empty character class stack")
871 }
872 Some(ClassState::Op { .. }) => {
873 // This panic is unfortunate, but this case is impossible
874 // since we already popped the Op state if one exists above.
875 // Namely, every push to the class parser stack is guarded by
876 // whether an existing Op is already on the top of the stack.
877 // If it is, the existing Op is modified. That is, the stack
878 // can never have consecutive Op states.
879 panic!("unexpected ClassState::Op")
880 }
881 Some(ClassState::Open { mut union, mut set }) => {
882 self.bump();
883 set.span.end = self.pos();
884 set.kind = prevset;
885 if stack.is_empty() {
886 Ok(Either::Right(ast::Class::Bracketed(set)))
887 } else {
888 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
889 Ok(Either::Left(union))
890 }
891 }
892 }
893 }
894
895 /// Return an "unclosed class" error whose span points to the most
896 /// recently opened class.
897 ///
898 /// This should only be called while parsing a character class.
899 #[inline(never)]
unclosed_class_error(&self) -> ast::Error900 fn unclosed_class_error(&self) -> ast::Error {
901 for state in self.parser().stack_class.borrow().iter().rev() {
902 match *state {
903 ClassState::Open { ref set, .. } => {
904 return self
905 .error(set.span, ast::ErrorKind::ClassUnclosed);
906 }
907 _ => {}
908 }
909 }
910 // We are guaranteed to have a non-empty stack with at least
911 // one open bracket, so we should never get here.
912 panic!("no open character class found")
913 }
914
915 /// Push the current set of class items on to the class parser's stack as
916 /// the left hand side of the given operator.
917 ///
918 /// A fresh set union is returned, which should be used to build the right
919 /// hand side of this operator.
920 #[inline(never)]
push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, next_union: ast::ClassSetUnion, ) -> ast::ClassSetUnion921 fn push_class_op(
922 &self,
923 next_kind: ast::ClassSetBinaryOpKind,
924 next_union: ast::ClassSetUnion,
925 ) -> ast::ClassSetUnion {
926 let item = ast::ClassSet::Item(next_union.into_item());
927 let new_lhs = self.pop_class_op(item);
928 self.parser()
929 .stack_class
930 .borrow_mut()
931 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
932 ast::ClassSetUnion { span: self.span(), items: vec![] }
933 }
934
935 /// Pop a character class set from the character class parser stack. If the
936 /// top of the stack is just an item (not an operation), then return the
937 /// given set unchanged. If the top of the stack is an operation, then the
938 /// given set will be used as the rhs of the operation on the top of the
939 /// stack. In that case, the binary operation is returned as a set.
940 #[inline(never)]
pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet941 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
942 let mut stack = self.parser().stack_class.borrow_mut();
943 let (kind, lhs) = match stack.pop() {
944 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
945 Some(state @ ClassState::Open { .. }) => {
946 stack.push(state);
947 return rhs;
948 }
949 None => unreachable!(),
950 };
951 let span = Span::new(lhs.span().start, rhs.span().end);
952 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
953 span: span,
954 kind: kind,
955 lhs: Box::new(lhs),
956 rhs: Box::new(rhs),
957 })
958 }
959 }
960
961 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
962 /// Parse the regular expression into an abstract syntax tree.
parse(&self) -> Result<Ast>963 fn parse(&self) -> Result<Ast> {
964 self.parse_with_comments().map(|astc| astc.ast)
965 }
966
967 /// Parse the regular expression and return an abstract syntax tree with
968 /// all of the comments found in the pattern.
parse_with_comments(&self) -> Result<ast::WithComments>969 fn parse_with_comments(&self) -> Result<ast::WithComments> {
970 assert_eq!(self.offset(), 0, "parser can only be used once");
971 self.parser().reset();
972 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
973 loop {
974 self.bump_space();
975 if self.is_eof() {
976 break;
977 }
978 match self.char() {
979 '(' => concat = self.push_group(concat)?,
980 ')' => concat = self.pop_group(concat)?,
981 '|' => concat = self.push_alternate(concat)?,
982 '[' => {
983 let class = self.parse_set_class()?;
984 concat.asts.push(Ast::Class(class));
985 }
986 '?' => {
987 concat = self.parse_uncounted_repetition(
988 concat,
989 ast::RepetitionKind::ZeroOrOne,
990 )?;
991 }
992 '*' => {
993 concat = self.parse_uncounted_repetition(
994 concat,
995 ast::RepetitionKind::ZeroOrMore,
996 )?;
997 }
998 '+' => {
999 concat = self.parse_uncounted_repetition(
1000 concat,
1001 ast::RepetitionKind::OneOrMore,
1002 )?;
1003 }
1004 '{' => {
1005 concat = self.parse_counted_repetition(concat)?;
1006 }
1007 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1008 }
1009 }
1010 let ast = self.pop_group_end(concat)?;
1011 NestLimiter::new(self).check(&ast)?;
1012 Ok(ast::WithComments {
1013 ast: ast,
1014 comments: mem::replace(
1015 &mut *self.parser().comments.borrow_mut(),
1016 vec![],
1017 ),
1018 })
1019 }
1020
1021 /// Parses an uncounted repetition operation. An uncounted repetition
1022 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1023 /// The given `kind` should correspond to the operator observed by the
1024 /// caller.
1025 ///
1026 /// This assumes that the paser is currently positioned at the repetition
1027 /// operator and advances the parser to the first character after the
1028 /// operator. (Note that the operator may include a single additional `?`,
1029 /// which makes the operator ungreedy.)
1030 ///
1031 /// The caller should include the concatenation that is being built. The
1032 /// concatenation returned includes the repetition operator applied to the
1033 /// last expression in the given concatenation.
1034 #[inline(never)]
parse_uncounted_repetition( &self, mut concat: ast::Concat, kind: ast::RepetitionKind, ) -> Result<ast::Concat>1035 fn parse_uncounted_repetition(
1036 &self,
1037 mut concat: ast::Concat,
1038 kind: ast::RepetitionKind,
1039 ) -> Result<ast::Concat> {
1040 assert!(
1041 self.char() == '?' || self.char() == '*' || self.char() == '+'
1042 );
1043 let op_start = self.pos();
1044 let ast = match concat.asts.pop() {
1045 Some(ast) => ast,
1046 None => {
1047 return Err(
1048 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1049 )
1050 }
1051 };
1052 match ast {
1053 Ast::Empty(_) | Ast::Flags(_) => {
1054 return Err(
1055 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1056 )
1057 }
1058 _ => {}
1059 }
1060 let mut greedy = true;
1061 if self.bump() && self.char() == '?' {
1062 greedy = false;
1063 self.bump();
1064 }
1065 concat.asts.push(Ast::Repetition(ast::Repetition {
1066 span: ast.span().with_end(self.pos()),
1067 op: ast::RepetitionOp {
1068 span: Span::new(op_start, self.pos()),
1069 kind: kind,
1070 },
1071 greedy: greedy,
1072 ast: Box::new(ast),
1073 }));
1074 Ok(concat)
1075 }
1076
1077 /// Parses a counted repetition operation. A counted repetition operator
1078 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1079 /// operators.
1080 ///
1081 /// This assumes that the paser is currently positioned at the opening `{`
1082 /// and advances the parser to the first character after the operator.
1083 /// (Note that the operator may include a single additional `?`, which
1084 /// makes the operator ungreedy.)
1085 ///
1086 /// The caller should include the concatenation that is being built. The
1087 /// concatenation returned includes the repetition operator applied to the
1088 /// last expression in the given concatenation.
1089 #[inline(never)]
parse_counted_repetition( &self, mut concat: ast::Concat, ) -> Result<ast::Concat>1090 fn parse_counted_repetition(
1091 &self,
1092 mut concat: ast::Concat,
1093 ) -> Result<ast::Concat> {
1094 assert!(self.char() == '{');
1095 let start = self.pos();
1096 let ast = match concat.asts.pop() {
1097 Some(ast) => ast,
1098 None => {
1099 return Err(
1100 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1101 )
1102 }
1103 };
1104 match ast {
1105 Ast::Empty(_) | Ast::Flags(_) => {
1106 return Err(
1107 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1108 )
1109 }
1110 _ => {}
1111 }
1112 if !self.bump_and_bump_space() {
1113 return Err(self.error(
1114 Span::new(start, self.pos()),
1115 ast::ErrorKind::RepetitionCountUnclosed,
1116 ));
1117 }
1118 let count_start = specialize_err(
1119 self.parse_decimal(),
1120 ast::ErrorKind::DecimalEmpty,
1121 ast::ErrorKind::RepetitionCountDecimalEmpty,
1122 )?;
1123 let mut range = ast::RepetitionRange::Exactly(count_start);
1124 if self.is_eof() {
1125 return Err(self.error(
1126 Span::new(start, self.pos()),
1127 ast::ErrorKind::RepetitionCountUnclosed,
1128 ));
1129 }
1130 if self.char() == ',' {
1131 if !self.bump_and_bump_space() {
1132 return Err(self.error(
1133 Span::new(start, self.pos()),
1134 ast::ErrorKind::RepetitionCountUnclosed,
1135 ));
1136 }
1137 if self.char() != '}' {
1138 let count_end = specialize_err(
1139 self.parse_decimal(),
1140 ast::ErrorKind::DecimalEmpty,
1141 ast::ErrorKind::RepetitionCountDecimalEmpty,
1142 )?;
1143 range = ast::RepetitionRange::Bounded(count_start, count_end);
1144 } else {
1145 range = ast::RepetitionRange::AtLeast(count_start);
1146 }
1147 }
1148 if self.is_eof() || self.char() != '}' {
1149 return Err(self.error(
1150 Span::new(start, self.pos()),
1151 ast::ErrorKind::RepetitionCountUnclosed,
1152 ));
1153 }
1154
1155 let mut greedy = true;
1156 if self.bump_and_bump_space() && self.char() == '?' {
1157 greedy = false;
1158 self.bump();
1159 }
1160
1161 let op_span = Span::new(start, self.pos());
1162 if !range.is_valid() {
1163 return Err(
1164 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1165 );
1166 }
1167 concat.asts.push(Ast::Repetition(ast::Repetition {
1168 span: ast.span().with_end(self.pos()),
1169 op: ast::RepetitionOp {
1170 span: op_span,
1171 kind: ast::RepetitionKind::Range(range),
1172 },
1173 greedy: greedy,
1174 ast: Box::new(ast),
1175 }));
1176 Ok(concat)
1177 }
1178
1179 /// Parse a group (which contains a sub-expression) or a set of flags.
1180 ///
1181 /// If a group was found, then it is returned with an empty AST. If a set
1182 /// of flags is found, then that set is returned.
1183 ///
1184 /// The parser should be positioned at the opening parenthesis.
1185 ///
1186 /// This advances the parser to the character before the start of the
1187 /// sub-expression (in the case of a group) or to the closing parenthesis
1188 /// immediately following the set of flags.
1189 ///
1190 /// # Errors
1191 ///
1192 /// If flags are given and incorrectly specified, then a corresponding
1193 /// error is returned.
1194 ///
1195 /// If a capture name is given and it is incorrectly specified, then a
1196 /// corresponding error is returned.
1197 #[inline(never)]
parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>>1198 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1199 assert_eq!(self.char(), '(');
1200 let open_span = self.span_char();
1201 self.bump();
1202 self.bump_space();
1203 if self.is_lookaround_prefix() {
1204 return Err(self.error(
1205 Span::new(open_span.start, self.span().end),
1206 ast::ErrorKind::UnsupportedLookAround,
1207 ));
1208 }
1209 let inner_span = self.span();
1210 if self.bump_if("?P<") {
1211 let capture_index = self.next_capture_index(open_span)?;
1212 let cap = self.parse_capture_name(capture_index)?;
1213 Ok(Either::Right(ast::Group {
1214 span: open_span,
1215 kind: ast::GroupKind::CaptureName(cap),
1216 ast: Box::new(Ast::Empty(self.span())),
1217 }))
1218 } else if self.bump_if("?") {
1219 if self.is_eof() {
1220 return Err(
1221 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1222 );
1223 }
1224 let flags = self.parse_flags()?;
1225 let char_end = self.char();
1226 self.bump();
1227 if char_end == ')' {
1228 // We don't allow empty flags, e.g., `(?)`. We instead
1229 // interpret it as a repetition operator missing its argument.
1230 if flags.items.is_empty() {
1231 return Err(self.error(
1232 inner_span,
1233 ast::ErrorKind::RepetitionMissing,
1234 ));
1235 }
1236 Ok(Either::Left(ast::SetFlags {
1237 span: Span { end: self.pos(), ..open_span },
1238 flags: flags,
1239 }))
1240 } else {
1241 assert_eq!(char_end, ':');
1242 Ok(Either::Right(ast::Group {
1243 span: open_span,
1244 kind: ast::GroupKind::NonCapturing(flags),
1245 ast: Box::new(Ast::Empty(self.span())),
1246 }))
1247 }
1248 } else {
1249 let capture_index = self.next_capture_index(open_span)?;
1250 Ok(Either::Right(ast::Group {
1251 span: open_span,
1252 kind: ast::GroupKind::CaptureIndex(capture_index),
1253 ast: Box::new(Ast::Empty(self.span())),
1254 }))
1255 }
1256 }
1257
1258 /// Parses a capture group name. Assumes that the parser is positioned at
1259 /// the first character in the name following the opening `<` (and may
1260 /// possibly be EOF). This advances the parser to the first character
1261 /// following the closing `>`.
1262 ///
1263 /// The caller must provide the capture index of the group for this name.
1264 #[inline(never)]
parse_capture_name( &self, capture_index: u32, ) -> Result<ast::CaptureName>1265 fn parse_capture_name(
1266 &self,
1267 capture_index: u32,
1268 ) -> Result<ast::CaptureName> {
1269 if self.is_eof() {
1270 return Err(self
1271 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1272 }
1273 let start = self.pos();
1274 loop {
1275 if self.char() == '>' {
1276 break;
1277 }
1278 if !is_capture_char(self.char(), self.pos() == start) {
1279 return Err(self.error(
1280 self.span_char(),
1281 ast::ErrorKind::GroupNameInvalid,
1282 ));
1283 }
1284 if !self.bump() {
1285 break;
1286 }
1287 }
1288 let end = self.pos();
1289 if self.is_eof() {
1290 return Err(self
1291 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1292 }
1293 assert_eq!(self.char(), '>');
1294 self.bump();
1295 let name = &self.pattern()[start.offset..end.offset];
1296 if name.is_empty() {
1297 return Err(self.error(
1298 Span::new(start, start),
1299 ast::ErrorKind::GroupNameEmpty,
1300 ));
1301 }
1302 let capname = ast::CaptureName {
1303 span: Span::new(start, end),
1304 name: name.to_string(),
1305 index: capture_index,
1306 };
1307 self.add_capture_name(&capname)?;
1308 Ok(capname)
1309 }
1310
1311 /// Parse a sequence of flags starting at the current character.
1312 ///
1313 /// This advances the parser to the character immediately following the
1314 /// flags, which is guaranteed to be either `:` or `)`.
1315 ///
1316 /// # Errors
1317 ///
1318 /// If any flags are duplicated, then an error is returned.
1319 ///
1320 /// If the negation operator is used more than once, then an error is
1321 /// returned.
1322 ///
1323 /// If no flags could be found or if the negation operation is not followed
1324 /// by any flags, then an error is returned.
1325 #[inline(never)]
parse_flags(&self) -> Result<ast::Flags>1326 fn parse_flags(&self) -> Result<ast::Flags> {
1327 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1328 let mut last_was_negation = None;
1329 while self.char() != ':' && self.char() != ')' {
1330 if self.char() == '-' {
1331 last_was_negation = Some(self.span_char());
1332 let item = ast::FlagsItem {
1333 span: self.span_char(),
1334 kind: ast::FlagsItemKind::Negation,
1335 };
1336 if let Some(i) = flags.add_item(item) {
1337 return Err(self.error(
1338 self.span_char(),
1339 ast::ErrorKind::FlagRepeatedNegation {
1340 original: flags.items[i].span,
1341 },
1342 ));
1343 }
1344 } else {
1345 last_was_negation = None;
1346 let item = ast::FlagsItem {
1347 span: self.span_char(),
1348 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1349 };
1350 if let Some(i) = flags.add_item(item) {
1351 return Err(self.error(
1352 self.span_char(),
1353 ast::ErrorKind::FlagDuplicate {
1354 original: flags.items[i].span,
1355 },
1356 ));
1357 }
1358 }
1359 if !self.bump() {
1360 return Err(
1361 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1362 );
1363 }
1364 }
1365 if let Some(span) = last_was_negation {
1366 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1367 }
1368 flags.span.end = self.pos();
1369 Ok(flags)
1370 }
1371
1372 /// Parse the current character as a flag. Do not advance the parser.
1373 ///
1374 /// # Errors
1375 ///
1376 /// If the flag is not recognized, then an error is returned.
1377 #[inline(never)]
parse_flag(&self) -> Result<ast::Flag>1378 fn parse_flag(&self) -> Result<ast::Flag> {
1379 match self.char() {
1380 'i' => Ok(ast::Flag::CaseInsensitive),
1381 'm' => Ok(ast::Flag::MultiLine),
1382 's' => Ok(ast::Flag::DotMatchesNewLine),
1383 'U' => Ok(ast::Flag::SwapGreed),
1384 'u' => Ok(ast::Flag::Unicode),
1385 'x' => Ok(ast::Flag::IgnoreWhitespace),
1386 _ => {
1387 Err(self
1388 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1389 }
1390 }
1391 }
1392
1393 /// Parse a primitive AST. e.g., A literal, non-set character class or
1394 /// assertion.
1395 ///
1396 /// This assumes that the parser expects a primitive at the current
1397 /// location. i.e., All other non-primitive cases have been handled.
1398 /// For example, if the parser's position is at `|`, then `|` will be
1399 /// treated as a literal (e.g., inside a character class).
1400 ///
1401 /// This advances the parser to the first character immediately following
1402 /// the primitive.
parse_primitive(&self) -> Result<Primitive>1403 fn parse_primitive(&self) -> Result<Primitive> {
1404 match self.char() {
1405 '\\' => self.parse_escape(),
1406 '.' => {
1407 let ast = Primitive::Dot(self.span_char());
1408 self.bump();
1409 Ok(ast)
1410 }
1411 '^' => {
1412 let ast = Primitive::Assertion(ast::Assertion {
1413 span: self.span_char(),
1414 kind: ast::AssertionKind::StartLine,
1415 });
1416 self.bump();
1417 Ok(ast)
1418 }
1419 '$' => {
1420 let ast = Primitive::Assertion(ast::Assertion {
1421 span: self.span_char(),
1422 kind: ast::AssertionKind::EndLine,
1423 });
1424 self.bump();
1425 Ok(ast)
1426 }
1427 c => {
1428 let ast = Primitive::Literal(ast::Literal {
1429 span: self.span_char(),
1430 kind: ast::LiteralKind::Verbatim,
1431 c: c,
1432 });
1433 self.bump();
1434 Ok(ast)
1435 }
1436 }
1437 }
1438
1439 /// Parse an escape sequence as a primitive AST.
1440 ///
1441 /// This assumes the parser is positioned at the start of the escape
1442 /// sequence, i.e., `\`. It advances the parser to the first position
1443 /// immediately following the escape sequence.
1444 #[inline(never)]
parse_escape(&self) -> Result<Primitive>1445 fn parse_escape(&self) -> Result<Primitive> {
1446 assert_eq!(self.char(), '\\');
1447 let start = self.pos();
1448 if !self.bump() {
1449 return Err(self.error(
1450 Span::new(start, self.pos()),
1451 ast::ErrorKind::EscapeUnexpectedEof,
1452 ));
1453 }
1454 let c = self.char();
1455 // Put some of the more complicated routines into helpers.
1456 match c {
1457 '0'..='7' => {
1458 if !self.parser().octal {
1459 return Err(self.error(
1460 Span::new(start, self.span_char().end),
1461 ast::ErrorKind::UnsupportedBackreference,
1462 ));
1463 }
1464 let mut lit = self.parse_octal();
1465 lit.span.start = start;
1466 return Ok(Primitive::Literal(lit));
1467 }
1468 '8'..='9' if !self.parser().octal => {
1469 return Err(self.error(
1470 Span::new(start, self.span_char().end),
1471 ast::ErrorKind::UnsupportedBackreference,
1472 ));
1473 }
1474 'x' | 'u' | 'U' => {
1475 let mut lit = self.parse_hex()?;
1476 lit.span.start = start;
1477 return Ok(Primitive::Literal(lit));
1478 }
1479 'p' | 'P' => {
1480 let mut cls = self.parse_unicode_class()?;
1481 cls.span.start = start;
1482 return Ok(Primitive::Unicode(cls));
1483 }
1484 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1485 let mut cls = self.parse_perl_class();
1486 cls.span.start = start;
1487 return Ok(Primitive::Perl(cls));
1488 }
1489 _ => {}
1490 }
1491
1492 // Handle all of the one letter sequences inline.
1493 self.bump();
1494 let span = Span::new(start, self.pos());
1495 if is_meta_character(c) {
1496 return Ok(Primitive::Literal(ast::Literal {
1497 span: span,
1498 kind: ast::LiteralKind::Punctuation,
1499 c: c,
1500 }));
1501 }
1502 let special = |kind, c| {
1503 Ok(Primitive::Literal(ast::Literal {
1504 span: span,
1505 kind: ast::LiteralKind::Special(kind),
1506 c: c,
1507 }))
1508 };
1509 match c {
1510 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1511 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1512 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1513 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1514 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1515 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1516 ' ' if self.ignore_whitespace() => {
1517 special(ast::SpecialLiteralKind::Space, ' ')
1518 }
1519 'A' => Ok(Primitive::Assertion(ast::Assertion {
1520 span: span,
1521 kind: ast::AssertionKind::StartText,
1522 })),
1523 'z' => Ok(Primitive::Assertion(ast::Assertion {
1524 span: span,
1525 kind: ast::AssertionKind::EndText,
1526 })),
1527 'b' => Ok(Primitive::Assertion(ast::Assertion {
1528 span: span,
1529 kind: ast::AssertionKind::WordBoundary,
1530 })),
1531 'B' => Ok(Primitive::Assertion(ast::Assertion {
1532 span: span,
1533 kind: ast::AssertionKind::NotWordBoundary,
1534 })),
1535 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1536 }
1537 }
1538
1539 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1540 /// long. This expects the parser to be positioned at the first octal
1541 /// digit and advances the parser to the first character immediately
1542 /// following the octal number. This also assumes that parsing octal
1543 /// escapes is enabled.
1544 ///
1545 /// Assuming the preconditions are met, this routine can never fail.
1546 #[inline(never)]
parse_octal(&self) -> ast::Literal1547 fn parse_octal(&self) -> ast::Literal {
1548 use std::char;
1549 use std::u32;
1550
1551 assert!(self.parser().octal);
1552 assert!('0' <= self.char() && self.char() <= '7');
1553 let start = self.pos();
1554 // Parse up to two more digits.
1555 while self.bump()
1556 && '0' <= self.char()
1557 && self.char() <= '7'
1558 && self.pos().offset - start.offset <= 2
1559 {}
1560 let end = self.pos();
1561 let octal = &self.pattern()[start.offset..end.offset];
1562 // Parsing the octal should never fail since the above guarantees a
1563 // valid number.
1564 let codepoint =
1565 u32::from_str_radix(octal, 8).expect("valid octal number");
1566 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1567 // invalid Unicode scalar values.
1568 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1569 ast::Literal {
1570 span: Span::new(start, end),
1571 kind: ast::LiteralKind::Octal,
1572 c: c,
1573 }
1574 }
1575
1576 /// Parse a hex representation of a Unicode codepoint. This handles both
1577 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1578 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1579 /// the first character immediately following the hexadecimal literal.
1580 #[inline(never)]
parse_hex(&self) -> Result<ast::Literal>1581 fn parse_hex(&self) -> Result<ast::Literal> {
1582 assert!(
1583 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1584 );
1585
1586 let hex_kind = match self.char() {
1587 'x' => ast::HexLiteralKind::X,
1588 'u' => ast::HexLiteralKind::UnicodeShort,
1589 _ => ast::HexLiteralKind::UnicodeLong,
1590 };
1591 if !self.bump_and_bump_space() {
1592 return Err(
1593 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1594 );
1595 }
1596 if self.char() == '{' {
1597 self.parse_hex_brace(hex_kind)
1598 } else {
1599 self.parse_hex_digits(hex_kind)
1600 }
1601 }
1602
1603 /// Parse an N-digit hex representation of a Unicode codepoint. This
1604 /// expects the parser to be positioned at the first digit and will advance
1605 /// the parser to the first character immediately following the escape
1606 /// sequence.
1607 ///
1608 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1609 /// or 8 (for `\UNNNNNNNN`).
1610 #[inline(never)]
parse_hex_digits( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1611 fn parse_hex_digits(
1612 &self,
1613 kind: ast::HexLiteralKind,
1614 ) -> Result<ast::Literal> {
1615 use std::char;
1616 use std::u32;
1617
1618 let mut scratch = self.parser().scratch.borrow_mut();
1619 scratch.clear();
1620
1621 let start = self.pos();
1622 for i in 0..kind.digits() {
1623 if i > 0 && !self.bump_and_bump_space() {
1624 return Err(self
1625 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1626 }
1627 if !is_hex(self.char()) {
1628 return Err(self.error(
1629 self.span_char(),
1630 ast::ErrorKind::EscapeHexInvalidDigit,
1631 ));
1632 }
1633 scratch.push(self.char());
1634 }
1635 // The final bump just moves the parser past the literal, which may
1636 // be EOF.
1637 self.bump_and_bump_space();
1638 let end = self.pos();
1639 let hex = scratch.as_str();
1640 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1641 None => Err(self.error(
1642 Span::new(start, end),
1643 ast::ErrorKind::EscapeHexInvalid,
1644 )),
1645 Some(c) => Ok(ast::Literal {
1646 span: Span::new(start, end),
1647 kind: ast::LiteralKind::HexFixed(kind),
1648 c: c,
1649 }),
1650 }
1651 }
1652
1653 /// Parse a hex representation of any Unicode scalar value. This expects
1654 /// the parser to be positioned at the opening brace `{` and will advance
1655 /// the parser to the first character following the closing brace `}`.
1656 #[inline(never)]
parse_hex_brace( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1657 fn parse_hex_brace(
1658 &self,
1659 kind: ast::HexLiteralKind,
1660 ) -> Result<ast::Literal> {
1661 use std::char;
1662 use std::u32;
1663
1664 let mut scratch = self.parser().scratch.borrow_mut();
1665 scratch.clear();
1666
1667 let brace_pos = self.pos();
1668 let start = self.span_char().end;
1669 while self.bump_and_bump_space() && self.char() != '}' {
1670 if !is_hex(self.char()) {
1671 return Err(self.error(
1672 self.span_char(),
1673 ast::ErrorKind::EscapeHexInvalidDigit,
1674 ));
1675 }
1676 scratch.push(self.char());
1677 }
1678 if self.is_eof() {
1679 return Err(self.error(
1680 Span::new(brace_pos, self.pos()),
1681 ast::ErrorKind::EscapeUnexpectedEof,
1682 ));
1683 }
1684 let end = self.pos();
1685 let hex = scratch.as_str();
1686 assert_eq!(self.char(), '}');
1687 self.bump_and_bump_space();
1688
1689 if hex.is_empty() {
1690 return Err(self.error(
1691 Span::new(brace_pos, self.pos()),
1692 ast::ErrorKind::EscapeHexEmpty,
1693 ));
1694 }
1695 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1696 None => Err(self.error(
1697 Span::new(start, end),
1698 ast::ErrorKind::EscapeHexInvalid,
1699 )),
1700 Some(c) => Ok(ast::Literal {
1701 span: Span::new(start, self.pos()),
1702 kind: ast::LiteralKind::HexBrace(kind),
1703 c: c,
1704 }),
1705 }
1706 }
1707
1708 /// Parse a decimal number into a u32 while trimming leading and trailing
1709 /// whitespace.
1710 ///
1711 /// This expects the parser to be positioned at the first position where
1712 /// a decimal digit could occur. This will advance the parser to the byte
1713 /// immediately following the last contiguous decimal digit.
1714 ///
1715 /// If no decimal digit could be found or if there was a problem parsing
1716 /// the complete set of digits into a u32, then an error is returned.
parse_decimal(&self) -> Result<u32>1717 fn parse_decimal(&self) -> Result<u32> {
1718 let mut scratch = self.parser().scratch.borrow_mut();
1719 scratch.clear();
1720
1721 while !self.is_eof() && self.char().is_whitespace() {
1722 self.bump();
1723 }
1724 let start = self.pos();
1725 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1726 scratch.push(self.char());
1727 self.bump_and_bump_space();
1728 }
1729 let span = Span::new(start, self.pos());
1730 while !self.is_eof() && self.char().is_whitespace() {
1731 self.bump_and_bump_space();
1732 }
1733 let digits = scratch.as_str();
1734 if digits.is_empty() {
1735 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1736 }
1737 match u32::from_str_radix(digits, 10).ok() {
1738 Some(n) => Ok(n),
1739 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1740 }
1741 }
1742
1743 /// Parse a standard character class consisting primarily of characters or
1744 /// character ranges, but can also contain nested character classes of
1745 /// any type (sans `.`).
1746 ///
1747 /// This assumes the parser is positioned at the opening `[`. If parsing
1748 /// is successful, then the parser is advanced to the position immediately
1749 /// following the closing `]`.
1750 #[inline(never)]
parse_set_class(&self) -> Result<ast::Class>1751 fn parse_set_class(&self) -> Result<ast::Class> {
1752 assert_eq!(self.char(), '[');
1753
1754 let mut union =
1755 ast::ClassSetUnion { span: self.span(), items: vec![] };
1756 loop {
1757 self.bump_space();
1758 if self.is_eof() {
1759 return Err(self.unclosed_class_error());
1760 }
1761 match self.char() {
1762 '[' => {
1763 // If we've already parsed the opening bracket, then
1764 // attempt to treat this as the beginning of an ASCII
1765 // class. If ASCII class parsing fails, then the parser
1766 // backs up to `[`.
1767 if !self.parser().stack_class.borrow().is_empty() {
1768 if let Some(cls) = self.maybe_parse_ascii_class() {
1769 union.push(ast::ClassSetItem::Ascii(cls));
1770 continue;
1771 }
1772 }
1773 union = self.push_class_open(union)?;
1774 }
1775 ']' => match self.pop_class(union)? {
1776 Either::Left(nested_union) => {
1777 union = nested_union;
1778 }
1779 Either::Right(class) => return Ok(class),
1780 },
1781 '&' if self.peek() == Some('&') => {
1782 assert!(self.bump_if("&&"));
1783 union = self.push_class_op(
1784 ast::ClassSetBinaryOpKind::Intersection,
1785 union,
1786 );
1787 }
1788 '-' if self.peek() == Some('-') => {
1789 assert!(self.bump_if("--"));
1790 union = self.push_class_op(
1791 ast::ClassSetBinaryOpKind::Difference,
1792 union,
1793 );
1794 }
1795 '~' if self.peek() == Some('~') => {
1796 assert!(self.bump_if("~~"));
1797 union = self.push_class_op(
1798 ast::ClassSetBinaryOpKind::SymmetricDifference,
1799 union,
1800 );
1801 }
1802 _ => {
1803 union.push(self.parse_set_class_range()?);
1804 }
1805 }
1806 }
1807 }
1808
1809 /// Parse a single primitive item in a character class set. The item to
1810 /// be parsed can either be one of a simple literal character, a range
1811 /// between two simple literal characters or a "primitive" character
1812 /// class like \w or \p{Greek}.
1813 ///
1814 /// If an invalid escape is found, or if a character class is found where
1815 /// a simple literal is expected (e.g., in a range), then an error is
1816 /// returned.
1817 #[inline(never)]
parse_set_class_range(&self) -> Result<ast::ClassSetItem>1818 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1819 let prim1 = self.parse_set_class_item()?;
1820 self.bump_space();
1821 if self.is_eof() {
1822 return Err(self.unclosed_class_error());
1823 }
1824 // If the next char isn't a `-`, then we don't have a range.
1825 // There are two exceptions. If the char after a `-` is a `]`, then
1826 // `-` is interpreted as a literal `-`. Alternatively, if the char
1827 // after a `-` is a `-`, then `--` corresponds to a "difference"
1828 // operation.
1829 if self.char() != '-'
1830 || self.peek_space() == Some(']')
1831 || self.peek_space() == Some('-')
1832 {
1833 return prim1.into_class_set_item(self);
1834 }
1835 // OK, now we're parsing a range, so bump past the `-` and parse the
1836 // second half of the range.
1837 if !self.bump_and_bump_space() {
1838 return Err(self.unclosed_class_error());
1839 }
1840 let prim2 = self.parse_set_class_item()?;
1841 let range = ast::ClassSetRange {
1842 span: Span::new(prim1.span().start, prim2.span().end),
1843 start: prim1.into_class_literal(self)?,
1844 end: prim2.into_class_literal(self)?,
1845 };
1846 if !range.is_valid() {
1847 return Err(
1848 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1849 );
1850 }
1851 Ok(ast::ClassSetItem::Range(range))
1852 }
1853
1854 /// Parse a single item in a character class as a primitive, where the
1855 /// primitive either consists of a verbatim literal or a single escape
1856 /// sequence.
1857 ///
1858 /// This assumes the parser is positioned at the beginning of a primitive,
1859 /// and advances the parser to the first position after the primitive if
1860 /// successful.
1861 ///
1862 /// Note that it is the caller's responsibility to report an error if an
1863 /// illegal primitive was parsed.
1864 #[inline(never)]
parse_set_class_item(&self) -> Result<Primitive>1865 fn parse_set_class_item(&self) -> Result<Primitive> {
1866 if self.char() == '\\' {
1867 self.parse_escape()
1868 } else {
1869 let x = Primitive::Literal(ast::Literal {
1870 span: self.span_char(),
1871 kind: ast::LiteralKind::Verbatim,
1872 c: self.char(),
1873 });
1874 self.bump();
1875 Ok(x)
1876 }
1877 }
1878
1879 /// Parses the opening of a character class set. This includes the opening
1880 /// bracket along with `^` if present to indicate negation. This also
1881 /// starts parsing the opening set of unioned items if applicable, since
1882 /// there are special rules applied to certain characters in the opening
1883 /// of a character class. For example, `[^]]` is the class of all
1884 /// characters not equal to `]`. (`]` would need to be escaped in any other
1885 /// position.) Similarly for `-`.
1886 ///
1887 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1888 /// empty union. This empty union should be replaced with the actual item
1889 /// when it is popped from the parser's stack.
1890 ///
1891 /// This assumes the parser is positioned at the opening `[` and advances
1892 /// the parser to the first non-special byte of the character class.
1893 ///
1894 /// An error is returned if EOF is found.
1895 #[inline(never)]
parse_set_class_open( &self, ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)>1896 fn parse_set_class_open(
1897 &self,
1898 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1899 assert_eq!(self.char(), '[');
1900 let start = self.pos();
1901 if !self.bump_and_bump_space() {
1902 return Err(self.error(
1903 Span::new(start, self.pos()),
1904 ast::ErrorKind::ClassUnclosed,
1905 ));
1906 }
1907
1908 let negated = if self.char() != '^' {
1909 false
1910 } else {
1911 if !self.bump_and_bump_space() {
1912 return Err(self.error(
1913 Span::new(start, self.pos()),
1914 ast::ErrorKind::ClassUnclosed,
1915 ));
1916 }
1917 true
1918 };
1919 // Accept any number of `-` as literal `-`.
1920 let mut union =
1921 ast::ClassSetUnion { span: self.span(), items: vec![] };
1922 while self.char() == '-' {
1923 union.push(ast::ClassSetItem::Literal(ast::Literal {
1924 span: self.span_char(),
1925 kind: ast::LiteralKind::Verbatim,
1926 c: '-',
1927 }));
1928 if !self.bump_and_bump_space() {
1929 return Err(self.error(
1930 Span::new(start, self.pos()),
1931 ast::ErrorKind::ClassUnclosed,
1932 ));
1933 }
1934 }
1935 // If `]` is the *first* char in a set, then interpret it as a literal
1936 // `]`. That is, an empty class is impossible to write.
1937 if union.items.is_empty() && self.char() == ']' {
1938 union.push(ast::ClassSetItem::Literal(ast::Literal {
1939 span: self.span_char(),
1940 kind: ast::LiteralKind::Verbatim,
1941 c: ']',
1942 }));
1943 if !self.bump_and_bump_space() {
1944 return Err(self.error(
1945 Span::new(start, self.pos()),
1946 ast::ErrorKind::ClassUnclosed,
1947 ));
1948 }
1949 }
1950 let set = ast::ClassBracketed {
1951 span: Span::new(start, self.pos()),
1952 negated: negated,
1953 kind: ast::ClassSet::union(ast::ClassSetUnion {
1954 span: Span::new(union.span.start, union.span.start),
1955 items: vec![],
1956 }),
1957 };
1958 Ok((set, union))
1959 }
1960
1961 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1962 ///
1963 /// This assumes the parser is positioned at the opening `[`.
1964 ///
1965 /// If no valid ASCII character class could be found, then this does not
1966 /// advance the parser and `None` is returned. Otherwise, the parser is
1967 /// advanced to the first byte following the closing `]` and the
1968 /// corresponding ASCII class is returned.
1969 #[inline(never)]
maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii>1970 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1971 // ASCII character classes are interesting from a parsing perspective
1972 // because parsing cannot fail with any interesting error. For example,
1973 // in order to use an ASCII character class, it must be enclosed in
1974 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1975 // of it as "ASCII character characters have the syntax `[:NAME:]`
1976 // which can only appear within character brackets." This means that
1977 // things like `[[:lower:]A]` are legal constructs.
1978 //
1979 // However, if one types an incorrect ASCII character class, e.g.,
1980 // `[[:loower:]]`, then we treat that as a normal nested character
1981 // class containing the characters `:elorw`. One might argue that we
1982 // should return an error instead since the repeated colons give away
1983 // the intent to write an ASCII class. But what if the user typed
1984 // `[[:lower]]` instead? How can we tell that was intended to be an
1985 // ASCII class and not just a normal nested class?
1986 //
1987 // Reasonable people can probably disagree over this, but for better
1988 // or worse, we implement semantics that never fails at the expense
1989 // of better failure modes.
1990 assert_eq!(self.char(), '[');
1991 // If parsing fails, then we back up the parser to this starting point.
1992 let start = self.pos();
1993 let mut negated = false;
1994 if !self.bump() || self.char() != ':' {
1995 self.parser().pos.set(start);
1996 return None;
1997 }
1998 if !self.bump() {
1999 self.parser().pos.set(start);
2000 return None;
2001 }
2002 if self.char() == '^' {
2003 negated = true;
2004 if !self.bump() {
2005 self.parser().pos.set(start);
2006 return None;
2007 }
2008 }
2009 let name_start = self.offset();
2010 while self.char() != ':' && self.bump() {}
2011 if self.is_eof() {
2012 self.parser().pos.set(start);
2013 return None;
2014 }
2015 let name = &self.pattern()[name_start..self.offset()];
2016 if !self.bump_if(":]") {
2017 self.parser().pos.set(start);
2018 return None;
2019 }
2020 let kind = match ast::ClassAsciiKind::from_name(name) {
2021 Some(kind) => kind,
2022 None => {
2023 self.parser().pos.set(start);
2024 return None;
2025 }
2026 };
2027 Some(ast::ClassAscii {
2028 span: Span::new(start, self.pos()),
2029 kind: kind,
2030 negated: negated,
2031 })
2032 }
2033
2034 /// Parse a Unicode class in either the single character notation, `\pN`
2035 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2036 /// the parser is positioned at the `p` (or `P` for negation) and will
2037 /// advance the parser to the character immediately following the class.
2038 ///
2039 /// Note that this does not check whether the class name is valid or not.
2040 #[inline(never)]
parse_unicode_class(&self) -> Result<ast::ClassUnicode>2041 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2042 assert!(self.char() == 'p' || self.char() == 'P');
2043
2044 let mut scratch = self.parser().scratch.borrow_mut();
2045 scratch.clear();
2046
2047 let negated = self.char() == 'P';
2048 if !self.bump_and_bump_space() {
2049 return Err(
2050 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2051 );
2052 }
2053 let (start, kind) = if self.char() == '{' {
2054 let start = self.span_char().end;
2055 while self.bump_and_bump_space() && self.char() != '}' {
2056 scratch.push(self.char());
2057 }
2058 if self.is_eof() {
2059 return Err(self
2060 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2061 }
2062 assert_eq!(self.char(), '}');
2063 self.bump();
2064
2065 let name = scratch.as_str();
2066 if let Some(i) = name.find("!=") {
2067 (
2068 start,
2069 ast::ClassUnicodeKind::NamedValue {
2070 op: ast::ClassUnicodeOpKind::NotEqual,
2071 name: name[..i].to_string(),
2072 value: name[i + 2..].to_string(),
2073 },
2074 )
2075 } else if let Some(i) = name.find(':') {
2076 (
2077 start,
2078 ast::ClassUnicodeKind::NamedValue {
2079 op: ast::ClassUnicodeOpKind::Colon,
2080 name: name[..i].to_string(),
2081 value: name[i + 1..].to_string(),
2082 },
2083 )
2084 } else if let Some(i) = name.find('=') {
2085 (
2086 start,
2087 ast::ClassUnicodeKind::NamedValue {
2088 op: ast::ClassUnicodeOpKind::Equal,
2089 name: name[..i].to_string(),
2090 value: name[i + 1..].to_string(),
2091 },
2092 )
2093 } else {
2094 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2095 }
2096 } else {
2097 let start = self.pos();
2098 let c = self.char();
2099 if c == '\\' {
2100 return Err(self.error(
2101 self.span_char(),
2102 ast::ErrorKind::UnicodeClassInvalid,
2103 ));
2104 }
2105 self.bump_and_bump_space();
2106 let kind = ast::ClassUnicodeKind::OneLetter(c);
2107 (start, kind)
2108 };
2109 Ok(ast::ClassUnicode {
2110 span: Span::new(start, self.pos()),
2111 negated: negated,
2112 kind: kind,
2113 })
2114 }
2115
2116 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2117 /// parser is currently at a valid character class name and will be
2118 /// advanced to the character immediately following the class.
2119 #[inline(never)]
parse_perl_class(&self) -> ast::ClassPerl2120 fn parse_perl_class(&self) -> ast::ClassPerl {
2121 let c = self.char();
2122 let span = self.span_char();
2123 self.bump();
2124 let (negated, kind) = match c {
2125 'd' => (false, ast::ClassPerlKind::Digit),
2126 'D' => (true, ast::ClassPerlKind::Digit),
2127 's' => (false, ast::ClassPerlKind::Space),
2128 'S' => (true, ast::ClassPerlKind::Space),
2129 'w' => (false, ast::ClassPerlKind::Word),
2130 'W' => (true, ast::ClassPerlKind::Word),
2131 c => panic!("expected valid Perl class but got '{}'", c),
2132 };
2133 ast::ClassPerl { span: span, kind: kind, negated: negated }
2134 }
2135 }
2136
2137 /// A type that traverses a fully parsed Ast and checks whether its depth
2138 /// exceeds the specified nesting limit. If it does, then an error is returned.
2139 #[derive(Debug)]
2140 struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
2141 /// The parser that is checking the nest limit.
2142 p: &'p ParserI<'s, P>,
2143 /// The current depth while walking an Ast.
2144 depth: u32,
2145 }
2146
2147 impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P>2148 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2149 NestLimiter { p: p, depth: 0 }
2150 }
2151
2152 #[inline(never)]
check(self, ast: &Ast) -> Result<()>2153 fn check(self, ast: &Ast) -> Result<()> {
2154 ast::visit(ast, self)
2155 }
2156
increment_depth(&mut self, span: &Span) -> Result<()>2157 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2158 let new = self.depth.checked_add(1).ok_or_else(|| {
2159 self.p.error(
2160 span.clone(),
2161 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2162 )
2163 })?;
2164 let limit = self.p.parser().nest_limit;
2165 if new > limit {
2166 return Err(self.p.error(
2167 span.clone(),
2168 ast::ErrorKind::NestLimitExceeded(limit),
2169 ));
2170 }
2171 self.depth = new;
2172 Ok(())
2173 }
2174
decrement_depth(&mut self)2175 fn decrement_depth(&mut self) {
2176 // Assuming the correctness of the visitor, this should never drop
2177 // below 0.
2178 self.depth = self.depth.checked_sub(1).unwrap();
2179 }
2180 }
2181
2182 impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2183 type Output = ();
2184 type Err = ast::Error;
2185
finish(self) -> Result<()>2186 fn finish(self) -> Result<()> {
2187 Ok(())
2188 }
2189
visit_pre(&mut self, ast: &Ast) -> Result<()>2190 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2191 let span = match *ast {
2192 Ast::Empty(_)
2193 | Ast::Flags(_)
2194 | Ast::Literal(_)
2195 | Ast::Dot(_)
2196 | Ast::Assertion(_)
2197 | Ast::Class(ast::Class::Unicode(_))
2198 | Ast::Class(ast::Class::Perl(_)) => {
2199 // These are all base cases, so we don't increment depth.
2200 return Ok(());
2201 }
2202 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2203 Ast::Repetition(ref x) => &x.span,
2204 Ast::Group(ref x) => &x.span,
2205 Ast::Alternation(ref x) => &x.span,
2206 Ast::Concat(ref x) => &x.span,
2207 };
2208 self.increment_depth(span)
2209 }
2210
visit_post(&mut self, ast: &Ast) -> Result<()>2211 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2212 match *ast {
2213 Ast::Empty(_)
2214 | Ast::Flags(_)
2215 | Ast::Literal(_)
2216 | Ast::Dot(_)
2217 | Ast::Assertion(_)
2218 | Ast::Class(ast::Class::Unicode(_))
2219 | Ast::Class(ast::Class::Perl(_)) => {
2220 // These are all base cases, so we don't decrement depth.
2221 Ok(())
2222 }
2223 Ast::Class(ast::Class::Bracketed(_))
2224 | Ast::Repetition(_)
2225 | Ast::Group(_)
2226 | Ast::Alternation(_)
2227 | Ast::Concat(_) => {
2228 self.decrement_depth();
2229 Ok(())
2230 }
2231 }
2232 }
2233
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2234 fn visit_class_set_item_pre(
2235 &mut self,
2236 ast: &ast::ClassSetItem,
2237 ) -> Result<()> {
2238 let span = match *ast {
2239 ast::ClassSetItem::Empty(_)
2240 | ast::ClassSetItem::Literal(_)
2241 | ast::ClassSetItem::Range(_)
2242 | ast::ClassSetItem::Ascii(_)
2243 | ast::ClassSetItem::Unicode(_)
2244 | ast::ClassSetItem::Perl(_) => {
2245 // These are all base cases, so we don't increment depth.
2246 return Ok(());
2247 }
2248 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2249 ast::ClassSetItem::Union(ref x) => &x.span,
2250 };
2251 self.increment_depth(span)
2252 }
2253
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2254 fn visit_class_set_item_post(
2255 &mut self,
2256 ast: &ast::ClassSetItem,
2257 ) -> Result<()> {
2258 match *ast {
2259 ast::ClassSetItem::Empty(_)
2260 | ast::ClassSetItem::Literal(_)
2261 | ast::ClassSetItem::Range(_)
2262 | ast::ClassSetItem::Ascii(_)
2263 | ast::ClassSetItem::Unicode(_)
2264 | ast::ClassSetItem::Perl(_) => {
2265 // These are all base cases, so we don't decrement depth.
2266 Ok(())
2267 }
2268 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2269 self.decrement_depth();
2270 Ok(())
2271 }
2272 }
2273 }
2274
visit_class_set_binary_op_pre( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<()>2275 fn visit_class_set_binary_op_pre(
2276 &mut self,
2277 ast: &ast::ClassSetBinaryOp,
2278 ) -> Result<()> {
2279 self.increment_depth(&ast.span)
2280 }
2281
visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<()>2282 fn visit_class_set_binary_op_post(
2283 &mut self,
2284 _ast: &ast::ClassSetBinaryOp,
2285 ) -> Result<()> {
2286 self.decrement_depth();
2287 Ok(())
2288 }
2289 }
2290
2291 /// When the result is an error, transforms the ast::ErrorKind from the source
2292 /// Result into another one. This function is used to return clearer error
2293 /// messages when possible.
specialize_err<T>( result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind, ) -> Result<T>2294 fn specialize_err<T>(
2295 result: Result<T>,
2296 from: ast::ErrorKind,
2297 to: ast::ErrorKind,
2298 ) -> Result<T> {
2299 if let Err(e) = result {
2300 if e.kind == from {
2301 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2302 } else {
2303 Err(e)
2304 }
2305 } else {
2306 result
2307 }
2308 }
2309
2310 #[cfg(test)]
2311 mod tests {
2312 use std::ops::Range;
2313
2314 use super::{Parser, ParserBuilder, ParserI, Primitive};
2315 use ast::{self, Ast, Position, Span};
2316
2317 // Our own assert_eq, which has slightly better formatting (but honestly
2318 // still kind of crappy).
2319 macro_rules! assert_eq {
2320 ($left:expr, $right:expr) => {{
2321 match (&$left, &$right) {
2322 (left_val, right_val) => {
2323 if !(*left_val == *right_val) {
2324 panic!(
2325 "assertion failed: `(left == right)`\n\n\
2326 left: `{:?}`\nright: `{:?}`\n\n",
2327 left_val, right_val
2328 )
2329 }
2330 }
2331 }
2332 }};
2333 }
2334
2335 // We create these errors to compare with real ast::Errors in the tests.
2336 // We define equality between TestError and ast::Error to disregard the
2337 // pattern string in ast::Error, which is annoying to provide in tests.
2338 #[derive(Clone, Debug)]
2339 struct TestError {
2340 span: Span,
2341 kind: ast::ErrorKind,
2342 }
2343
2344 impl PartialEq<ast::Error> for TestError {
eq(&self, other: &ast::Error) -> bool2345 fn eq(&self, other: &ast::Error) -> bool {
2346 self.span == other.span && self.kind == other.kind
2347 }
2348 }
2349
2350 impl PartialEq<TestError> for ast::Error {
eq(&self, other: &TestError) -> bool2351 fn eq(&self, other: &TestError) -> bool {
2352 self.span == other.span && self.kind == other.kind
2353 }
2354 }
2355
s(str: &str) -> String2356 fn s(str: &str) -> String {
2357 str.to_string()
2358 }
2359
parser(pattern: &str) -> ParserI<Parser>2360 fn parser(pattern: &str) -> ParserI<Parser> {
2361 ParserI::new(Parser::new(), pattern)
2362 }
2363
parser_octal(pattern: &str) -> ParserI<Parser>2364 fn parser_octal(pattern: &str) -> ParserI<Parser> {
2365 let parser = ParserBuilder::new().octal(true).build();
2366 ParserI::new(parser, pattern)
2367 }
2368
parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser>2369 fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
2370 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2371 ParserI::new(p, pattern)
2372 }
2373
parser_ignore_whitespace(pattern: &str) -> ParserI<Parser>2374 fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
2375 let p = ParserBuilder::new().ignore_whitespace(true).build();
2376 ParserI::new(p, pattern)
2377 }
2378
2379 /// Short alias for creating a new span.
nspan(start: Position, end: Position) -> Span2380 fn nspan(start: Position, end: Position) -> Span {
2381 Span::new(start, end)
2382 }
2383
2384 /// Short alias for creating a new position.
npos(offset: usize, line: usize, column: usize) -> Position2385 fn npos(offset: usize, line: usize, column: usize) -> Position {
2386 Position::new(offset, line, column)
2387 }
2388
2389 /// Create a new span from the given offset range. This assumes a single
2390 /// line and sets the columns based on the offsets. i.e., This only works
2391 /// out of the box for ASCII, which is fine for most tests.
span(range: Range<usize>) -> Span2392 fn span(range: Range<usize>) -> Span {
2393 let start = Position::new(range.start, 1, range.start + 1);
2394 let end = Position::new(range.end, 1, range.end + 1);
2395 Span::new(start, end)
2396 }
2397
2398 /// Create a new span for the corresponding byte range in the given string.
span_range(subject: &str, range: Range<usize>) -> Span2399 fn span_range(subject: &str, range: Range<usize>) -> Span {
2400 let start = Position {
2401 offset: range.start,
2402 line: 1 + subject[..range.start].matches('\n').count(),
2403 column: 1 + subject[..range.start]
2404 .chars()
2405 .rev()
2406 .position(|c| c == '\n')
2407 .unwrap_or(subject[..range.start].chars().count()),
2408 };
2409 let end = Position {
2410 offset: range.end,
2411 line: 1 + subject[..range.end].matches('\n').count(),
2412 column: 1 + subject[..range.end]
2413 .chars()
2414 .rev()
2415 .position(|c| c == '\n')
2416 .unwrap_or(subject[..range.end].chars().count()),
2417 };
2418 Span::new(start, end)
2419 }
2420
2421 /// Create a verbatim literal starting at the given position.
lit(c: char, start: usize) -> Ast2422 fn lit(c: char, start: usize) -> Ast {
2423 lit_with(c, span(start..start + c.len_utf8()))
2424 }
2425
2426 /// Create a punctuation literal starting at the given position.
punct_lit(c: char, span: Span) -> Ast2427 fn punct_lit(c: char, span: Span) -> Ast {
2428 Ast::Literal(ast::Literal {
2429 span: span,
2430 kind: ast::LiteralKind::Punctuation,
2431 c: c,
2432 })
2433 }
2434
2435 /// Create a verbatim literal with the given span.
lit_with(c: char, span: Span) -> Ast2436 fn lit_with(c: char, span: Span) -> Ast {
2437 Ast::Literal(ast::Literal {
2438 span: span,
2439 kind: ast::LiteralKind::Verbatim,
2440 c: c,
2441 })
2442 }
2443
2444 /// Create a concatenation with the given range.
concat(range: Range<usize>, asts: Vec<Ast>) -> Ast2445 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2446 concat_with(span(range), asts)
2447 }
2448
2449 /// Create a concatenation with the given span.
concat_with(span: Span, asts: Vec<Ast>) -> Ast2450 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2451 Ast::Concat(ast::Concat { span: span, asts: asts })
2452 }
2453
2454 /// Create an alternation with the given span.
alt(range: Range<usize>, asts: Vec<Ast>) -> Ast2455 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2456 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2457 }
2458
2459 /// Create a capturing group with the given span.
group(range: Range<usize>, index: u32, ast: Ast) -> Ast2460 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2461 Ast::Group(ast::Group {
2462 span: span(range),
2463 kind: ast::GroupKind::CaptureIndex(index),
2464 ast: Box::new(ast),
2465 })
2466 }
2467
2468 /// Create an ast::SetFlags.
2469 ///
2470 /// The given pattern should be the full pattern string. The range given
2471 /// should correspond to the byte offsets where the flag set occurs.
2472 ///
2473 /// If negated is true, then the set is interpreted as beginning with a
2474 /// negation.
flag_set( pat: &str, range: Range<usize>, flag: ast::Flag, negated: bool, ) -> Ast2475 fn flag_set(
2476 pat: &str,
2477 range: Range<usize>,
2478 flag: ast::Flag,
2479 negated: bool,
2480 ) -> Ast {
2481 let mut items = vec![ast::FlagsItem {
2482 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2483 kind: ast::FlagsItemKind::Flag(flag),
2484 }];
2485 if negated {
2486 items.insert(
2487 0,
2488 ast::FlagsItem {
2489 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2490 kind: ast::FlagsItemKind::Negation,
2491 },
2492 );
2493 }
2494 Ast::Flags(ast::SetFlags {
2495 span: span_range(pat, range.clone()),
2496 flags: ast::Flags {
2497 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2498 items: items,
2499 },
2500 })
2501 }
2502
2503 #[test]
parse_nest_limit()2504 fn parse_nest_limit() {
2505 // A nest limit of 0 still allows some types of regexes.
2506 assert_eq!(
2507 parser_nest_limit("", 0).parse(),
2508 Ok(Ast::Empty(span(0..0)))
2509 );
2510 assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2511
2512 // Test repetition operations, which require one level of nesting.
2513 assert_eq!(
2514 parser_nest_limit("a+", 0).parse().unwrap_err(),
2515 TestError {
2516 span: span(0..2),
2517 kind: ast::ErrorKind::NestLimitExceeded(0),
2518 }
2519 );
2520 assert_eq!(
2521 parser_nest_limit("a+", 1).parse(),
2522 Ok(Ast::Repetition(ast::Repetition {
2523 span: span(0..2),
2524 op: ast::RepetitionOp {
2525 span: span(1..2),
2526 kind: ast::RepetitionKind::OneOrMore,
2527 },
2528 greedy: true,
2529 ast: Box::new(lit('a', 0)),
2530 }))
2531 );
2532 assert_eq!(
2533 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2534 TestError {
2535 span: span(0..3),
2536 kind: ast::ErrorKind::NestLimitExceeded(1),
2537 }
2538 );
2539 assert_eq!(
2540 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2541 TestError {
2542 span: span(0..2),
2543 kind: ast::ErrorKind::NestLimitExceeded(1),
2544 }
2545 );
2546 assert_eq!(
2547 parser_nest_limit("a+*", 2).parse(),
2548 Ok(Ast::Repetition(ast::Repetition {
2549 span: span(0..3),
2550 op: ast::RepetitionOp {
2551 span: span(2..3),
2552 kind: ast::RepetitionKind::ZeroOrMore,
2553 },
2554 greedy: true,
2555 ast: Box::new(Ast::Repetition(ast::Repetition {
2556 span: span(0..2),
2557 op: ast::RepetitionOp {
2558 span: span(1..2),
2559 kind: ast::RepetitionKind::OneOrMore,
2560 },
2561 greedy: true,
2562 ast: Box::new(lit('a', 0)),
2563 })),
2564 }))
2565 );
2566
2567 // Test concatenations. A concatenation requires one level of nesting.
2568 assert_eq!(
2569 parser_nest_limit("ab", 0).parse().unwrap_err(),
2570 TestError {
2571 span: span(0..2),
2572 kind: ast::ErrorKind::NestLimitExceeded(0),
2573 }
2574 );
2575 assert_eq!(
2576 parser_nest_limit("ab", 1).parse(),
2577 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2578 );
2579 assert_eq!(
2580 parser_nest_limit("abc", 1).parse(),
2581 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2582 );
2583
2584 // Test alternations. An alternation requires one level of nesting.
2585 assert_eq!(
2586 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2587 TestError {
2588 span: span(0..3),
2589 kind: ast::ErrorKind::NestLimitExceeded(0),
2590 }
2591 );
2592 assert_eq!(
2593 parser_nest_limit("a|b", 1).parse(),
2594 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2595 );
2596 assert_eq!(
2597 parser_nest_limit("a|b|c", 1).parse(),
2598 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2599 );
2600
2601 // Test character classes. Classes form their own mini-recursive
2602 // syntax!
2603 assert_eq!(
2604 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2605 TestError {
2606 span: span(0..3),
2607 kind: ast::ErrorKind::NestLimitExceeded(0),
2608 }
2609 );
2610 assert_eq!(
2611 parser_nest_limit("[a]", 1).parse(),
2612 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2613 span: span(0..3),
2614 negated: false,
2615 kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2616 ast::Literal {
2617 span: span(1..2),
2618 kind: ast::LiteralKind::Verbatim,
2619 c: 'a',
2620 }
2621 )),
2622 })))
2623 );
2624 assert_eq!(
2625 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2626 TestError {
2627 span: span(1..3),
2628 kind: ast::ErrorKind::NestLimitExceeded(1),
2629 }
2630 );
2631 assert_eq!(
2632 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2633 TestError {
2634 span: span(3..7),
2635 kind: ast::ErrorKind::NestLimitExceeded(2),
2636 }
2637 );
2638 assert_eq!(
2639 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2640 TestError {
2641 span: span(4..6),
2642 kind: ast::ErrorKind::NestLimitExceeded(3),
2643 }
2644 );
2645 assert_eq!(
2646 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2647 TestError {
2648 span: span(1..5),
2649 kind: ast::ErrorKind::NestLimitExceeded(1),
2650 }
2651 );
2652 assert_eq!(
2653 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2654 TestError {
2655 span: span(4..6),
2656 kind: ast::ErrorKind::NestLimitExceeded(2),
2657 }
2658 );
2659 }
2660
2661 #[test]
parse_comments()2662 fn parse_comments() {
2663 let pat = "(?x)
2664 # This is comment 1.
2665 foo # This is comment 2.
2666 # This is comment 3.
2667 bar
2668 # This is comment 4.";
2669 let astc = parser(pat).parse_with_comments().unwrap();
2670 assert_eq!(
2671 astc.ast,
2672 concat_with(
2673 span_range(pat, 0..pat.len()),
2674 vec![
2675 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2676 lit_with('f', span_range(pat, 26..27)),
2677 lit_with('o', span_range(pat, 27..28)),
2678 lit_with('o', span_range(pat, 28..29)),
2679 lit_with('b', span_range(pat, 74..75)),
2680 lit_with('a', span_range(pat, 75..76)),
2681 lit_with('r', span_range(pat, 76..77)),
2682 ]
2683 )
2684 );
2685 assert_eq!(
2686 astc.comments,
2687 vec![
2688 ast::Comment {
2689 span: span_range(pat, 5..26),
2690 comment: s(" This is comment 1."),
2691 },
2692 ast::Comment {
2693 span: span_range(pat, 30..51),
2694 comment: s(" This is comment 2."),
2695 },
2696 ast::Comment {
2697 span: span_range(pat, 53..74),
2698 comment: s(" This is comment 3."),
2699 },
2700 ast::Comment {
2701 span: span_range(pat, 78..98),
2702 comment: s(" This is comment 4."),
2703 },
2704 ]
2705 );
2706 }
2707
2708 #[test]
parse_holistic()2709 fn parse_holistic() {
2710 assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2711 assert_eq!(
2712 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2713 Ok(concat(
2714 0..36,
2715 vec![
2716 punct_lit('\\', span(0..2)),
2717 punct_lit('.', span(2..4)),
2718 punct_lit('+', span(4..6)),
2719 punct_lit('*', span(6..8)),
2720 punct_lit('?', span(8..10)),
2721 punct_lit('(', span(10..12)),
2722 punct_lit(')', span(12..14)),
2723 punct_lit('|', span(14..16)),
2724 punct_lit('[', span(16..18)),
2725 punct_lit(']', span(18..20)),
2726 punct_lit('{', span(20..22)),
2727 punct_lit('}', span(22..24)),
2728 punct_lit('^', span(24..26)),
2729 punct_lit('$', span(26..28)),
2730 punct_lit('#', span(28..30)),
2731 punct_lit('&', span(30..32)),
2732 punct_lit('-', span(32..34)),
2733 punct_lit('~', span(34..36)),
2734 ]
2735 ))
2736 );
2737 }
2738
2739 #[test]
parse_ignore_whitespace()2740 fn parse_ignore_whitespace() {
2741 // Test that basic whitespace insensitivity works.
2742 let pat = "(?x)a b";
2743 assert_eq!(
2744 parser(pat).parse(),
2745 Ok(concat_with(
2746 nspan(npos(0, 1, 1), npos(7, 1, 8)),
2747 vec![
2748 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2749 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2750 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2751 ]
2752 ))
2753 );
2754
2755 // Test that we can toggle whitespace insensitivity.
2756 let pat = "(?x)a b(?-x)a b";
2757 assert_eq!(
2758 parser(pat).parse(),
2759 Ok(concat_with(
2760 nspan(npos(0, 1, 1), npos(15, 1, 16)),
2761 vec![
2762 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2763 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2764 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2765 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2766 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2767 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2768 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2769 ]
2770 ))
2771 );
2772
2773 // Test that nesting whitespace insensitive flags works.
2774 let pat = "a (?x:a )a ";
2775 assert_eq!(
2776 parser(pat).parse(),
2777 Ok(concat_with(
2778 span_range(pat, 0..11),
2779 vec![
2780 lit_with('a', span_range(pat, 0..1)),
2781 lit_with(' ', span_range(pat, 1..2)),
2782 Ast::Group(ast::Group {
2783 span: span_range(pat, 2..9),
2784 kind: ast::GroupKind::NonCapturing(ast::Flags {
2785 span: span_range(pat, 4..5),
2786 items: vec![ast::FlagsItem {
2787 span: span_range(pat, 4..5),
2788 kind: ast::FlagsItemKind::Flag(
2789 ast::Flag::IgnoreWhitespace
2790 ),
2791 },],
2792 }),
2793 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2794 }),
2795 lit_with('a', span_range(pat, 9..10)),
2796 lit_with(' ', span_range(pat, 10..11)),
2797 ]
2798 ))
2799 );
2800
2801 // Test that whitespace after an opening paren is insignificant.
2802 let pat = "(?x)( ?P<foo> a )";
2803 assert_eq!(
2804 parser(pat).parse(),
2805 Ok(concat_with(
2806 span_range(pat, 0..pat.len()),
2807 vec![
2808 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2809 Ast::Group(ast::Group {
2810 span: span_range(pat, 4..pat.len()),
2811 kind: ast::GroupKind::CaptureName(ast::CaptureName {
2812 span: span_range(pat, 9..12),
2813 name: s("foo"),
2814 index: 1,
2815 }),
2816 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2817 }),
2818 ]
2819 ))
2820 );
2821 let pat = "(?x)( a )";
2822 assert_eq!(
2823 parser(pat).parse(),
2824 Ok(concat_with(
2825 span_range(pat, 0..pat.len()),
2826 vec![
2827 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2828 Ast::Group(ast::Group {
2829 span: span_range(pat, 4..pat.len()),
2830 kind: ast::GroupKind::CaptureIndex(1),
2831 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2832 }),
2833 ]
2834 ))
2835 );
2836 let pat = "(?x)( ?: a )";
2837 assert_eq!(
2838 parser(pat).parse(),
2839 Ok(concat_with(
2840 span_range(pat, 0..pat.len()),
2841 vec![
2842 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2843 Ast::Group(ast::Group {
2844 span: span_range(pat, 4..pat.len()),
2845 kind: ast::GroupKind::NonCapturing(ast::Flags {
2846 span: span_range(pat, 8..8),
2847 items: vec![],
2848 }),
2849 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2850 }),
2851 ]
2852 ))
2853 );
2854 let pat = r"(?x)\x { 53 }";
2855 assert_eq!(
2856 parser(pat).parse(),
2857 Ok(concat_with(
2858 span_range(pat, 0..pat.len()),
2859 vec![
2860 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2861 Ast::Literal(ast::Literal {
2862 span: span(4..13),
2863 kind: ast::LiteralKind::HexBrace(
2864 ast::HexLiteralKind::X
2865 ),
2866 c: 'S',
2867 }),
2868 ]
2869 ))
2870 );
2871
2872 // Test that whitespace after an escape is OK.
2873 let pat = r"(?x)\ ";
2874 assert_eq!(
2875 parser(pat).parse(),
2876 Ok(concat_with(
2877 span_range(pat, 0..pat.len()),
2878 vec![
2879 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2880 Ast::Literal(ast::Literal {
2881 span: span_range(pat, 4..6),
2882 kind: ast::LiteralKind::Special(
2883 ast::SpecialLiteralKind::Space
2884 ),
2885 c: ' ',
2886 }),
2887 ]
2888 ))
2889 );
2890 // ... but only when `x` mode is enabled.
2891 let pat = r"\ ";
2892 assert_eq!(
2893 parser(pat).parse().unwrap_err(),
2894 TestError {
2895 span: span_range(pat, 0..2),
2896 kind: ast::ErrorKind::EscapeUnrecognized,
2897 }
2898 );
2899 }
2900
2901 #[test]
parse_newlines()2902 fn parse_newlines() {
2903 let pat = ".\n.";
2904 assert_eq!(
2905 parser(pat).parse(),
2906 Ok(concat_with(
2907 span_range(pat, 0..3),
2908 vec![
2909 Ast::Dot(span_range(pat, 0..1)),
2910 lit_with('\n', span_range(pat, 1..2)),
2911 Ast::Dot(span_range(pat, 2..3)),
2912 ]
2913 ))
2914 );
2915
2916 let pat = "foobar\nbaz\nquux\n";
2917 assert_eq!(
2918 parser(pat).parse(),
2919 Ok(concat_with(
2920 span_range(pat, 0..pat.len()),
2921 vec![
2922 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2923 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2924 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2925 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2926 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2927 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2928 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2929 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2930 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2931 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2932 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2933 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2934 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2935 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2936 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2937 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2938 ]
2939 ))
2940 );
2941 }
2942
2943 #[test]
parse_uncounted_repetition()2944 fn parse_uncounted_repetition() {
2945 assert_eq!(
2946 parser(r"a*").parse(),
2947 Ok(Ast::Repetition(ast::Repetition {
2948 span: span(0..2),
2949 op: ast::RepetitionOp {
2950 span: span(1..2),
2951 kind: ast::RepetitionKind::ZeroOrMore,
2952 },
2953 greedy: true,
2954 ast: Box::new(lit('a', 0)),
2955 }))
2956 );
2957 assert_eq!(
2958 parser(r"a+").parse(),
2959 Ok(Ast::Repetition(ast::Repetition {
2960 span: span(0..2),
2961 op: ast::RepetitionOp {
2962 span: span(1..2),
2963 kind: ast::RepetitionKind::OneOrMore,
2964 },
2965 greedy: true,
2966 ast: Box::new(lit('a', 0)),
2967 }))
2968 );
2969
2970 assert_eq!(
2971 parser(r"a?").parse(),
2972 Ok(Ast::Repetition(ast::Repetition {
2973 span: span(0..2),
2974 op: ast::RepetitionOp {
2975 span: span(1..2),
2976 kind: ast::RepetitionKind::ZeroOrOne,
2977 },
2978 greedy: true,
2979 ast: Box::new(lit('a', 0)),
2980 }))
2981 );
2982 assert_eq!(
2983 parser(r"a??").parse(),
2984 Ok(Ast::Repetition(ast::Repetition {
2985 span: span(0..3),
2986 op: ast::RepetitionOp {
2987 span: span(1..3),
2988 kind: ast::RepetitionKind::ZeroOrOne,
2989 },
2990 greedy: false,
2991 ast: Box::new(lit('a', 0)),
2992 }))
2993 );
2994 assert_eq!(
2995 parser(r"a?").parse(),
2996 Ok(Ast::Repetition(ast::Repetition {
2997 span: span(0..2),
2998 op: ast::RepetitionOp {
2999 span: span(1..2),
3000 kind: ast::RepetitionKind::ZeroOrOne,
3001 },
3002 greedy: true,
3003 ast: Box::new(lit('a', 0)),
3004 }))
3005 );
3006 assert_eq!(
3007 parser(r"a?b").parse(),
3008 Ok(concat(
3009 0..3,
3010 vec![
3011 Ast::Repetition(ast::Repetition {
3012 span: span(0..2),
3013 op: ast::RepetitionOp {
3014 span: span(1..2),
3015 kind: ast::RepetitionKind::ZeroOrOne,
3016 },
3017 greedy: true,
3018 ast: Box::new(lit('a', 0)),
3019 }),
3020 lit('b', 2),
3021 ]
3022 ))
3023 );
3024 assert_eq!(
3025 parser(r"a??b").parse(),
3026 Ok(concat(
3027 0..4,
3028 vec![
3029 Ast::Repetition(ast::Repetition {
3030 span: span(0..3),
3031 op: ast::RepetitionOp {
3032 span: span(1..3),
3033 kind: ast::RepetitionKind::ZeroOrOne,
3034 },
3035 greedy: false,
3036 ast: Box::new(lit('a', 0)),
3037 }),
3038 lit('b', 3),
3039 ]
3040 ))
3041 );
3042 assert_eq!(
3043 parser(r"ab?").parse(),
3044 Ok(concat(
3045 0..3,
3046 vec![
3047 lit('a', 0),
3048 Ast::Repetition(ast::Repetition {
3049 span: span(1..3),
3050 op: ast::RepetitionOp {
3051 span: span(2..3),
3052 kind: ast::RepetitionKind::ZeroOrOne,
3053 },
3054 greedy: true,
3055 ast: Box::new(lit('b', 1)),
3056 }),
3057 ]
3058 ))
3059 );
3060 assert_eq!(
3061 parser(r"(ab)?").parse(),
3062 Ok(Ast::Repetition(ast::Repetition {
3063 span: span(0..5),
3064 op: ast::RepetitionOp {
3065 span: span(4..5),
3066 kind: ast::RepetitionKind::ZeroOrOne,
3067 },
3068 greedy: true,
3069 ast: Box::new(group(
3070 0..4,
3071 1,
3072 concat(1..3, vec![lit('a', 1), lit('b', 2),])
3073 )),
3074 }))
3075 );
3076 assert_eq!(
3077 parser(r"|a?").parse(),
3078 Ok(alt(
3079 0..3,
3080 vec![
3081 Ast::Empty(span(0..0)),
3082 Ast::Repetition(ast::Repetition {
3083 span: span(1..3),
3084 op: ast::RepetitionOp {
3085 span: span(2..3),
3086 kind: ast::RepetitionKind::ZeroOrOne,
3087 },
3088 greedy: true,
3089 ast: Box::new(lit('a', 1)),
3090 }),
3091 ]
3092 ))
3093 );
3094
3095 assert_eq!(
3096 parser(r"*").parse().unwrap_err(),
3097 TestError {
3098 span: span(0..0),
3099 kind: ast::ErrorKind::RepetitionMissing,
3100 }
3101 );
3102 assert_eq!(
3103 parser(r"(?i)*").parse().unwrap_err(),
3104 TestError {
3105 span: span(4..4),
3106 kind: ast::ErrorKind::RepetitionMissing,
3107 }
3108 );
3109 assert_eq!(
3110 parser(r"(*)").parse().unwrap_err(),
3111 TestError {
3112 span: span(1..1),
3113 kind: ast::ErrorKind::RepetitionMissing,
3114 }
3115 );
3116 assert_eq!(
3117 parser(r"(?:?)").parse().unwrap_err(),
3118 TestError {
3119 span: span(3..3),
3120 kind: ast::ErrorKind::RepetitionMissing,
3121 }
3122 );
3123 assert_eq!(
3124 parser(r"+").parse().unwrap_err(),
3125 TestError {
3126 span: span(0..0),
3127 kind: ast::ErrorKind::RepetitionMissing,
3128 }
3129 );
3130 assert_eq!(
3131 parser(r"?").parse().unwrap_err(),
3132 TestError {
3133 span: span(0..0),
3134 kind: ast::ErrorKind::RepetitionMissing,
3135 }
3136 );
3137 assert_eq!(
3138 parser(r"(?)").parse().unwrap_err(),
3139 TestError {
3140 span: span(1..1),
3141 kind: ast::ErrorKind::RepetitionMissing,
3142 }
3143 );
3144 assert_eq!(
3145 parser(r"|*").parse().unwrap_err(),
3146 TestError {
3147 span: span(1..1),
3148 kind: ast::ErrorKind::RepetitionMissing,
3149 }
3150 );
3151 assert_eq!(
3152 parser(r"|+").parse().unwrap_err(),
3153 TestError {
3154 span: span(1..1),
3155 kind: ast::ErrorKind::RepetitionMissing,
3156 }
3157 );
3158 assert_eq!(
3159 parser(r"|?").parse().unwrap_err(),
3160 TestError {
3161 span: span(1..1),
3162 kind: ast::ErrorKind::RepetitionMissing,
3163 }
3164 );
3165 }
3166
3167 #[test]
parse_counted_repetition()3168 fn parse_counted_repetition() {
3169 assert_eq!(
3170 parser(r"a{5}").parse(),
3171 Ok(Ast::Repetition(ast::Repetition {
3172 span: span(0..4),
3173 op: ast::RepetitionOp {
3174 span: span(1..4),
3175 kind: ast::RepetitionKind::Range(
3176 ast::RepetitionRange::Exactly(5)
3177 ),
3178 },
3179 greedy: true,
3180 ast: Box::new(lit('a', 0)),
3181 }))
3182 );
3183 assert_eq!(
3184 parser(r"a{5,}").parse(),
3185 Ok(Ast::Repetition(ast::Repetition {
3186 span: span(0..5),
3187 op: ast::RepetitionOp {
3188 span: span(1..5),
3189 kind: ast::RepetitionKind::Range(
3190 ast::RepetitionRange::AtLeast(5)
3191 ),
3192 },
3193 greedy: true,
3194 ast: Box::new(lit('a', 0)),
3195 }))
3196 );
3197 assert_eq!(
3198 parser(r"a{5,9}").parse(),
3199 Ok(Ast::Repetition(ast::Repetition {
3200 span: span(0..6),
3201 op: ast::RepetitionOp {
3202 span: span(1..6),
3203 kind: ast::RepetitionKind::Range(
3204 ast::RepetitionRange::Bounded(5, 9)
3205 ),
3206 },
3207 greedy: true,
3208 ast: Box::new(lit('a', 0)),
3209 }))
3210 );
3211 assert_eq!(
3212 parser(r"a{5}?").parse(),
3213 Ok(Ast::Repetition(ast::Repetition {
3214 span: span(0..5),
3215 op: ast::RepetitionOp {
3216 span: span(1..5),
3217 kind: ast::RepetitionKind::Range(
3218 ast::RepetitionRange::Exactly(5)
3219 ),
3220 },
3221 greedy: false,
3222 ast: Box::new(lit('a', 0)),
3223 }))
3224 );
3225 assert_eq!(
3226 parser(r"ab{5}").parse(),
3227 Ok(concat(
3228 0..5,
3229 vec![
3230 lit('a', 0),
3231 Ast::Repetition(ast::Repetition {
3232 span: span(1..5),
3233 op: ast::RepetitionOp {
3234 span: span(2..5),
3235 kind: ast::RepetitionKind::Range(
3236 ast::RepetitionRange::Exactly(5)
3237 ),
3238 },
3239 greedy: true,
3240 ast: Box::new(lit('b', 1)),
3241 }),
3242 ]
3243 ))
3244 );
3245 assert_eq!(
3246 parser(r"ab{5}c").parse(),
3247 Ok(concat(
3248 0..6,
3249 vec![
3250 lit('a', 0),
3251 Ast::Repetition(ast::Repetition {
3252 span: span(1..5),
3253 op: ast::RepetitionOp {
3254 span: span(2..5),
3255 kind: ast::RepetitionKind::Range(
3256 ast::RepetitionRange::Exactly(5)
3257 ),
3258 },
3259 greedy: true,
3260 ast: Box::new(lit('b', 1)),
3261 }),
3262 lit('c', 5),
3263 ]
3264 ))
3265 );
3266
3267 assert_eq!(
3268 parser(r"a{ 5 }").parse(),
3269 Ok(Ast::Repetition(ast::Repetition {
3270 span: span(0..6),
3271 op: ast::RepetitionOp {
3272 span: span(1..6),
3273 kind: ast::RepetitionKind::Range(
3274 ast::RepetitionRange::Exactly(5)
3275 ),
3276 },
3277 greedy: true,
3278 ast: Box::new(lit('a', 0)),
3279 }))
3280 );
3281 assert_eq!(
3282 parser(r"a{ 5 , 9 }").parse(),
3283 Ok(Ast::Repetition(ast::Repetition {
3284 span: span(0..10),
3285 op: ast::RepetitionOp {
3286 span: span(1..10),
3287 kind: ast::RepetitionKind::Range(
3288 ast::RepetitionRange::Bounded(5, 9)
3289 ),
3290 },
3291 greedy: true,
3292 ast: Box::new(lit('a', 0)),
3293 }))
3294 );
3295 assert_eq!(
3296 parser_ignore_whitespace(r"a{5,9} ?").parse(),
3297 Ok(Ast::Repetition(ast::Repetition {
3298 span: span(0..8),
3299 op: ast::RepetitionOp {
3300 span: span(1..8),
3301 kind: ast::RepetitionKind::Range(
3302 ast::RepetitionRange::Bounded(5, 9)
3303 ),
3304 },
3305 greedy: false,
3306 ast: Box::new(lit('a', 0)),
3307 }))
3308 );
3309
3310 assert_eq!(
3311 parser(r"(?i){0}").parse().unwrap_err(),
3312 TestError {
3313 span: span(4..4),
3314 kind: ast::ErrorKind::RepetitionMissing,
3315 }
3316 );
3317 assert_eq!(
3318 parser(r"(?m){1,1}").parse().unwrap_err(),
3319 TestError {
3320 span: span(4..4),
3321 kind: ast::ErrorKind::RepetitionMissing,
3322 }
3323 );
3324 assert_eq!(
3325 parser(r"a{]}").parse().unwrap_err(),
3326 TestError {
3327 span: span(2..2),
3328 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3329 }
3330 );
3331 assert_eq!(
3332 parser(r"a{1,]}").parse().unwrap_err(),
3333 TestError {
3334 span: span(4..4),
3335 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3336 }
3337 );
3338 assert_eq!(
3339 parser(r"a{").parse().unwrap_err(),
3340 TestError {
3341 span: span(1..2),
3342 kind: ast::ErrorKind::RepetitionCountUnclosed,
3343 }
3344 );
3345 assert_eq!(
3346 parser(r"a{}").parse().unwrap_err(),
3347 TestError {
3348 span: span(2..2),
3349 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3350 }
3351 );
3352 assert_eq!(
3353 parser(r"a{a").parse().unwrap_err(),
3354 TestError {
3355 span: span(2..2),
3356 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3357 }
3358 );
3359 assert_eq!(
3360 parser(r"a{9999999999}").parse().unwrap_err(),
3361 TestError {
3362 span: span(2..12),
3363 kind: ast::ErrorKind::DecimalInvalid,
3364 }
3365 );
3366 assert_eq!(
3367 parser(r"a{9").parse().unwrap_err(),
3368 TestError {
3369 span: span(1..3),
3370 kind: ast::ErrorKind::RepetitionCountUnclosed,
3371 }
3372 );
3373 assert_eq!(
3374 parser(r"a{9,a").parse().unwrap_err(),
3375 TestError {
3376 span: span(4..4),
3377 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3378 }
3379 );
3380 assert_eq!(
3381 parser(r"a{9,9999999999}").parse().unwrap_err(),
3382 TestError {
3383 span: span(4..14),
3384 kind: ast::ErrorKind::DecimalInvalid,
3385 }
3386 );
3387 assert_eq!(
3388 parser(r"a{9,").parse().unwrap_err(),
3389 TestError {
3390 span: span(1..4),
3391 kind: ast::ErrorKind::RepetitionCountUnclosed,
3392 }
3393 );
3394 assert_eq!(
3395 parser(r"a{9,11").parse().unwrap_err(),
3396 TestError {
3397 span: span(1..6),
3398 kind: ast::ErrorKind::RepetitionCountUnclosed,
3399 }
3400 );
3401 assert_eq!(
3402 parser(r"a{2,1}").parse().unwrap_err(),
3403 TestError {
3404 span: span(1..6),
3405 kind: ast::ErrorKind::RepetitionCountInvalid,
3406 }
3407 );
3408 assert_eq!(
3409 parser(r"{5}").parse().unwrap_err(),
3410 TestError {
3411 span: span(0..0),
3412 kind: ast::ErrorKind::RepetitionMissing,
3413 }
3414 );
3415 assert_eq!(
3416 parser(r"|{5}").parse().unwrap_err(),
3417 TestError {
3418 span: span(1..1),
3419 kind: ast::ErrorKind::RepetitionMissing,
3420 }
3421 );
3422 }
3423
3424 #[test]
parse_alternate()3425 fn parse_alternate() {
3426 assert_eq!(
3427 parser(r"a|b").parse(),
3428 Ok(Ast::Alternation(ast::Alternation {
3429 span: span(0..3),
3430 asts: vec![lit('a', 0), lit('b', 2)],
3431 }))
3432 );
3433 assert_eq!(
3434 parser(r"(a|b)").parse(),
3435 Ok(group(
3436 0..5,
3437 1,
3438 Ast::Alternation(ast::Alternation {
3439 span: span(1..4),
3440 asts: vec![lit('a', 1), lit('b', 3)],
3441 })
3442 ))
3443 );
3444
3445 assert_eq!(
3446 parser(r"a|b|c").parse(),
3447 Ok(Ast::Alternation(ast::Alternation {
3448 span: span(0..5),
3449 asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3450 }))
3451 );
3452 assert_eq!(
3453 parser(r"ax|by|cz").parse(),
3454 Ok(Ast::Alternation(ast::Alternation {
3455 span: span(0..8),
3456 asts: vec![
3457 concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3458 concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3459 concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3460 ],
3461 }))
3462 );
3463 assert_eq!(
3464 parser(r"(ax|by|cz)").parse(),
3465 Ok(group(
3466 0..10,
3467 1,
3468 Ast::Alternation(ast::Alternation {
3469 span: span(1..9),
3470 asts: vec![
3471 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3472 concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3473 concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3474 ],
3475 })
3476 ))
3477 );
3478 assert_eq!(
3479 parser(r"(ax|(by|(cz)))").parse(),
3480 Ok(group(
3481 0..14,
3482 1,
3483 alt(
3484 1..13,
3485 vec![
3486 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3487 group(
3488 4..13,
3489 2,
3490 alt(
3491 5..12,
3492 vec![
3493 concat(
3494 5..7,
3495 vec![lit('b', 5), lit('y', 6)]
3496 ),
3497 group(
3498 8..12,
3499 3,
3500 concat(
3501 9..11,
3502 vec![lit('c', 9), lit('z', 10),]
3503 )
3504 ),
3505 ]
3506 )
3507 ),
3508 ]
3509 )
3510 ))
3511 );
3512
3513 assert_eq!(
3514 parser(r"|").parse(),
3515 Ok(alt(
3516 0..1,
3517 vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),]
3518 ))
3519 );
3520 assert_eq!(
3521 parser(r"||").parse(),
3522 Ok(alt(
3523 0..2,
3524 vec![
3525 Ast::Empty(span(0..0)),
3526 Ast::Empty(span(1..1)),
3527 Ast::Empty(span(2..2)),
3528 ]
3529 ))
3530 );
3531 assert_eq!(
3532 parser(r"a|").parse(),
3533 Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),]))
3534 );
3535 assert_eq!(
3536 parser(r"|a").parse(),
3537 Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),]))
3538 );
3539
3540 assert_eq!(
3541 parser(r"(|)").parse(),
3542 Ok(group(
3543 0..3,
3544 1,
3545 alt(
3546 1..2,
3547 vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),]
3548 )
3549 ))
3550 );
3551 assert_eq!(
3552 parser(r"(a|)").parse(),
3553 Ok(group(
3554 0..4,
3555 1,
3556 alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),])
3557 ))
3558 );
3559 assert_eq!(
3560 parser(r"(|a)").parse(),
3561 Ok(group(
3562 0..4,
3563 1,
3564 alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),])
3565 ))
3566 );
3567
3568 assert_eq!(
3569 parser(r"a|b)").parse().unwrap_err(),
3570 TestError {
3571 span: span(3..4),
3572 kind: ast::ErrorKind::GroupUnopened,
3573 }
3574 );
3575 assert_eq!(
3576 parser(r"(a|b").parse().unwrap_err(),
3577 TestError {
3578 span: span(0..1),
3579 kind: ast::ErrorKind::GroupUnclosed,
3580 }
3581 );
3582 }
3583
3584 #[test]
parse_unsupported_lookaround()3585 fn parse_unsupported_lookaround() {
3586 assert_eq!(
3587 parser(r"(?=a)").parse().unwrap_err(),
3588 TestError {
3589 span: span(0..3),
3590 kind: ast::ErrorKind::UnsupportedLookAround,
3591 }
3592 );
3593 assert_eq!(
3594 parser(r"(?!a)").parse().unwrap_err(),
3595 TestError {
3596 span: span(0..3),
3597 kind: ast::ErrorKind::UnsupportedLookAround,
3598 }
3599 );
3600 assert_eq!(
3601 parser(r"(?<=a)").parse().unwrap_err(),
3602 TestError {
3603 span: span(0..4),
3604 kind: ast::ErrorKind::UnsupportedLookAround,
3605 }
3606 );
3607 assert_eq!(
3608 parser(r"(?<!a)").parse().unwrap_err(),
3609 TestError {
3610 span: span(0..4),
3611 kind: ast::ErrorKind::UnsupportedLookAround,
3612 }
3613 );
3614 }
3615
3616 #[test]
parse_group()3617 fn parse_group() {
3618 assert_eq!(
3619 parser("(?i)").parse(),
3620 Ok(Ast::Flags(ast::SetFlags {
3621 span: span(0..4),
3622 flags: ast::Flags {
3623 span: span(2..3),
3624 items: vec![ast::FlagsItem {
3625 span: span(2..3),
3626 kind: ast::FlagsItemKind::Flag(
3627 ast::Flag::CaseInsensitive
3628 ),
3629 }],
3630 },
3631 }))
3632 );
3633 assert_eq!(
3634 parser("(?iU)").parse(),
3635 Ok(Ast::Flags(ast::SetFlags {
3636 span: span(0..5),
3637 flags: ast::Flags {
3638 span: span(2..4),
3639 items: vec![
3640 ast::FlagsItem {
3641 span: span(2..3),
3642 kind: ast::FlagsItemKind::Flag(
3643 ast::Flag::CaseInsensitive
3644 ),
3645 },
3646 ast::FlagsItem {
3647 span: span(3..4),
3648 kind: ast::FlagsItemKind::Flag(
3649 ast::Flag::SwapGreed
3650 ),
3651 },
3652 ],
3653 },
3654 }))
3655 );
3656 assert_eq!(
3657 parser("(?i-U)").parse(),
3658 Ok(Ast::Flags(ast::SetFlags {
3659 span: span(0..6),
3660 flags: ast::Flags {
3661 span: span(2..5),
3662 items: vec![
3663 ast::FlagsItem {
3664 span: span(2..3),
3665 kind: ast::FlagsItemKind::Flag(
3666 ast::Flag::CaseInsensitive
3667 ),
3668 },
3669 ast::FlagsItem {
3670 span: span(3..4),
3671 kind: ast::FlagsItemKind::Negation,
3672 },
3673 ast::FlagsItem {
3674 span: span(4..5),
3675 kind: ast::FlagsItemKind::Flag(
3676 ast::Flag::SwapGreed
3677 ),
3678 },
3679 ],
3680 },
3681 }))
3682 );
3683
3684 assert_eq!(
3685 parser("()").parse(),
3686 Ok(Ast::Group(ast::Group {
3687 span: span(0..2),
3688 kind: ast::GroupKind::CaptureIndex(1),
3689 ast: Box::new(Ast::Empty(span(1..1))),
3690 }))
3691 );
3692 assert_eq!(
3693 parser("(a)").parse(),
3694 Ok(Ast::Group(ast::Group {
3695 span: span(0..3),
3696 kind: ast::GroupKind::CaptureIndex(1),
3697 ast: Box::new(lit('a', 1)),
3698 }))
3699 );
3700 assert_eq!(
3701 parser("(())").parse(),
3702 Ok(Ast::Group(ast::Group {
3703 span: span(0..4),
3704 kind: ast::GroupKind::CaptureIndex(1),
3705 ast: Box::new(Ast::Group(ast::Group {
3706 span: span(1..3),
3707 kind: ast::GroupKind::CaptureIndex(2),
3708 ast: Box::new(Ast::Empty(span(2..2))),
3709 })),
3710 }))
3711 );
3712
3713 assert_eq!(
3714 parser("(?:a)").parse(),
3715 Ok(Ast::Group(ast::Group {
3716 span: span(0..5),
3717 kind: ast::GroupKind::NonCapturing(ast::Flags {
3718 span: span(2..2),
3719 items: vec![],
3720 }),
3721 ast: Box::new(lit('a', 3)),
3722 }))
3723 );
3724
3725 assert_eq!(
3726 parser("(?i:a)").parse(),
3727 Ok(Ast::Group(ast::Group {
3728 span: span(0..6),
3729 kind: ast::GroupKind::NonCapturing(ast::Flags {
3730 span: span(2..3),
3731 items: vec![ast::FlagsItem {
3732 span: span(2..3),
3733 kind: ast::FlagsItemKind::Flag(
3734 ast::Flag::CaseInsensitive
3735 ),
3736 },],
3737 }),
3738 ast: Box::new(lit('a', 4)),
3739 }))
3740 );
3741 assert_eq!(
3742 parser("(?i-U:a)").parse(),
3743 Ok(Ast::Group(ast::Group {
3744 span: span(0..8),
3745 kind: ast::GroupKind::NonCapturing(ast::Flags {
3746 span: span(2..5),
3747 items: vec![
3748 ast::FlagsItem {
3749 span: span(2..3),
3750 kind: ast::FlagsItemKind::Flag(
3751 ast::Flag::CaseInsensitive
3752 ),
3753 },
3754 ast::FlagsItem {
3755 span: span(3..4),
3756 kind: ast::FlagsItemKind::Negation,
3757 },
3758 ast::FlagsItem {
3759 span: span(4..5),
3760 kind: ast::FlagsItemKind::Flag(
3761 ast::Flag::SwapGreed
3762 ),
3763 },
3764 ],
3765 }),
3766 ast: Box::new(lit('a', 6)),
3767 }))
3768 );
3769
3770 assert_eq!(
3771 parser("(").parse().unwrap_err(),
3772 TestError {
3773 span: span(0..1),
3774 kind: ast::ErrorKind::GroupUnclosed,
3775 }
3776 );
3777 assert_eq!(
3778 parser("(?").parse().unwrap_err(),
3779 TestError {
3780 span: span(0..1),
3781 kind: ast::ErrorKind::GroupUnclosed,
3782 }
3783 );
3784 assert_eq!(
3785 parser("(?P").parse().unwrap_err(),
3786 TestError {
3787 span: span(2..3),
3788 kind: ast::ErrorKind::FlagUnrecognized,
3789 }
3790 );
3791 assert_eq!(
3792 parser("(?P<").parse().unwrap_err(),
3793 TestError {
3794 span: span(4..4),
3795 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3796 }
3797 );
3798 assert_eq!(
3799 parser("(a").parse().unwrap_err(),
3800 TestError {
3801 span: span(0..1),
3802 kind: ast::ErrorKind::GroupUnclosed,
3803 }
3804 );
3805 assert_eq!(
3806 parser("(()").parse().unwrap_err(),
3807 TestError {
3808 span: span(0..1),
3809 kind: ast::ErrorKind::GroupUnclosed,
3810 }
3811 );
3812 assert_eq!(
3813 parser(")").parse().unwrap_err(),
3814 TestError {
3815 span: span(0..1),
3816 kind: ast::ErrorKind::GroupUnopened,
3817 }
3818 );
3819 assert_eq!(
3820 parser("a)").parse().unwrap_err(),
3821 TestError {
3822 span: span(1..2),
3823 kind: ast::ErrorKind::GroupUnopened,
3824 }
3825 );
3826 }
3827
3828 #[test]
parse_capture_name()3829 fn parse_capture_name() {
3830 assert_eq!(
3831 parser("(?P<a>z)").parse(),
3832 Ok(Ast::Group(ast::Group {
3833 span: span(0..8),
3834 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3835 span: span(4..5),
3836 name: s("a"),
3837 index: 1,
3838 }),
3839 ast: Box::new(lit('z', 6)),
3840 }))
3841 );
3842 assert_eq!(
3843 parser("(?P<abc>z)").parse(),
3844 Ok(Ast::Group(ast::Group {
3845 span: span(0..10),
3846 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3847 span: span(4..7),
3848 name: s("abc"),
3849 index: 1,
3850 }),
3851 ast: Box::new(lit('z', 8)),
3852 }))
3853 );
3854
3855 assert_eq!(
3856 parser("(?P<a_1>z)").parse(),
3857 Ok(Ast::Group(ast::Group {
3858 span: span(0..10),
3859 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3860 span: span(4..7),
3861 name: s("a_1"),
3862 index: 1,
3863 }),
3864 ast: Box::new(lit('z', 8)),
3865 }))
3866 );
3867
3868 assert_eq!(
3869 parser("(?P<a.1>z)").parse(),
3870 Ok(Ast::Group(ast::Group {
3871 span: span(0..10),
3872 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3873 span: span(4..7),
3874 name: s("a.1"),
3875 index: 1,
3876 }),
3877 ast: Box::new(lit('z', 8)),
3878 }))
3879 );
3880
3881 assert_eq!(
3882 parser("(?P<a[1]>z)").parse(),
3883 Ok(Ast::Group(ast::Group {
3884 span: span(0..11),
3885 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3886 span: span(4..8),
3887 name: s("a[1]"),
3888 index: 1,
3889 }),
3890 ast: Box::new(lit('z', 9)),
3891 }))
3892 );
3893
3894 assert_eq!(
3895 parser("(?P<").parse().unwrap_err(),
3896 TestError {
3897 span: span(4..4),
3898 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3899 }
3900 );
3901 assert_eq!(
3902 parser("(?P<>z)").parse().unwrap_err(),
3903 TestError {
3904 span: span(4..4),
3905 kind: ast::ErrorKind::GroupNameEmpty,
3906 }
3907 );
3908 assert_eq!(
3909 parser("(?P<a").parse().unwrap_err(),
3910 TestError {
3911 span: span(5..5),
3912 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3913 }
3914 );
3915 assert_eq!(
3916 parser("(?P<ab").parse().unwrap_err(),
3917 TestError {
3918 span: span(6..6),
3919 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3920 }
3921 );
3922 assert_eq!(
3923 parser("(?P<0a").parse().unwrap_err(),
3924 TestError {
3925 span: span(4..5),
3926 kind: ast::ErrorKind::GroupNameInvalid,
3927 }
3928 );
3929 assert_eq!(
3930 parser("(?P<~").parse().unwrap_err(),
3931 TestError {
3932 span: span(4..5),
3933 kind: ast::ErrorKind::GroupNameInvalid,
3934 }
3935 );
3936 assert_eq!(
3937 parser("(?P<abc~").parse().unwrap_err(),
3938 TestError {
3939 span: span(7..8),
3940 kind: ast::ErrorKind::GroupNameInvalid,
3941 }
3942 );
3943 assert_eq!(
3944 parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
3945 TestError {
3946 span: span(12..13),
3947 kind: ast::ErrorKind::GroupNameDuplicate {
3948 original: span(4..5),
3949 },
3950 }
3951 );
3952 }
3953
3954 #[test]
parse_flags()3955 fn parse_flags() {
3956 assert_eq!(
3957 parser("i:").parse_flags(),
3958 Ok(ast::Flags {
3959 span: span(0..1),
3960 items: vec![ast::FlagsItem {
3961 span: span(0..1),
3962 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3963 }],
3964 })
3965 );
3966 assert_eq!(
3967 parser("i)").parse_flags(),
3968 Ok(ast::Flags {
3969 span: span(0..1),
3970 items: vec![ast::FlagsItem {
3971 span: span(0..1),
3972 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3973 }],
3974 })
3975 );
3976
3977 assert_eq!(
3978 parser("isU:").parse_flags(),
3979 Ok(ast::Flags {
3980 span: span(0..3),
3981 items: vec![
3982 ast::FlagsItem {
3983 span: span(0..1),
3984 kind: ast::FlagsItemKind::Flag(
3985 ast::Flag::CaseInsensitive
3986 ),
3987 },
3988 ast::FlagsItem {
3989 span: span(1..2),
3990 kind: ast::FlagsItemKind::Flag(
3991 ast::Flag::DotMatchesNewLine
3992 ),
3993 },
3994 ast::FlagsItem {
3995 span: span(2..3),
3996 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3997 },
3998 ],
3999 })
4000 );
4001
4002 assert_eq!(
4003 parser("-isU:").parse_flags(),
4004 Ok(ast::Flags {
4005 span: span(0..4),
4006 items: vec![
4007 ast::FlagsItem {
4008 span: span(0..1),
4009 kind: ast::FlagsItemKind::Negation,
4010 },
4011 ast::FlagsItem {
4012 span: span(1..2),
4013 kind: ast::FlagsItemKind::Flag(
4014 ast::Flag::CaseInsensitive
4015 ),
4016 },
4017 ast::FlagsItem {
4018 span: span(2..3),
4019 kind: ast::FlagsItemKind::Flag(
4020 ast::Flag::DotMatchesNewLine
4021 ),
4022 },
4023 ast::FlagsItem {
4024 span: span(3..4),
4025 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4026 },
4027 ],
4028 })
4029 );
4030 assert_eq!(
4031 parser("i-sU:").parse_flags(),
4032 Ok(ast::Flags {
4033 span: span(0..4),
4034 items: vec![
4035 ast::FlagsItem {
4036 span: span(0..1),
4037 kind: ast::FlagsItemKind::Flag(
4038 ast::Flag::CaseInsensitive
4039 ),
4040 },
4041 ast::FlagsItem {
4042 span: span(1..2),
4043 kind: ast::FlagsItemKind::Negation,
4044 },
4045 ast::FlagsItem {
4046 span: span(2..3),
4047 kind: ast::FlagsItemKind::Flag(
4048 ast::Flag::DotMatchesNewLine
4049 ),
4050 },
4051 ast::FlagsItem {
4052 span: span(3..4),
4053 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4054 },
4055 ],
4056 })
4057 );
4058
4059 assert_eq!(
4060 parser("isU").parse_flags().unwrap_err(),
4061 TestError {
4062 span: span(3..3),
4063 kind: ast::ErrorKind::FlagUnexpectedEof,
4064 }
4065 );
4066 assert_eq!(
4067 parser("isUa:").parse_flags().unwrap_err(),
4068 TestError {
4069 span: span(3..4),
4070 kind: ast::ErrorKind::FlagUnrecognized,
4071 }
4072 );
4073 assert_eq!(
4074 parser("isUi:").parse_flags().unwrap_err(),
4075 TestError {
4076 span: span(3..4),
4077 kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) },
4078 }
4079 );
4080 assert_eq!(
4081 parser("i-sU-i:").parse_flags().unwrap_err(),
4082 TestError {
4083 span: span(4..5),
4084 kind: ast::ErrorKind::FlagRepeatedNegation {
4085 original: span(1..2),
4086 },
4087 }
4088 );
4089 assert_eq!(
4090 parser("-)").parse_flags().unwrap_err(),
4091 TestError {
4092 span: span(0..1),
4093 kind: ast::ErrorKind::FlagDanglingNegation,
4094 }
4095 );
4096 assert_eq!(
4097 parser("i-)").parse_flags().unwrap_err(),
4098 TestError {
4099 span: span(1..2),
4100 kind: ast::ErrorKind::FlagDanglingNegation,
4101 }
4102 );
4103 assert_eq!(
4104 parser("iU-)").parse_flags().unwrap_err(),
4105 TestError {
4106 span: span(2..3),
4107 kind: ast::ErrorKind::FlagDanglingNegation,
4108 }
4109 );
4110 }
4111
4112 #[test]
parse_flag()4113 fn parse_flag() {
4114 assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
4115 assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
4116 assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
4117 assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
4118 assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
4119 assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
4120
4121 assert_eq!(
4122 parser("a").parse_flag().unwrap_err(),
4123 TestError {
4124 span: span(0..1),
4125 kind: ast::ErrorKind::FlagUnrecognized,
4126 }
4127 );
4128 assert_eq!(
4129 parser("☃").parse_flag().unwrap_err(),
4130 TestError {
4131 span: span_range("☃", 0..3),
4132 kind: ast::ErrorKind::FlagUnrecognized,
4133 }
4134 );
4135 }
4136
4137 #[test]
parse_primitive_non_escape()4138 fn parse_primitive_non_escape() {
4139 assert_eq!(
4140 parser(r".").parse_primitive(),
4141 Ok(Primitive::Dot(span(0..1)))
4142 );
4143 assert_eq!(
4144 parser(r"^").parse_primitive(),
4145 Ok(Primitive::Assertion(ast::Assertion {
4146 span: span(0..1),
4147 kind: ast::AssertionKind::StartLine,
4148 }))
4149 );
4150 assert_eq!(
4151 parser(r"$").parse_primitive(),
4152 Ok(Primitive::Assertion(ast::Assertion {
4153 span: span(0..1),
4154 kind: ast::AssertionKind::EndLine,
4155 }))
4156 );
4157
4158 assert_eq!(
4159 parser(r"a").parse_primitive(),
4160 Ok(Primitive::Literal(ast::Literal {
4161 span: span(0..1),
4162 kind: ast::LiteralKind::Verbatim,
4163 c: 'a',
4164 }))
4165 );
4166 assert_eq!(
4167 parser(r"|").parse_primitive(),
4168 Ok(Primitive::Literal(ast::Literal {
4169 span: span(0..1),
4170 kind: ast::LiteralKind::Verbatim,
4171 c: '|',
4172 }))
4173 );
4174 assert_eq!(
4175 parser(r"☃").parse_primitive(),
4176 Ok(Primitive::Literal(ast::Literal {
4177 span: span_range("☃", 0..3),
4178 kind: ast::LiteralKind::Verbatim,
4179 c: '☃',
4180 }))
4181 );
4182 }
4183
4184 #[test]
parse_escape()4185 fn parse_escape() {
4186 assert_eq!(
4187 parser(r"\|").parse_primitive(),
4188 Ok(Primitive::Literal(ast::Literal {
4189 span: span(0..2),
4190 kind: ast::LiteralKind::Punctuation,
4191 c: '|',
4192 }))
4193 );
4194 let specials = &[
4195 (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
4196 (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
4197 (r"\t", '\t', ast::SpecialLiteralKind::Tab),
4198 (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
4199 (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
4200 (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
4201 ];
4202 for &(pat, c, ref kind) in specials {
4203 assert_eq!(
4204 parser(pat).parse_primitive(),
4205 Ok(Primitive::Literal(ast::Literal {
4206 span: span(0..2),
4207 kind: ast::LiteralKind::Special(kind.clone()),
4208 c: c,
4209 }))
4210 );
4211 }
4212 assert_eq!(
4213 parser(r"\A").parse_primitive(),
4214 Ok(Primitive::Assertion(ast::Assertion {
4215 span: span(0..2),
4216 kind: ast::AssertionKind::StartText,
4217 }))
4218 );
4219 assert_eq!(
4220 parser(r"\z").parse_primitive(),
4221 Ok(Primitive::Assertion(ast::Assertion {
4222 span: span(0..2),
4223 kind: ast::AssertionKind::EndText,
4224 }))
4225 );
4226 assert_eq!(
4227 parser(r"\b").parse_primitive(),
4228 Ok(Primitive::Assertion(ast::Assertion {
4229 span: span(0..2),
4230 kind: ast::AssertionKind::WordBoundary,
4231 }))
4232 );
4233 assert_eq!(
4234 parser(r"\B").parse_primitive(),
4235 Ok(Primitive::Assertion(ast::Assertion {
4236 span: span(0..2),
4237 kind: ast::AssertionKind::NotWordBoundary,
4238 }))
4239 );
4240
4241 assert_eq!(
4242 parser(r"\").parse_escape().unwrap_err(),
4243 TestError {
4244 span: span(0..1),
4245 kind: ast::ErrorKind::EscapeUnexpectedEof,
4246 }
4247 );
4248 assert_eq!(
4249 parser(r"\y").parse_escape().unwrap_err(),
4250 TestError {
4251 span: span(0..2),
4252 kind: ast::ErrorKind::EscapeUnrecognized,
4253 }
4254 );
4255 }
4256
4257 #[test]
parse_unsupported_backreference()4258 fn parse_unsupported_backreference() {
4259 assert_eq!(
4260 parser(r"\0").parse_escape().unwrap_err(),
4261 TestError {
4262 span: span(0..2),
4263 kind: ast::ErrorKind::UnsupportedBackreference,
4264 }
4265 );
4266 assert_eq!(
4267 parser(r"\9").parse_escape().unwrap_err(),
4268 TestError {
4269 span: span(0..2),
4270 kind: ast::ErrorKind::UnsupportedBackreference,
4271 }
4272 );
4273 }
4274
4275 #[test]
parse_octal()4276 fn parse_octal() {
4277 for i in 0..511 {
4278 let pat = format!(r"\{:o}", i);
4279 assert_eq!(
4280 parser_octal(&pat).parse_escape(),
4281 Ok(Primitive::Literal(ast::Literal {
4282 span: span(0..pat.len()),
4283 kind: ast::LiteralKind::Octal,
4284 c: ::std::char::from_u32(i).unwrap(),
4285 }))
4286 );
4287 }
4288 assert_eq!(
4289 parser_octal(r"\778").parse_escape(),
4290 Ok(Primitive::Literal(ast::Literal {
4291 span: span(0..3),
4292 kind: ast::LiteralKind::Octal,
4293 c: '?',
4294 }))
4295 );
4296 assert_eq!(
4297 parser_octal(r"\7777").parse_escape(),
4298 Ok(Primitive::Literal(ast::Literal {
4299 span: span(0..4),
4300 kind: ast::LiteralKind::Octal,
4301 c: '\u{01FF}',
4302 }))
4303 );
4304 assert_eq!(
4305 parser_octal(r"\778").parse(),
4306 Ok(Ast::Concat(ast::Concat {
4307 span: span(0..4),
4308 asts: vec![
4309 Ast::Literal(ast::Literal {
4310 span: span(0..3),
4311 kind: ast::LiteralKind::Octal,
4312 c: '?',
4313 }),
4314 Ast::Literal(ast::Literal {
4315 span: span(3..4),
4316 kind: ast::LiteralKind::Verbatim,
4317 c: '8',
4318 }),
4319 ],
4320 }))
4321 );
4322 assert_eq!(
4323 parser_octal(r"\7777").parse(),
4324 Ok(Ast::Concat(ast::Concat {
4325 span: span(0..5),
4326 asts: vec![
4327 Ast::Literal(ast::Literal {
4328 span: span(0..4),
4329 kind: ast::LiteralKind::Octal,
4330 c: '\u{01FF}',
4331 }),
4332 Ast::Literal(ast::Literal {
4333 span: span(4..5),
4334 kind: ast::LiteralKind::Verbatim,
4335 c: '7',
4336 }),
4337 ],
4338 }))
4339 );
4340
4341 assert_eq!(
4342 parser_octal(r"\8").parse_escape().unwrap_err(),
4343 TestError {
4344 span: span(0..2),
4345 kind: ast::ErrorKind::EscapeUnrecognized,
4346 }
4347 );
4348 }
4349
4350 #[test]
parse_hex_two()4351 fn parse_hex_two() {
4352 for i in 0..256 {
4353 let pat = format!(r"\x{:02x}", i);
4354 assert_eq!(
4355 parser(&pat).parse_escape(),
4356 Ok(Primitive::Literal(ast::Literal {
4357 span: span(0..pat.len()),
4358 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
4359 c: ::std::char::from_u32(i).unwrap(),
4360 }))
4361 );
4362 }
4363
4364 assert_eq!(
4365 parser(r"\xF").parse_escape().unwrap_err(),
4366 TestError {
4367 span: span(3..3),
4368 kind: ast::ErrorKind::EscapeUnexpectedEof,
4369 }
4370 );
4371 assert_eq!(
4372 parser(r"\xG").parse_escape().unwrap_err(),
4373 TestError {
4374 span: span(2..3),
4375 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4376 }
4377 );
4378 assert_eq!(
4379 parser(r"\xFG").parse_escape().unwrap_err(),
4380 TestError {
4381 span: span(3..4),
4382 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4383 }
4384 );
4385 }
4386
4387 #[test]
parse_hex_four()4388 fn parse_hex_four() {
4389 for i in 0..65536 {
4390 let c = match ::std::char::from_u32(i) {
4391 None => continue,
4392 Some(c) => c,
4393 };
4394 let pat = format!(r"\u{:04x}", i);
4395 assert_eq!(
4396 parser(&pat).parse_escape(),
4397 Ok(Primitive::Literal(ast::Literal {
4398 span: span(0..pat.len()),
4399 kind: ast::LiteralKind::HexFixed(
4400 ast::HexLiteralKind::UnicodeShort
4401 ),
4402 c: c,
4403 }))
4404 );
4405 }
4406
4407 assert_eq!(
4408 parser(r"\uF").parse_escape().unwrap_err(),
4409 TestError {
4410 span: span(3..3),
4411 kind: ast::ErrorKind::EscapeUnexpectedEof,
4412 }
4413 );
4414 assert_eq!(
4415 parser(r"\uG").parse_escape().unwrap_err(),
4416 TestError {
4417 span: span(2..3),
4418 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4419 }
4420 );
4421 assert_eq!(
4422 parser(r"\uFG").parse_escape().unwrap_err(),
4423 TestError {
4424 span: span(3..4),
4425 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4426 }
4427 );
4428 assert_eq!(
4429 parser(r"\uFFG").parse_escape().unwrap_err(),
4430 TestError {
4431 span: span(4..5),
4432 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4433 }
4434 );
4435 assert_eq!(
4436 parser(r"\uFFFG").parse_escape().unwrap_err(),
4437 TestError {
4438 span: span(5..6),
4439 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4440 }
4441 );
4442 assert_eq!(
4443 parser(r"\uD800").parse_escape().unwrap_err(),
4444 TestError {
4445 span: span(2..6),
4446 kind: ast::ErrorKind::EscapeHexInvalid,
4447 }
4448 );
4449 }
4450
4451 #[test]
parse_hex_eight()4452 fn parse_hex_eight() {
4453 for i in 0..65536 {
4454 let c = match ::std::char::from_u32(i) {
4455 None => continue,
4456 Some(c) => c,
4457 };
4458 let pat = format!(r"\U{:08x}", i);
4459 assert_eq!(
4460 parser(&pat).parse_escape(),
4461 Ok(Primitive::Literal(ast::Literal {
4462 span: span(0..pat.len()),
4463 kind: ast::LiteralKind::HexFixed(
4464 ast::HexLiteralKind::UnicodeLong
4465 ),
4466 c: c,
4467 }))
4468 );
4469 }
4470
4471 assert_eq!(
4472 parser(r"\UF").parse_escape().unwrap_err(),
4473 TestError {
4474 span: span(3..3),
4475 kind: ast::ErrorKind::EscapeUnexpectedEof,
4476 }
4477 );
4478 assert_eq!(
4479 parser(r"\UG").parse_escape().unwrap_err(),
4480 TestError {
4481 span: span(2..3),
4482 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4483 }
4484 );
4485 assert_eq!(
4486 parser(r"\UFG").parse_escape().unwrap_err(),
4487 TestError {
4488 span: span(3..4),
4489 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4490 }
4491 );
4492 assert_eq!(
4493 parser(r"\UFFG").parse_escape().unwrap_err(),
4494 TestError {
4495 span: span(4..5),
4496 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4497 }
4498 );
4499 assert_eq!(
4500 parser(r"\UFFFG").parse_escape().unwrap_err(),
4501 TestError {
4502 span: span(5..6),
4503 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4504 }
4505 );
4506 assert_eq!(
4507 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4508 TestError {
4509 span: span(6..7),
4510 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4511 }
4512 );
4513 assert_eq!(
4514 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4515 TestError {
4516 span: span(7..8),
4517 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4518 }
4519 );
4520 assert_eq!(
4521 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4522 TestError {
4523 span: span(8..9),
4524 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4525 }
4526 );
4527 assert_eq!(
4528 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4529 TestError {
4530 span: span(9..10),
4531 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4532 }
4533 );
4534 }
4535
4536 #[test]
parse_hex_brace()4537 fn parse_hex_brace() {
4538 assert_eq!(
4539 parser(r"\u{26c4}").parse_escape(),
4540 Ok(Primitive::Literal(ast::Literal {
4541 span: span(0..8),
4542 kind: ast::LiteralKind::HexBrace(
4543 ast::HexLiteralKind::UnicodeShort
4544 ),
4545 c: '⛄',
4546 }))
4547 );
4548 assert_eq!(
4549 parser(r"\U{26c4}").parse_escape(),
4550 Ok(Primitive::Literal(ast::Literal {
4551 span: span(0..8),
4552 kind: ast::LiteralKind::HexBrace(
4553 ast::HexLiteralKind::UnicodeLong
4554 ),
4555 c: '⛄',
4556 }))
4557 );
4558 assert_eq!(
4559 parser(r"\x{26c4}").parse_escape(),
4560 Ok(Primitive::Literal(ast::Literal {
4561 span: span(0..8),
4562 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4563 c: '⛄',
4564 }))
4565 );
4566 assert_eq!(
4567 parser(r"\x{26C4}").parse_escape(),
4568 Ok(Primitive::Literal(ast::Literal {
4569 span: span(0..8),
4570 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4571 c: '⛄',
4572 }))
4573 );
4574 assert_eq!(
4575 parser(r"\x{10fFfF}").parse_escape(),
4576 Ok(Primitive::Literal(ast::Literal {
4577 span: span(0..10),
4578 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4579 c: '\u{10FFFF}',
4580 }))
4581 );
4582
4583 assert_eq!(
4584 parser(r"\x").parse_escape().unwrap_err(),
4585 TestError {
4586 span: span(2..2),
4587 kind: ast::ErrorKind::EscapeUnexpectedEof,
4588 }
4589 );
4590 assert_eq!(
4591 parser(r"\x{").parse_escape().unwrap_err(),
4592 TestError {
4593 span: span(2..3),
4594 kind: ast::ErrorKind::EscapeUnexpectedEof,
4595 }
4596 );
4597 assert_eq!(
4598 parser(r"\x{FF").parse_escape().unwrap_err(),
4599 TestError {
4600 span: span(2..5),
4601 kind: ast::ErrorKind::EscapeUnexpectedEof,
4602 }
4603 );
4604 assert_eq!(
4605 parser(r"\x{}").parse_escape().unwrap_err(),
4606 TestError {
4607 span: span(2..4),
4608 kind: ast::ErrorKind::EscapeHexEmpty,
4609 }
4610 );
4611 assert_eq!(
4612 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4613 TestError {
4614 span: span(4..5),
4615 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4616 }
4617 );
4618 assert_eq!(
4619 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4620 TestError {
4621 span: span(3..9),
4622 kind: ast::ErrorKind::EscapeHexInvalid,
4623 }
4624 );
4625 assert_eq!(
4626 parser(r"\x{D800}").parse_escape().unwrap_err(),
4627 TestError {
4628 span: span(3..7),
4629 kind: ast::ErrorKind::EscapeHexInvalid,
4630 }
4631 );
4632 assert_eq!(
4633 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4634 TestError {
4635 span: span(3..12),
4636 kind: ast::ErrorKind::EscapeHexInvalid,
4637 }
4638 );
4639 }
4640
4641 #[test]
parse_decimal()4642 fn parse_decimal() {
4643 assert_eq!(parser("123").parse_decimal(), Ok(123));
4644 assert_eq!(parser("0").parse_decimal(), Ok(0));
4645 assert_eq!(parser("01").parse_decimal(), Ok(1));
4646
4647 assert_eq!(
4648 parser("-1").parse_decimal().unwrap_err(),
4649 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4650 );
4651 assert_eq!(
4652 parser("").parse_decimal().unwrap_err(),
4653 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4654 );
4655 assert_eq!(
4656 parser("9999999999").parse_decimal().unwrap_err(),
4657 TestError {
4658 span: span(0..10),
4659 kind: ast::ErrorKind::DecimalInvalid,
4660 }
4661 );
4662 }
4663
4664 #[test]
parse_set_class()4665 fn parse_set_class() {
4666 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4667 ast::ClassSet::union(ast::ClassSetUnion {
4668 span: span,
4669 items: items,
4670 })
4671 }
4672
4673 fn intersection(
4674 span: Span,
4675 lhs: ast::ClassSet,
4676 rhs: ast::ClassSet,
4677 ) -> ast::ClassSet {
4678 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4679 span: span,
4680 kind: ast::ClassSetBinaryOpKind::Intersection,
4681 lhs: Box::new(lhs),
4682 rhs: Box::new(rhs),
4683 })
4684 }
4685
4686 fn difference(
4687 span: Span,
4688 lhs: ast::ClassSet,
4689 rhs: ast::ClassSet,
4690 ) -> ast::ClassSet {
4691 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4692 span: span,
4693 kind: ast::ClassSetBinaryOpKind::Difference,
4694 lhs: Box::new(lhs),
4695 rhs: Box::new(rhs),
4696 })
4697 }
4698
4699 fn symdifference(
4700 span: Span,
4701 lhs: ast::ClassSet,
4702 rhs: ast::ClassSet,
4703 ) -> ast::ClassSet {
4704 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4705 span: span,
4706 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4707 lhs: Box::new(lhs),
4708 rhs: Box::new(rhs),
4709 })
4710 }
4711
4712 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4713 ast::ClassSet::Item(item)
4714 }
4715
4716 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4717 ast::ClassSetItem::Ascii(cls)
4718 }
4719
4720 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4721 ast::ClassSetItem::Unicode(cls)
4722 }
4723
4724 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4725 ast::ClassSetItem::Perl(cls)
4726 }
4727
4728 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4729 ast::ClassSetItem::Bracketed(Box::new(cls))
4730 }
4731
4732 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4733 ast::ClassSetItem::Literal(ast::Literal {
4734 span: span,
4735 kind: ast::LiteralKind::Verbatim,
4736 c: c,
4737 })
4738 }
4739
4740 fn empty(span: Span) -> ast::ClassSetItem {
4741 ast::ClassSetItem::Empty(span)
4742 }
4743
4744 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4745 let pos1 = Position {
4746 offset: span.start.offset + start.len_utf8(),
4747 column: span.start.column + 1,
4748 ..span.start
4749 };
4750 let pos2 = Position {
4751 offset: span.end.offset - end.len_utf8(),
4752 column: span.end.column - 1,
4753 ..span.end
4754 };
4755 ast::ClassSetItem::Range(ast::ClassSetRange {
4756 span: span,
4757 start: ast::Literal {
4758 span: Span { end: pos1, ..span },
4759 kind: ast::LiteralKind::Verbatim,
4760 c: start,
4761 },
4762 end: ast::Literal {
4763 span: Span { start: pos2, ..span },
4764 kind: ast::LiteralKind::Verbatim,
4765 c: end,
4766 },
4767 })
4768 }
4769
4770 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4771 ast::ClassAscii {
4772 span: span,
4773 kind: ast::ClassAsciiKind::Alnum,
4774 negated: negated,
4775 }
4776 }
4777
4778 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4779 ast::ClassAscii {
4780 span: span,
4781 kind: ast::ClassAsciiKind::Lower,
4782 negated: negated,
4783 }
4784 }
4785
4786 assert_eq!(
4787 parser("[[:alnum:]]").parse(),
4788 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4789 span: span(0..11),
4790 negated: false,
4791 kind: itemset(item_ascii(alnum(span(1..10), false))),
4792 })))
4793 );
4794 assert_eq!(
4795 parser("[[[:alnum:]]]").parse(),
4796 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4797 span: span(0..13),
4798 negated: false,
4799 kind: itemset(item_bracket(ast::ClassBracketed {
4800 span: span(1..12),
4801 negated: false,
4802 kind: itemset(item_ascii(alnum(span(2..11), false))),
4803 })),
4804 })))
4805 );
4806 assert_eq!(
4807 parser("[[:alnum:]&&[:lower:]]").parse(),
4808 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4809 span: span(0..22),
4810 negated: false,
4811 kind: intersection(
4812 span(1..21),
4813 itemset(item_ascii(alnum(span(1..10), false))),
4814 itemset(item_ascii(lower(span(12..21), false))),
4815 ),
4816 })))
4817 );
4818 assert_eq!(
4819 parser("[[:alnum:]--[:lower:]]").parse(),
4820 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4821 span: span(0..22),
4822 negated: false,
4823 kind: difference(
4824 span(1..21),
4825 itemset(item_ascii(alnum(span(1..10), false))),
4826 itemset(item_ascii(lower(span(12..21), false))),
4827 ),
4828 })))
4829 );
4830 assert_eq!(
4831 parser("[[:alnum:]~~[:lower:]]").parse(),
4832 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4833 span: span(0..22),
4834 negated: false,
4835 kind: symdifference(
4836 span(1..21),
4837 itemset(item_ascii(alnum(span(1..10), false))),
4838 itemset(item_ascii(lower(span(12..21), false))),
4839 ),
4840 })))
4841 );
4842
4843 assert_eq!(
4844 parser("[a]").parse(),
4845 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4846 span: span(0..3),
4847 negated: false,
4848 kind: itemset(lit(span(1..2), 'a')),
4849 })))
4850 );
4851 assert_eq!(
4852 parser(r"[a\]]").parse(),
4853 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4854 span: span(0..5),
4855 negated: false,
4856 kind: union(
4857 span(1..4),
4858 vec![
4859 lit(span(1..2), 'a'),
4860 ast::ClassSetItem::Literal(ast::Literal {
4861 span: span(2..4),
4862 kind: ast::LiteralKind::Punctuation,
4863 c: ']',
4864 }),
4865 ]
4866 ),
4867 })))
4868 );
4869 assert_eq!(
4870 parser(r"[a\-z]").parse(),
4871 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4872 span: span(0..6),
4873 negated: false,
4874 kind: union(
4875 span(1..5),
4876 vec![
4877 lit(span(1..2), 'a'),
4878 ast::ClassSetItem::Literal(ast::Literal {
4879 span: span(2..4),
4880 kind: ast::LiteralKind::Punctuation,
4881 c: '-',
4882 }),
4883 lit(span(4..5), 'z'),
4884 ]
4885 ),
4886 })))
4887 );
4888 assert_eq!(
4889 parser("[ab]").parse(),
4890 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4891 span: span(0..4),
4892 negated: false,
4893 kind: union(
4894 span(1..3),
4895 vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
4896 ),
4897 })))
4898 );
4899 assert_eq!(
4900 parser("[a-]").parse(),
4901 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4902 span: span(0..4),
4903 negated: false,
4904 kind: union(
4905 span(1..3),
4906 vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
4907 ),
4908 })))
4909 );
4910 assert_eq!(
4911 parser("[-a]").parse(),
4912 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4913 span: span(0..4),
4914 negated: false,
4915 kind: union(
4916 span(1..3),
4917 vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
4918 ),
4919 })))
4920 );
4921 assert_eq!(
4922 parser(r"[\pL]").parse(),
4923 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4924 span: span(0..5),
4925 negated: false,
4926 kind: itemset(item_unicode(ast::ClassUnicode {
4927 span: span(1..4),
4928 negated: false,
4929 kind: ast::ClassUnicodeKind::OneLetter('L'),
4930 })),
4931 })))
4932 );
4933 assert_eq!(
4934 parser(r"[\w]").parse(),
4935 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4936 span: span(0..4),
4937 negated: false,
4938 kind: itemset(item_perl(ast::ClassPerl {
4939 span: span(1..3),
4940 kind: ast::ClassPerlKind::Word,
4941 negated: false,
4942 })),
4943 })))
4944 );
4945 assert_eq!(
4946 parser(r"[a\wz]").parse(),
4947 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4948 span: span(0..6),
4949 negated: false,
4950 kind: union(
4951 span(1..5),
4952 vec![
4953 lit(span(1..2), 'a'),
4954 item_perl(ast::ClassPerl {
4955 span: span(2..4),
4956 kind: ast::ClassPerlKind::Word,
4957 negated: false,
4958 }),
4959 lit(span(4..5), 'z'),
4960 ]
4961 ),
4962 })))
4963 );
4964
4965 assert_eq!(
4966 parser("[a-z]").parse(),
4967 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4968 span: span(0..5),
4969 negated: false,
4970 kind: itemset(range(span(1..4), 'a', 'z')),
4971 })))
4972 );
4973 assert_eq!(
4974 parser("[a-cx-z]").parse(),
4975 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4976 span: span(0..8),
4977 negated: false,
4978 kind: union(
4979 span(1..7),
4980 vec![
4981 range(span(1..4), 'a', 'c'),
4982 range(span(4..7), 'x', 'z'),
4983 ]
4984 ),
4985 })))
4986 );
4987 assert_eq!(
4988 parser(r"[\w&&a-cx-z]").parse(),
4989 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4990 span: span(0..12),
4991 negated: false,
4992 kind: intersection(
4993 span(1..11),
4994 itemset(item_perl(ast::ClassPerl {
4995 span: span(1..3),
4996 kind: ast::ClassPerlKind::Word,
4997 negated: false,
4998 })),
4999 union(
5000 span(5..11),
5001 vec![
5002 range(span(5..8), 'a', 'c'),
5003 range(span(8..11), 'x', 'z'),
5004 ]
5005 ),
5006 ),
5007 })))
5008 );
5009 assert_eq!(
5010 parser(r"[a-cx-z&&\w]").parse(),
5011 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5012 span: span(0..12),
5013 negated: false,
5014 kind: intersection(
5015 span(1..11),
5016 union(
5017 span(1..7),
5018 vec![
5019 range(span(1..4), 'a', 'c'),
5020 range(span(4..7), 'x', 'z'),
5021 ]
5022 ),
5023 itemset(item_perl(ast::ClassPerl {
5024 span: span(9..11),
5025 kind: ast::ClassPerlKind::Word,
5026 negated: false,
5027 })),
5028 ),
5029 })))
5030 );
5031 assert_eq!(
5032 parser(r"[a--b--c]").parse(),
5033 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5034 span: span(0..9),
5035 negated: false,
5036 kind: difference(
5037 span(1..8),
5038 difference(
5039 span(1..5),
5040 itemset(lit(span(1..2), 'a')),
5041 itemset(lit(span(4..5), 'b')),
5042 ),
5043 itemset(lit(span(7..8), 'c')),
5044 ),
5045 })))
5046 );
5047 assert_eq!(
5048 parser(r"[a~~b~~c]").parse(),
5049 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5050 span: span(0..9),
5051 negated: false,
5052 kind: symdifference(
5053 span(1..8),
5054 symdifference(
5055 span(1..5),
5056 itemset(lit(span(1..2), 'a')),
5057 itemset(lit(span(4..5), 'b')),
5058 ),
5059 itemset(lit(span(7..8), 'c')),
5060 ),
5061 })))
5062 );
5063 assert_eq!(
5064 parser(r"[\^&&^]").parse(),
5065 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5066 span: span(0..7),
5067 negated: false,
5068 kind: intersection(
5069 span(1..6),
5070 itemset(ast::ClassSetItem::Literal(ast::Literal {
5071 span: span(1..3),
5072 kind: ast::LiteralKind::Punctuation,
5073 c: '^',
5074 })),
5075 itemset(lit(span(5..6), '^')),
5076 ),
5077 })))
5078 );
5079 assert_eq!(
5080 parser(r"[\&&&&]").parse(),
5081 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5082 span: span(0..7),
5083 negated: false,
5084 kind: intersection(
5085 span(1..6),
5086 itemset(ast::ClassSetItem::Literal(ast::Literal {
5087 span: span(1..3),
5088 kind: ast::LiteralKind::Punctuation,
5089 c: '&',
5090 })),
5091 itemset(lit(span(5..6), '&')),
5092 ),
5093 })))
5094 );
5095 assert_eq!(
5096 parser(r"[&&&&]").parse(),
5097 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5098 span: span(0..6),
5099 negated: false,
5100 kind: intersection(
5101 span(1..5),
5102 intersection(
5103 span(1..3),
5104 itemset(empty(span(1..1))),
5105 itemset(empty(span(3..3))),
5106 ),
5107 itemset(empty(span(5..5))),
5108 ),
5109 })))
5110 );
5111
5112 let pat = "[☃-⛄]";
5113 assert_eq!(
5114 parser(pat).parse(),
5115 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5116 span: span_range(pat, 0..9),
5117 negated: false,
5118 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
5119 span: span_range(pat, 1..8),
5120 start: ast::Literal {
5121 span: span_range(pat, 1..4),
5122 kind: ast::LiteralKind::Verbatim,
5123 c: '☃',
5124 },
5125 end: ast::Literal {
5126 span: span_range(pat, 5..8),
5127 kind: ast::LiteralKind::Verbatim,
5128 c: '⛄',
5129 },
5130 })),
5131 })))
5132 );
5133
5134 assert_eq!(
5135 parser(r"[]]").parse(),
5136 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5137 span: span(0..3),
5138 negated: false,
5139 kind: itemset(lit(span(1..2), ']')),
5140 })))
5141 );
5142 assert_eq!(
5143 parser(r"[]\[]").parse(),
5144 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5145 span: span(0..5),
5146 negated: false,
5147 kind: union(
5148 span(1..4),
5149 vec![
5150 lit(span(1..2), ']'),
5151 ast::ClassSetItem::Literal(ast::Literal {
5152 span: span(2..4),
5153 kind: ast::LiteralKind::Punctuation,
5154 c: '[',
5155 }),
5156 ]
5157 ),
5158 })))
5159 );
5160 assert_eq!(
5161 parser(r"[\[]]").parse(),
5162 Ok(concat(
5163 0..5,
5164 vec![
5165 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5166 span: span(0..4),
5167 negated: false,
5168 kind: itemset(ast::ClassSetItem::Literal(
5169 ast::Literal {
5170 span: span(1..3),
5171 kind: ast::LiteralKind::Punctuation,
5172 c: '[',
5173 }
5174 )),
5175 })),
5176 Ast::Literal(ast::Literal {
5177 span: span(4..5),
5178 kind: ast::LiteralKind::Verbatim,
5179 c: ']',
5180 }),
5181 ]
5182 ))
5183 );
5184
5185 assert_eq!(
5186 parser("[").parse().unwrap_err(),
5187 TestError {
5188 span: span(0..1),
5189 kind: ast::ErrorKind::ClassUnclosed,
5190 }
5191 );
5192 assert_eq!(
5193 parser("[[").parse().unwrap_err(),
5194 TestError {
5195 span: span(1..2),
5196 kind: ast::ErrorKind::ClassUnclosed,
5197 }
5198 );
5199 assert_eq!(
5200 parser("[[-]").parse().unwrap_err(),
5201 TestError {
5202 span: span(0..1),
5203 kind: ast::ErrorKind::ClassUnclosed,
5204 }
5205 );
5206 assert_eq!(
5207 parser("[[[:alnum:]").parse().unwrap_err(),
5208 TestError {
5209 span: span(1..2),
5210 kind: ast::ErrorKind::ClassUnclosed,
5211 }
5212 );
5213 assert_eq!(
5214 parser(r"[\b]").parse().unwrap_err(),
5215 TestError {
5216 span: span(1..3),
5217 kind: ast::ErrorKind::ClassEscapeInvalid,
5218 }
5219 );
5220 assert_eq!(
5221 parser(r"[\w-a]").parse().unwrap_err(),
5222 TestError {
5223 span: span(1..3),
5224 kind: ast::ErrorKind::ClassRangeLiteral,
5225 }
5226 );
5227 assert_eq!(
5228 parser(r"[a-\w]").parse().unwrap_err(),
5229 TestError {
5230 span: span(3..5),
5231 kind: ast::ErrorKind::ClassRangeLiteral,
5232 }
5233 );
5234 assert_eq!(
5235 parser(r"[z-a]").parse().unwrap_err(),
5236 TestError {
5237 span: span(1..4),
5238 kind: ast::ErrorKind::ClassRangeInvalid,
5239 }
5240 );
5241
5242 assert_eq!(
5243 parser_ignore_whitespace("[a ").parse().unwrap_err(),
5244 TestError {
5245 span: span(0..1),
5246 kind: ast::ErrorKind::ClassUnclosed,
5247 }
5248 );
5249 assert_eq!(
5250 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
5251 TestError {
5252 span: span(0..1),
5253 kind: ast::ErrorKind::ClassUnclosed,
5254 }
5255 );
5256 }
5257
5258 #[test]
parse_set_class_open()5259 fn parse_set_class_open() {
5260 assert_eq!(parser("[a]").parse_set_class_open(), {
5261 let set = ast::ClassBracketed {
5262 span: span(0..1),
5263 negated: false,
5264 kind: ast::ClassSet::union(ast::ClassSetUnion {
5265 span: span(1..1),
5266 items: vec![],
5267 }),
5268 };
5269 let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5270 Ok((set, union))
5271 });
5272 assert_eq!(
5273 parser_ignore_whitespace("[ a]").parse_set_class_open(),
5274 {
5275 let set = ast::ClassBracketed {
5276 span: span(0..4),
5277 negated: false,
5278 kind: ast::ClassSet::union(ast::ClassSetUnion {
5279 span: span(4..4),
5280 items: vec![],
5281 }),
5282 };
5283 let union =
5284 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5285 Ok((set, union))
5286 }
5287 );
5288 assert_eq!(parser("[^a]").parse_set_class_open(), {
5289 let set = ast::ClassBracketed {
5290 span: span(0..2),
5291 negated: true,
5292 kind: ast::ClassSet::union(ast::ClassSetUnion {
5293 span: span(2..2),
5294 items: vec![],
5295 }),
5296 };
5297 let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5298 Ok((set, union))
5299 });
5300 assert_eq!(
5301 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5302 {
5303 let set = ast::ClassBracketed {
5304 span: span(0..4),
5305 negated: true,
5306 kind: ast::ClassSet::union(ast::ClassSetUnion {
5307 span: span(4..4),
5308 items: vec![],
5309 }),
5310 };
5311 let union =
5312 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5313 Ok((set, union))
5314 }
5315 );
5316 assert_eq!(parser("[-a]").parse_set_class_open(), {
5317 let set = ast::ClassBracketed {
5318 span: span(0..2),
5319 negated: false,
5320 kind: ast::ClassSet::union(ast::ClassSetUnion {
5321 span: span(1..1),
5322 items: vec![],
5323 }),
5324 };
5325 let union = ast::ClassSetUnion {
5326 span: span(1..2),
5327 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5328 span: span(1..2),
5329 kind: ast::LiteralKind::Verbatim,
5330 c: '-',
5331 })],
5332 };
5333 Ok((set, union))
5334 });
5335 assert_eq!(
5336 parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5337 {
5338 let set = ast::ClassBracketed {
5339 span: span(0..4),
5340 negated: false,
5341 kind: ast::ClassSet::union(ast::ClassSetUnion {
5342 span: span(2..2),
5343 items: vec![],
5344 }),
5345 };
5346 let union = ast::ClassSetUnion {
5347 span: span(2..3),
5348 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5349 span: span(2..3),
5350 kind: ast::LiteralKind::Verbatim,
5351 c: '-',
5352 })],
5353 };
5354 Ok((set, union))
5355 }
5356 );
5357 assert_eq!(parser("[^-a]").parse_set_class_open(), {
5358 let set = ast::ClassBracketed {
5359 span: span(0..3),
5360 negated: true,
5361 kind: ast::ClassSet::union(ast::ClassSetUnion {
5362 span: span(2..2),
5363 items: vec![],
5364 }),
5365 };
5366 let union = ast::ClassSetUnion {
5367 span: span(2..3),
5368 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5369 span: span(2..3),
5370 kind: ast::LiteralKind::Verbatim,
5371 c: '-',
5372 })],
5373 };
5374 Ok((set, union))
5375 });
5376 assert_eq!(parser("[--a]").parse_set_class_open(), {
5377 let set = ast::ClassBracketed {
5378 span: span(0..3),
5379 negated: false,
5380 kind: ast::ClassSet::union(ast::ClassSetUnion {
5381 span: span(1..1),
5382 items: vec![],
5383 }),
5384 };
5385 let union = ast::ClassSetUnion {
5386 span: span(1..3),
5387 items: vec![
5388 ast::ClassSetItem::Literal(ast::Literal {
5389 span: span(1..2),
5390 kind: ast::LiteralKind::Verbatim,
5391 c: '-',
5392 }),
5393 ast::ClassSetItem::Literal(ast::Literal {
5394 span: span(2..3),
5395 kind: ast::LiteralKind::Verbatim,
5396 c: '-',
5397 }),
5398 ],
5399 };
5400 Ok((set, union))
5401 });
5402 assert_eq!(parser("[]a]").parse_set_class_open(), {
5403 let set = ast::ClassBracketed {
5404 span: span(0..2),
5405 negated: false,
5406 kind: ast::ClassSet::union(ast::ClassSetUnion {
5407 span: span(1..1),
5408 items: vec![],
5409 }),
5410 };
5411 let union = ast::ClassSetUnion {
5412 span: span(1..2),
5413 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5414 span: span(1..2),
5415 kind: ast::LiteralKind::Verbatim,
5416 c: ']',
5417 })],
5418 };
5419 Ok((set, union))
5420 });
5421 assert_eq!(
5422 parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5423 {
5424 let set = ast::ClassBracketed {
5425 span: span(0..4),
5426 negated: false,
5427 kind: ast::ClassSet::union(ast::ClassSetUnion {
5428 span: span(2..2),
5429 items: vec![],
5430 }),
5431 };
5432 let union = ast::ClassSetUnion {
5433 span: span(2..3),
5434 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5435 span: span(2..3),
5436 kind: ast::LiteralKind::Verbatim,
5437 c: ']',
5438 })],
5439 };
5440 Ok((set, union))
5441 }
5442 );
5443 assert_eq!(parser("[^]a]").parse_set_class_open(), {
5444 let set = ast::ClassBracketed {
5445 span: span(0..3),
5446 negated: true,
5447 kind: ast::ClassSet::union(ast::ClassSetUnion {
5448 span: span(2..2),
5449 items: vec![],
5450 }),
5451 };
5452 let union = ast::ClassSetUnion {
5453 span: span(2..3),
5454 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5455 span: span(2..3),
5456 kind: ast::LiteralKind::Verbatim,
5457 c: ']',
5458 })],
5459 };
5460 Ok((set, union))
5461 });
5462 assert_eq!(parser("[-]a]").parse_set_class_open(), {
5463 let set = ast::ClassBracketed {
5464 span: span(0..2),
5465 negated: false,
5466 kind: ast::ClassSet::union(ast::ClassSetUnion {
5467 span: span(1..1),
5468 items: vec![],
5469 }),
5470 };
5471 let union = ast::ClassSetUnion {
5472 span: span(1..2),
5473 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5474 span: span(1..2),
5475 kind: ast::LiteralKind::Verbatim,
5476 c: '-',
5477 })],
5478 };
5479 Ok((set, union))
5480 });
5481
5482 assert_eq!(
5483 parser("[").parse_set_class_open().unwrap_err(),
5484 TestError {
5485 span: span(0..1),
5486 kind: ast::ErrorKind::ClassUnclosed,
5487 }
5488 );
5489 assert_eq!(
5490 parser_ignore_whitespace("[ ")
5491 .parse_set_class_open()
5492 .unwrap_err(),
5493 TestError {
5494 span: span(0..5),
5495 kind: ast::ErrorKind::ClassUnclosed,
5496 }
5497 );
5498 assert_eq!(
5499 parser("[^").parse_set_class_open().unwrap_err(),
5500 TestError {
5501 span: span(0..2),
5502 kind: ast::ErrorKind::ClassUnclosed,
5503 }
5504 );
5505 assert_eq!(
5506 parser("[]").parse_set_class_open().unwrap_err(),
5507 TestError {
5508 span: span(0..2),
5509 kind: ast::ErrorKind::ClassUnclosed,
5510 }
5511 );
5512 assert_eq!(
5513 parser("[-").parse_set_class_open().unwrap_err(),
5514 TestError {
5515 span: span(0..2),
5516 kind: ast::ErrorKind::ClassUnclosed,
5517 }
5518 );
5519 assert_eq!(
5520 parser("[--").parse_set_class_open().unwrap_err(),
5521 TestError {
5522 span: span(0..3),
5523 kind: ast::ErrorKind::ClassUnclosed,
5524 }
5525 );
5526 }
5527
5528 #[test]
maybe_parse_ascii_class()5529 fn maybe_parse_ascii_class() {
5530 assert_eq!(
5531 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5532 Some(ast::ClassAscii {
5533 span: span(0..9),
5534 kind: ast::ClassAsciiKind::Alnum,
5535 negated: false,
5536 })
5537 );
5538 assert_eq!(
5539 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5540 Some(ast::ClassAscii {
5541 span: span(0..9),
5542 kind: ast::ClassAsciiKind::Alnum,
5543 negated: false,
5544 })
5545 );
5546 assert_eq!(
5547 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5548 Some(ast::ClassAscii {
5549 span: span(0..10),
5550 kind: ast::ClassAsciiKind::Alnum,
5551 negated: true,
5552 })
5553 );
5554
5555 let p = parser(r"[:");
5556 assert_eq!(p.maybe_parse_ascii_class(), None);
5557 assert_eq!(p.offset(), 0);
5558
5559 let p = parser(r"[:^");
5560 assert_eq!(p.maybe_parse_ascii_class(), None);
5561 assert_eq!(p.offset(), 0);
5562
5563 let p = parser(r"[^:alnum:]");
5564 assert_eq!(p.maybe_parse_ascii_class(), None);
5565 assert_eq!(p.offset(), 0);
5566
5567 let p = parser(r"[:alnnum:]");
5568 assert_eq!(p.maybe_parse_ascii_class(), None);
5569 assert_eq!(p.offset(), 0);
5570
5571 let p = parser(r"[:alnum]");
5572 assert_eq!(p.maybe_parse_ascii_class(), None);
5573 assert_eq!(p.offset(), 0);
5574
5575 let p = parser(r"[:alnum:");
5576 assert_eq!(p.maybe_parse_ascii_class(), None);
5577 assert_eq!(p.offset(), 0);
5578 }
5579
5580 #[test]
parse_unicode_class()5581 fn parse_unicode_class() {
5582 assert_eq!(
5583 parser(r"\pN").parse_escape(),
5584 Ok(Primitive::Unicode(ast::ClassUnicode {
5585 span: span(0..3),
5586 negated: false,
5587 kind: ast::ClassUnicodeKind::OneLetter('N'),
5588 }))
5589 );
5590 assert_eq!(
5591 parser(r"\PN").parse_escape(),
5592 Ok(Primitive::Unicode(ast::ClassUnicode {
5593 span: span(0..3),
5594 negated: true,
5595 kind: ast::ClassUnicodeKind::OneLetter('N'),
5596 }))
5597 );
5598 assert_eq!(
5599 parser(r"\p{N}").parse_escape(),
5600 Ok(Primitive::Unicode(ast::ClassUnicode {
5601 span: span(0..5),
5602 negated: false,
5603 kind: ast::ClassUnicodeKind::Named(s("N")),
5604 }))
5605 );
5606 assert_eq!(
5607 parser(r"\P{N}").parse_escape(),
5608 Ok(Primitive::Unicode(ast::ClassUnicode {
5609 span: span(0..5),
5610 negated: true,
5611 kind: ast::ClassUnicodeKind::Named(s("N")),
5612 }))
5613 );
5614 assert_eq!(
5615 parser(r"\p{Greek}").parse_escape(),
5616 Ok(Primitive::Unicode(ast::ClassUnicode {
5617 span: span(0..9),
5618 negated: false,
5619 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5620 }))
5621 );
5622
5623 assert_eq!(
5624 parser(r"\p{scx:Katakana}").parse_escape(),
5625 Ok(Primitive::Unicode(ast::ClassUnicode {
5626 span: span(0..16),
5627 negated: false,
5628 kind: ast::ClassUnicodeKind::NamedValue {
5629 op: ast::ClassUnicodeOpKind::Colon,
5630 name: s("scx"),
5631 value: s("Katakana"),
5632 },
5633 }))
5634 );
5635 assert_eq!(
5636 parser(r"\p{scx=Katakana}").parse_escape(),
5637 Ok(Primitive::Unicode(ast::ClassUnicode {
5638 span: span(0..16),
5639 negated: false,
5640 kind: ast::ClassUnicodeKind::NamedValue {
5641 op: ast::ClassUnicodeOpKind::Equal,
5642 name: s("scx"),
5643 value: s("Katakana"),
5644 },
5645 }))
5646 );
5647 assert_eq!(
5648 parser(r"\p{scx!=Katakana}").parse_escape(),
5649 Ok(Primitive::Unicode(ast::ClassUnicode {
5650 span: span(0..17),
5651 negated: false,
5652 kind: ast::ClassUnicodeKind::NamedValue {
5653 op: ast::ClassUnicodeOpKind::NotEqual,
5654 name: s("scx"),
5655 value: s("Katakana"),
5656 },
5657 }))
5658 );
5659
5660 assert_eq!(
5661 parser(r"\p{:}").parse_escape(),
5662 Ok(Primitive::Unicode(ast::ClassUnicode {
5663 span: span(0..5),
5664 negated: false,
5665 kind: ast::ClassUnicodeKind::NamedValue {
5666 op: ast::ClassUnicodeOpKind::Colon,
5667 name: s(""),
5668 value: s(""),
5669 },
5670 }))
5671 );
5672 assert_eq!(
5673 parser(r"\p{=}").parse_escape(),
5674 Ok(Primitive::Unicode(ast::ClassUnicode {
5675 span: span(0..5),
5676 negated: false,
5677 kind: ast::ClassUnicodeKind::NamedValue {
5678 op: ast::ClassUnicodeOpKind::Equal,
5679 name: s(""),
5680 value: s(""),
5681 },
5682 }))
5683 );
5684 assert_eq!(
5685 parser(r"\p{!=}").parse_escape(),
5686 Ok(Primitive::Unicode(ast::ClassUnicode {
5687 span: span(0..6),
5688 negated: false,
5689 kind: ast::ClassUnicodeKind::NamedValue {
5690 op: ast::ClassUnicodeOpKind::NotEqual,
5691 name: s(""),
5692 value: s(""),
5693 },
5694 }))
5695 );
5696
5697 assert_eq!(
5698 parser(r"\p").parse_escape().unwrap_err(),
5699 TestError {
5700 span: span(2..2),
5701 kind: ast::ErrorKind::EscapeUnexpectedEof,
5702 }
5703 );
5704 assert_eq!(
5705 parser(r"\p{").parse_escape().unwrap_err(),
5706 TestError {
5707 span: span(3..3),
5708 kind: ast::ErrorKind::EscapeUnexpectedEof,
5709 }
5710 );
5711 assert_eq!(
5712 parser(r"\p{N").parse_escape().unwrap_err(),
5713 TestError {
5714 span: span(4..4),
5715 kind: ast::ErrorKind::EscapeUnexpectedEof,
5716 }
5717 );
5718 assert_eq!(
5719 parser(r"\p{Greek").parse_escape().unwrap_err(),
5720 TestError {
5721 span: span(8..8),
5722 kind: ast::ErrorKind::EscapeUnexpectedEof,
5723 }
5724 );
5725
5726 assert_eq!(
5727 parser(r"\pNz").parse(),
5728 Ok(Ast::Concat(ast::Concat {
5729 span: span(0..4),
5730 asts: vec![
5731 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5732 span: span(0..3),
5733 negated: false,
5734 kind: ast::ClassUnicodeKind::OneLetter('N'),
5735 })),
5736 Ast::Literal(ast::Literal {
5737 span: span(3..4),
5738 kind: ast::LiteralKind::Verbatim,
5739 c: 'z',
5740 }),
5741 ],
5742 }))
5743 );
5744 assert_eq!(
5745 parser(r"\p{Greek}z").parse(),
5746 Ok(Ast::Concat(ast::Concat {
5747 span: span(0..10),
5748 asts: vec![
5749 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5750 span: span(0..9),
5751 negated: false,
5752 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5753 })),
5754 Ast::Literal(ast::Literal {
5755 span: span(9..10),
5756 kind: ast::LiteralKind::Verbatim,
5757 c: 'z',
5758 }),
5759 ],
5760 }))
5761 );
5762 assert_eq!(
5763 parser(r"\p\{").parse().unwrap_err(),
5764 TestError {
5765 span: span(2..3),
5766 kind: ast::ErrorKind::UnicodeClassInvalid,
5767 }
5768 );
5769 assert_eq!(
5770 parser(r"\P\{").parse().unwrap_err(),
5771 TestError {
5772 span: span(2..3),
5773 kind: ast::ErrorKind::UnicodeClassInvalid,
5774 }
5775 );
5776 }
5777
5778 #[test]
parse_perl_class()5779 fn parse_perl_class() {
5780 assert_eq!(
5781 parser(r"\d").parse_escape(),
5782 Ok(Primitive::Perl(ast::ClassPerl {
5783 span: span(0..2),
5784 kind: ast::ClassPerlKind::Digit,
5785 negated: false,
5786 }))
5787 );
5788 assert_eq!(
5789 parser(r"\D").parse_escape(),
5790 Ok(Primitive::Perl(ast::ClassPerl {
5791 span: span(0..2),
5792 kind: ast::ClassPerlKind::Digit,
5793 negated: true,
5794 }))
5795 );
5796 assert_eq!(
5797 parser(r"\s").parse_escape(),
5798 Ok(Primitive::Perl(ast::ClassPerl {
5799 span: span(0..2),
5800 kind: ast::ClassPerlKind::Space,
5801 negated: false,
5802 }))
5803 );
5804 assert_eq!(
5805 parser(r"\S").parse_escape(),
5806 Ok(Primitive::Perl(ast::ClassPerl {
5807 span: span(0..2),
5808 kind: ast::ClassPerlKind::Space,
5809 negated: true,
5810 }))
5811 );
5812 assert_eq!(
5813 parser(r"\w").parse_escape(),
5814 Ok(Primitive::Perl(ast::ClassPerl {
5815 span: span(0..2),
5816 kind: ast::ClassPerlKind::Word,
5817 negated: false,
5818 }))
5819 );
5820 assert_eq!(
5821 parser(r"\W").parse_escape(),
5822 Ok(Primitive::Perl(ast::ClassPerl {
5823 span: span(0..2),
5824 kind: ast::ClassPerlKind::Word,
5825 negated: true,
5826 }))
5827 );
5828
5829 assert_eq!(
5830 parser(r"\d").parse(),
5831 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
5832 span: span(0..2),
5833 kind: ast::ClassPerlKind::Digit,
5834 negated: false,
5835 })))
5836 );
5837 assert_eq!(
5838 parser(r"\dz").parse(),
5839 Ok(Ast::Concat(ast::Concat {
5840 span: span(0..3),
5841 asts: vec![
5842 Ast::Class(ast::Class::Perl(ast::ClassPerl {
5843 span: span(0..2),
5844 kind: ast::ClassPerlKind::Digit,
5845 negated: false,
5846 })),
5847 Ast::Literal(ast::Literal {
5848 span: span(2..3),
5849 kind: ast::LiteralKind::Verbatim,
5850 c: 'z',
5851 }),
5852 ],
5853 }))
5854 );
5855 }
5856
5857 // This tests a bug fix where the nest limit checker wasn't decrementing
5858 // its depth during post-traversal, which causes long regexes to trip
5859 // the default limit too aggressively.
5860 #[test]
regression_454_nest_too_big()5861 fn regression_454_nest_too_big() {
5862 let pattern = r#"
5863 2(?:
5864 [45]\d{3}|
5865 7(?:
5866 1[0-267]|
5867 2[0-289]|
5868 3[0-29]|
5869 4[01]|
5870 5[1-3]|
5871 6[013]|
5872 7[0178]|
5873 91
5874 )|
5875 8(?:
5876 0[125]|
5877 [139][1-6]|
5878 2[0157-9]|
5879 41|
5880 6[1-35]|
5881 7[1-5]|
5882 8[1-8]|
5883 90
5884 )|
5885 9(?:
5886 0[0-2]|
5887 1[0-4]|
5888 2[568]|
5889 3[3-6]|
5890 5[5-7]|
5891 6[0167]|
5892 7[15]|
5893 8[0146-9]
5894 )
5895 )\d{4}
5896 "#;
5897 assert!(parser_nest_limit(pattern, 50).parse().is_ok());
5898 }
5899
5900 // This tests that we treat a trailing `-` in a character class as a
5901 // literal `-` even when whitespace mode is enabled and there is whitespace
5902 // after the trailing `-`.
5903 #[test]
regression_455_trailing_dash_ignore_whitespace()5904 fn regression_455_trailing_dash_ignore_whitespace() {
5905 assert!(parser("(?x)[ / - ]").parse().is_ok());
5906 assert!(parser("(?x)[ a - ]").parse().is_ok());
5907 assert!(parser(
5908 "(?x)[
5909 a
5910 - ]
5911 "
5912 )
5913 .parse()
5914 .is_ok());
5915 assert!(parser(
5916 "(?x)[
5917 a # wat
5918 - ]
5919 "
5920 )
5921 .parse()
5922 .is_ok());
5923
5924 assert!(parser("(?x)[ / -").parse().is_err());
5925 assert!(parser("(?x)[ / - ").parse().is_err());
5926 assert!(parser(
5927 "(?x)[
5928 / -
5929 "
5930 )
5931 .parse()
5932 .is_err());
5933 assert!(parser(
5934 "(?x)[
5935 / - # wat
5936 "
5937 )
5938 .parse()
5939 .is_err());
5940 }
5941 }
5942