1 /*!
2 Defines a translator that converts an `Ast` to an `Hir`.
3 */
4
5 use std::cell::{Cell, RefCell};
6 use std::result;
7
8 use crate::ast::{self, Ast, Span, Visitor};
9 use crate::hir::{self, Error, ErrorKind, Hir};
10 use crate::unicode::{self, ClassQuery};
11
12 type Result<T> = result::Result<T, Error>;
13
/// A builder for constructing an AST->HIR translator.
///
/// The builder only records configuration; `build` copies the recorded
/// values into a fresh `Translator`.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether a built translator may produce HIR that can match invalid
    /// UTF-8.
    allow_invalid_utf8: bool,
    /// The flag settings a built translator starts out with.
    flags: Flags,
}
20
21 impl Default for TranslatorBuilder {
default() -> TranslatorBuilder22 fn default() -> TranslatorBuilder {
23 TranslatorBuilder::new()
24 }
25 }
26
27 impl TranslatorBuilder {
28 /// Create a new translator builder with a default c onfiguration.
new() -> TranslatorBuilder29 pub fn new() -> TranslatorBuilder {
30 TranslatorBuilder {
31 allow_invalid_utf8: false,
32 flags: Flags::default(),
33 }
34 }
35
36 /// Build a translator using the current configuration.
build(&self) -> Translator37 pub fn build(&self) -> Translator {
38 Translator {
39 stack: RefCell::new(vec![]),
40 flags: Cell::new(self.flags),
41 allow_invalid_utf8: self.allow_invalid_utf8,
42 }
43 }
44
45 /// When enabled, translation will permit the construction of a regular
46 /// expression that may match invalid UTF-8.
47 ///
48 /// When disabled (the default), the translator is guaranteed to produce
49 /// an expression that will only ever match valid UTF-8 (otherwise, the
50 /// translator will return an error).
51 ///
52 /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
53 /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
54 /// the parser to return an error. Namely, a negated ASCII word boundary
55 /// can result in matching positions that aren't valid UTF-8 boundaries.
allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder56 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
57 self.allow_invalid_utf8 = yes;
58 self
59 }
60
61 /// Enable or disable the case insensitive flag (`i`) by default.
case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder62 pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
63 self.flags.case_insensitive = if yes { Some(true) } else { None };
64 self
65 }
66
67 /// Enable or disable the multi-line matching flag (`m`) by default.
multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder68 pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
69 self.flags.multi_line = if yes { Some(true) } else { None };
70 self
71 }
72
73 /// Enable or disable the "dot matches any character" flag (`s`) by
74 /// default.
dot_matches_new_line( &mut self, yes: bool, ) -> &mut TranslatorBuilder75 pub fn dot_matches_new_line(
76 &mut self,
77 yes: bool,
78 ) -> &mut TranslatorBuilder {
79 self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
80 self
81 }
82
83 /// Enable or disable the "swap greed" flag (`U`) by default.
swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder84 pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
85 self.flags.swap_greed = if yes { Some(true) } else { None };
86 self
87 }
88
89 /// Enable or disable the Unicode flag (`u`) by default.
unicode(&mut self, yes: bool) -> &mut TranslatorBuilder90 pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
91 self.flags.unicode = if yes { None } else { Some(false) };
92 self
93 }
94 }
95
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    ///
    /// Kept in a `RefCell` so that frames can be pushed and popped through
    /// a shared reference to the translator.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    ///
    /// Kept in a `Cell` so that the active flags can be replaced through a
    /// shared reference to the translator.
    flags: Cell<Flags>,
    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
    allow_invalid_utf8: bool,
}
113
114 impl Translator {
115 /// Create a new translator using the default configuration.
new() -> Translator116 pub fn new() -> Translator {
117 TranslatorBuilder::new().build()
118 }
119
120 /// Translate the given abstract syntax tree (AST) into a high level
121 /// intermediate representation (HIR).
122 ///
123 /// If there was a problem doing the translation, then an HIR-specific
124 /// error is returned.
125 ///
126 /// The original pattern string used to produce the `Ast` *must* also be
127 /// provided. The translator does not use the pattern string during any
128 /// correct translation, but is used for error reporting.
translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir>129 pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
130 ast::visit(ast, TranslatorI::new(self, pattern))
131 }
132 }
133
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ///
    /// Unicode character classes are created when Unicode mode (`u`) is
    /// enabled.
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `allow_invalid_utf8` is disabled (the default), then a byte
    /// character class is only permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed on to the stack upon first seeing any kind of group,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group. If the group doesn't set any flags, then this is simply
        /// equivalent to whatever flags were set when the group was opened.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Flags,
    },
    /// This is pushed whenever a non-empty concatenation is observed. After
    /// visiting every sub-expression in the concatenation, the translator's
    /// stack is popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever a non-empty alternation is observed. After
    /// visiting every sub-expression in the alternation, the translator's
    /// stack is popped until it sees an Alternation frame.
    Alternation,
}
185
186 impl HirFrame {
187 /// Assert that the current stack frame is an Hir expression and return it.
unwrap_expr(self) -> Hir188 fn unwrap_expr(self) -> Hir {
189 match self {
190 HirFrame::Expr(expr) => expr,
191 _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
192 }
193 }
194
195 /// Assert that the current stack frame is a Unicode class expression and
196 /// return it.
unwrap_class_unicode(self) -> hir::ClassUnicode197 fn unwrap_class_unicode(self) -> hir::ClassUnicode {
198 match self {
199 HirFrame::ClassUnicode(cls) => cls,
200 _ => panic!(
201 "tried to unwrap Unicode class \
202 from HirFrame, got: {:?}",
203 self
204 ),
205 }
206 }
207
208 /// Assert that the current stack frame is a byte class expression and
209 /// return it.
unwrap_class_bytes(self) -> hir::ClassBytes210 fn unwrap_class_bytes(self) -> hir::ClassBytes {
211 match self {
212 HirFrame::ClassBytes(cls) => cls,
213 _ => panic!(
214 "tried to unwrap byte class \
215 from HirFrame, got: {:?}",
216 self
217 ),
218 }
219 }
220
221 /// Assert that the current stack frame is a group indicator and return
222 /// its corresponding flags (the flags that were active at the time the
223 /// group was entered).
unwrap_group(self) -> Flags224 fn unwrap_group(self) -> Flags {
225 match self {
226 HirFrame::Group { old_flags } => old_flags,
227 _ => {
228 panic!("tried to unwrap group from HirFrame, got: {:?}", self)
229 }
230 }
231 }
232 }
233
234 impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
235 type Output = Hir;
236 type Err = Error;
237
finish(self) -> Result<Hir>238 fn finish(self) -> Result<Hir> {
239 // ... otherwise, we should have exactly one HIR on the stack.
240 assert_eq!(self.trans().stack.borrow().len(), 1);
241 Ok(self.pop().unwrap().unwrap_expr())
242 }
243
visit_pre(&mut self, ast: &Ast) -> Result<()>244 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
245 match *ast {
246 Ast::Class(ast::Class::Bracketed(_)) => {
247 if self.flags().unicode() {
248 let cls = hir::ClassUnicode::empty();
249 self.push(HirFrame::ClassUnicode(cls));
250 } else {
251 let cls = hir::ClassBytes::empty();
252 self.push(HirFrame::ClassBytes(cls));
253 }
254 }
255 Ast::Group(ref x) => {
256 let old_flags = x
257 .flags()
258 .map(|ast| self.set_flags(ast))
259 .unwrap_or_else(|| self.flags());
260 self.push(HirFrame::Group { old_flags });
261 }
262 Ast::Concat(ref x) if x.asts.is_empty() => {}
263 Ast::Concat(_) => {
264 self.push(HirFrame::Concat);
265 }
266 Ast::Alternation(ref x) if x.asts.is_empty() => {}
267 Ast::Alternation(_) => {
268 self.push(HirFrame::Alternation);
269 }
270 _ => {}
271 }
272 Ok(())
273 }
274
visit_post(&mut self, ast: &Ast) -> Result<()>275 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
276 match *ast {
277 Ast::Empty(_) => {
278 self.push(HirFrame::Expr(Hir::empty()));
279 }
280 Ast::Flags(ref x) => {
281 self.set_flags(&x.flags);
282 // Flags in the AST are generally considered directives and
283 // not actual sub-expressions. However, they can be used in
284 // the concrete syntax like `((?i))`, and we need some kind of
285 // indication of an expression there, and Empty is the correct
286 // choice.
287 //
288 // There can also be things like `(?i)+`, but we rule those out
289 // in the parser. In the future, we might allow them for
290 // consistency sake.
291 self.push(HirFrame::Expr(Hir::empty()));
292 }
293 Ast::Literal(ref x) => {
294 self.push(HirFrame::Expr(self.hir_literal(x)?));
295 }
296 Ast::Dot(span) => {
297 self.push(HirFrame::Expr(self.hir_dot(span)?));
298 }
299 Ast::Assertion(ref x) => {
300 self.push(HirFrame::Expr(self.hir_assertion(x)?));
301 }
302 Ast::Class(ast::Class::Perl(ref x)) => {
303 if self.flags().unicode() {
304 let cls = self.hir_perl_unicode_class(x)?;
305 let hcls = hir::Class::Unicode(cls);
306 self.push(HirFrame::Expr(Hir::class(hcls)));
307 } else {
308 let cls = self.hir_perl_byte_class(x);
309 let hcls = hir::Class::Bytes(cls);
310 self.push(HirFrame::Expr(Hir::class(hcls)));
311 }
312 }
313 Ast::Class(ast::Class::Unicode(ref x)) => {
314 let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
315 self.push(HirFrame::Expr(Hir::class(cls)));
316 }
317 Ast::Class(ast::Class::Bracketed(ref ast)) => {
318 if self.flags().unicode() {
319 let mut cls = self.pop().unwrap().unwrap_class_unicode();
320 self.unicode_fold_and_negate(
321 &ast.span,
322 ast.negated,
323 &mut cls,
324 )?;
325 if cls.ranges().is_empty() {
326 return Err(self.error(
327 ast.span,
328 ErrorKind::EmptyClassNotAllowed,
329 ));
330 }
331 let expr = Hir::class(hir::Class::Unicode(cls));
332 self.push(HirFrame::Expr(expr));
333 } else {
334 let mut cls = self.pop().unwrap().unwrap_class_bytes();
335 self.bytes_fold_and_negate(
336 &ast.span,
337 ast.negated,
338 &mut cls,
339 )?;
340 if cls.ranges().is_empty() {
341 return Err(self.error(
342 ast.span,
343 ErrorKind::EmptyClassNotAllowed,
344 ));
345 }
346
347 let expr = Hir::class(hir::Class::Bytes(cls));
348 self.push(HirFrame::Expr(expr));
349 }
350 }
351 Ast::Repetition(ref x) => {
352 let expr = self.pop().unwrap().unwrap_expr();
353 self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
354 }
355 Ast::Group(ref x) => {
356 let expr = self.pop().unwrap().unwrap_expr();
357 let old_flags = self.pop().unwrap().unwrap_group();
358 self.trans().flags.set(old_flags);
359 self.push(HirFrame::Expr(self.hir_group(x, expr)));
360 }
361 Ast::Concat(_) => {
362 let mut exprs = vec![];
363 while let Some(HirFrame::Expr(expr)) = self.pop() {
364 if !expr.kind().is_empty() {
365 exprs.push(expr);
366 }
367 }
368 exprs.reverse();
369 self.push(HirFrame::Expr(Hir::concat(exprs)));
370 }
371 Ast::Alternation(_) => {
372 let mut exprs = vec![];
373 while let Some(HirFrame::Expr(expr)) = self.pop() {
374 exprs.push(expr);
375 }
376 exprs.reverse();
377 self.push(HirFrame::Expr(Hir::alternation(exprs)));
378 }
379 }
380 Ok(())
381 }
382
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>383 fn visit_class_set_item_pre(
384 &mut self,
385 ast: &ast::ClassSetItem,
386 ) -> Result<()> {
387 match *ast {
388 ast::ClassSetItem::Bracketed(_) => {
389 if self.flags().unicode() {
390 let cls = hir::ClassUnicode::empty();
391 self.push(HirFrame::ClassUnicode(cls));
392 } else {
393 let cls = hir::ClassBytes::empty();
394 self.push(HirFrame::ClassBytes(cls));
395 }
396 }
397 // We needn't handle the Union case here since the visitor will
398 // do it for us.
399 _ => {}
400 }
401 Ok(())
402 }
403
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>404 fn visit_class_set_item_post(
405 &mut self,
406 ast: &ast::ClassSetItem,
407 ) -> Result<()> {
408 match *ast {
409 ast::ClassSetItem::Empty(_) => {}
410 ast::ClassSetItem::Literal(ref x) => {
411 if self.flags().unicode() {
412 let mut cls = self.pop().unwrap().unwrap_class_unicode();
413 cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
414 self.push(HirFrame::ClassUnicode(cls));
415 } else {
416 let mut cls = self.pop().unwrap().unwrap_class_bytes();
417 let byte = self.class_literal_byte(x)?;
418 cls.push(hir::ClassBytesRange::new(byte, byte));
419 self.push(HirFrame::ClassBytes(cls));
420 }
421 }
422 ast::ClassSetItem::Range(ref x) => {
423 if self.flags().unicode() {
424 let mut cls = self.pop().unwrap().unwrap_class_unicode();
425 cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
426 self.push(HirFrame::ClassUnicode(cls));
427 } else {
428 let mut cls = self.pop().unwrap().unwrap_class_bytes();
429 let start = self.class_literal_byte(&x.start)?;
430 let end = self.class_literal_byte(&x.end)?;
431 cls.push(hir::ClassBytesRange::new(start, end));
432 self.push(HirFrame::ClassBytes(cls));
433 }
434 }
435 ast::ClassSetItem::Ascii(ref x) => {
436 if self.flags().unicode() {
437 let xcls = self.hir_ascii_unicode_class(x)?;
438 let mut cls = self.pop().unwrap().unwrap_class_unicode();
439 cls.union(&xcls);
440 self.push(HirFrame::ClassUnicode(cls));
441 } else {
442 let xcls = self.hir_ascii_byte_class(x)?;
443 let mut cls = self.pop().unwrap().unwrap_class_bytes();
444 cls.union(&xcls);
445 self.push(HirFrame::ClassBytes(cls));
446 }
447 }
448 ast::ClassSetItem::Unicode(ref x) => {
449 let xcls = self.hir_unicode_class(x)?;
450 let mut cls = self.pop().unwrap().unwrap_class_unicode();
451 cls.union(&xcls);
452 self.push(HirFrame::ClassUnicode(cls));
453 }
454 ast::ClassSetItem::Perl(ref x) => {
455 if self.flags().unicode() {
456 let xcls = self.hir_perl_unicode_class(x)?;
457 let mut cls = self.pop().unwrap().unwrap_class_unicode();
458 cls.union(&xcls);
459 self.push(HirFrame::ClassUnicode(cls));
460 } else {
461 let xcls = self.hir_perl_byte_class(x);
462 let mut cls = self.pop().unwrap().unwrap_class_bytes();
463 cls.union(&xcls);
464 self.push(HirFrame::ClassBytes(cls));
465 }
466 }
467 ast::ClassSetItem::Bracketed(ref ast) => {
468 if self.flags().unicode() {
469 let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
470 self.unicode_fold_and_negate(
471 &ast.span,
472 ast.negated,
473 &mut cls1,
474 )?;
475
476 let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
477 cls2.union(&cls1);
478 self.push(HirFrame::ClassUnicode(cls2));
479 } else {
480 let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
481 self.bytes_fold_and_negate(
482 &ast.span,
483 ast.negated,
484 &mut cls1,
485 )?;
486
487 let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
488 cls2.union(&cls1);
489 self.push(HirFrame::ClassBytes(cls2));
490 }
491 }
492 // This is handled automatically by the visitor.
493 ast::ClassSetItem::Union(_) => {}
494 }
495 Ok(())
496 }
497
visit_class_set_binary_op_pre( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>498 fn visit_class_set_binary_op_pre(
499 &mut self,
500 _op: &ast::ClassSetBinaryOp,
501 ) -> Result<()> {
502 if self.flags().unicode() {
503 let cls = hir::ClassUnicode::empty();
504 self.push(HirFrame::ClassUnicode(cls));
505 } else {
506 let cls = hir::ClassBytes::empty();
507 self.push(HirFrame::ClassBytes(cls));
508 }
509 Ok(())
510 }
511
visit_class_set_binary_op_in( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>512 fn visit_class_set_binary_op_in(
513 &mut self,
514 _op: &ast::ClassSetBinaryOp,
515 ) -> Result<()> {
516 if self.flags().unicode() {
517 let cls = hir::ClassUnicode::empty();
518 self.push(HirFrame::ClassUnicode(cls));
519 } else {
520 let cls = hir::ClassBytes::empty();
521 self.push(HirFrame::ClassBytes(cls));
522 }
523 Ok(())
524 }
525
visit_class_set_binary_op_post( &mut self, op: &ast::ClassSetBinaryOp, ) -> Result<()>526 fn visit_class_set_binary_op_post(
527 &mut self,
528 op: &ast::ClassSetBinaryOp,
529 ) -> Result<()> {
530 use crate::ast::ClassSetBinaryOpKind::*;
531
532 if self.flags().unicode() {
533 let mut rhs = self.pop().unwrap().unwrap_class_unicode();
534 let mut lhs = self.pop().unwrap().unwrap_class_unicode();
535 let mut cls = self.pop().unwrap().unwrap_class_unicode();
536 if self.flags().case_insensitive() {
537 rhs.try_case_fold_simple().map_err(|_| {
538 self.error(
539 op.rhs.span().clone(),
540 ErrorKind::UnicodeCaseUnavailable,
541 )
542 })?;
543 lhs.try_case_fold_simple().map_err(|_| {
544 self.error(
545 op.lhs.span().clone(),
546 ErrorKind::UnicodeCaseUnavailable,
547 )
548 })?;
549 }
550 match op.kind {
551 Intersection => lhs.intersect(&rhs),
552 Difference => lhs.difference(&rhs),
553 SymmetricDifference => lhs.symmetric_difference(&rhs),
554 }
555 cls.union(&lhs);
556 self.push(HirFrame::ClassUnicode(cls));
557 } else {
558 let mut rhs = self.pop().unwrap().unwrap_class_bytes();
559 let mut lhs = self.pop().unwrap().unwrap_class_bytes();
560 let mut cls = self.pop().unwrap().unwrap_class_bytes();
561 if self.flags().case_insensitive() {
562 rhs.case_fold_simple();
563 lhs.case_fold_simple();
564 }
565 match op.kind {
566 Intersection => lhs.intersect(&rhs),
567 Difference => lhs.difference(&rhs),
568 SymmetricDifference => lhs.symmetric_difference(&rhs),
569 }
570 cls.union(&lhs);
571 self.push(HirFrame::ClassBytes(cls));
572 }
573 Ok(())
574 }
575 }
576
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
    /// The translator whose stack and active flags are used during
    /// translation.
    trans: &'t Translator,
    /// The original pattern string, copied into any error that is created.
    pattern: &'p str,
}
588
589 impl<'t, 'p> TranslatorI<'t, 'p> {
590 /// Build a new internal translator.
new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p>591 fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
592 TranslatorI { trans, pattern }
593 }
594
595 /// Return a reference to the underlying translator.
trans(&self) -> &Translator596 fn trans(&self) -> &Translator {
597 &self.trans
598 }
599
600 /// Push the given frame on to the call stack.
push(&self, frame: HirFrame)601 fn push(&self, frame: HirFrame) {
602 self.trans().stack.borrow_mut().push(frame);
603 }
604
605 /// Pop the top of the call stack. If the call stack is empty, return None.
pop(&self) -> Option<HirFrame>606 fn pop(&self) -> Option<HirFrame> {
607 self.trans().stack.borrow_mut().pop()
608 }
609
610 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ErrorKind) -> Error611 fn error(&self, span: Span, kind: ErrorKind) -> Error {
612 Error { kind, pattern: self.pattern.to_string(), span }
613 }
614
615 /// Return a copy of the active flags.
flags(&self) -> Flags616 fn flags(&self) -> Flags {
617 self.trans().flags.get()
618 }
619
620 /// Set the flags of this translator from the flags set in the given AST.
621 /// Then, return the old flags.
set_flags(&self, ast_flags: &ast::Flags) -> Flags622 fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
623 let old_flags = self.flags();
624 let mut new_flags = Flags::from_ast(ast_flags);
625 new_flags.merge(&old_flags);
626 self.trans().flags.set(new_flags);
627 old_flags
628 }
629
hir_literal(&self, lit: &ast::Literal) -> Result<Hir>630 fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
631 let ch = match self.literal_to_char(lit)? {
632 byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)),
633 hir::Literal::Unicode(ch) => ch,
634 };
635 if self.flags().case_insensitive() {
636 self.hir_from_char_case_insensitive(lit.span, ch)
637 } else {
638 self.hir_from_char(lit.span, ch)
639 }
640 }
641
642 /// Convert an Ast literal to its scalar representation.
643 ///
644 /// When Unicode mode is enabled, then this always succeeds and returns a
645 /// `char` (Unicode scalar value).
646 ///
647 /// When Unicode mode is disabled, then a raw byte is returned. If that
648 /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns
649 /// an error.
literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal>650 fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
651 if self.flags().unicode() {
652 return Ok(hir::Literal::Unicode(lit.c));
653 }
654 let byte = match lit.byte() {
655 None => return Ok(hir::Literal::Unicode(lit.c)),
656 Some(byte) => byte,
657 };
658 if byte <= 0x7F {
659 return Ok(hir::Literal::Unicode(byte as char));
660 }
661 if !self.trans().allow_invalid_utf8 {
662 return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
663 }
664 Ok(hir::Literal::Byte(byte))
665 }
666
hir_from_char(&self, span: Span, c: char) -> Result<Hir>667 fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
668 if !self.flags().unicode() && c.len_utf8() > 1 {
669 return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
670 }
671 Ok(Hir::literal(hir::Literal::Unicode(c)))
672 }
673
hir_from_char_case_insensitive( &self, span: Span, c: char, ) -> Result<Hir>674 fn hir_from_char_case_insensitive(
675 &self,
676 span: Span,
677 c: char,
678 ) -> Result<Hir> {
679 if self.flags().unicode() {
680 // If case folding won't do anything, then don't bother trying.
681 let map =
682 unicode::contains_simple_case_mapping(c, c).map_err(|_| {
683 self.error(span, ErrorKind::UnicodeCaseUnavailable)
684 })?;
685 if !map {
686 return self.hir_from_char(span, c);
687 }
688 let mut cls =
689 hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
690 c, c,
691 )]);
692 cls.try_case_fold_simple().map_err(|_| {
693 self.error(span, ErrorKind::UnicodeCaseUnavailable)
694 })?;
695 Ok(Hir::class(hir::Class::Unicode(cls)))
696 } else {
697 if c.len_utf8() > 1 {
698 return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
699 }
700 // If case folding won't do anything, then don't bother trying.
701 match c {
702 'A'..='Z' | 'a'..='z' => {}
703 _ => return self.hir_from_char(span, c),
704 }
705 let mut cls =
706 hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
707 c as u8, c as u8,
708 )]);
709 cls.case_fold_simple();
710 Ok(Hir::class(hir::Class::Bytes(cls)))
711 }
712 }
713
hir_dot(&self, span: Span) -> Result<Hir>714 fn hir_dot(&self, span: Span) -> Result<Hir> {
715 let unicode = self.flags().unicode();
716 if !unicode && !self.trans().allow_invalid_utf8 {
717 return Err(self.error(span, ErrorKind::InvalidUtf8));
718 }
719 Ok(if self.flags().dot_matches_new_line() {
720 Hir::any(!unicode)
721 } else {
722 Hir::dot(!unicode)
723 })
724 }
725
hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir>726 fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
727 let unicode = self.flags().unicode();
728 let multi_line = self.flags().multi_line();
729 Ok(match asst.kind {
730 ast::AssertionKind::StartLine => Hir::anchor(if multi_line {
731 hir::Anchor::StartLine
732 } else {
733 hir::Anchor::StartText
734 }),
735 ast::AssertionKind::EndLine => Hir::anchor(if multi_line {
736 hir::Anchor::EndLine
737 } else {
738 hir::Anchor::EndText
739 }),
740 ast::AssertionKind::StartText => {
741 Hir::anchor(hir::Anchor::StartText)
742 }
743 ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText),
744 ast::AssertionKind::WordBoundary => {
745 Hir::word_boundary(if unicode {
746 hir::WordBoundary::Unicode
747 } else {
748 hir::WordBoundary::Ascii
749 })
750 }
751 ast::AssertionKind::NotWordBoundary => {
752 Hir::word_boundary(if unicode {
753 hir::WordBoundary::UnicodeNegate
754 } else {
755 // It is possible for negated ASCII word boundaries to
756 // match at invalid UTF-8 boundaries, even when searching
757 // valid UTF-8.
758 if !self.trans().allow_invalid_utf8 {
759 return Err(
760 self.error(asst.span, ErrorKind::InvalidUtf8)
761 );
762 }
763 hir::WordBoundary::AsciiNegate
764 })
765 }
766 })
767 }
768
hir_group(&self, group: &ast::Group, expr: Hir) -> Hir769 fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
770 let kind = match group.kind {
771 ast::GroupKind::CaptureIndex(idx) => {
772 hir::GroupKind::CaptureIndex(idx)
773 }
774 ast::GroupKind::CaptureName(ref capname) => {
775 hir::GroupKind::CaptureName {
776 name: capname.name.clone(),
777 index: capname.index,
778 }
779 }
780 ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
781 };
782 Hir::group(hir::Group { kind, hir: Box::new(expr) })
783 }
784
hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir785 fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
786 let kind = match rep.op.kind {
787 ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
788 ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
789 ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
790 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
791 hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m))
792 }
793 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
794 hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m))
795 }
796 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
797 m,
798 n,
799 )) => {
800 hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n))
801 }
802 };
803 let greedy =
804 if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
805 Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) })
806 }
807
hir_unicode_class( &self, ast_class: &ast::ClassUnicode, ) -> Result<hir::ClassUnicode>808 fn hir_unicode_class(
809 &self,
810 ast_class: &ast::ClassUnicode,
811 ) -> Result<hir::ClassUnicode> {
812 use crate::ast::ClassUnicodeKind::*;
813
814 if !self.flags().unicode() {
815 return Err(
816 self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
817 );
818 }
819 let query = match ast_class.kind {
820 OneLetter(name) => ClassQuery::OneLetter(name),
821 Named(ref name) => ClassQuery::Binary(name),
822 NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
823 property_name: name,
824 property_value: value,
825 },
826 };
827 let mut result = self.convert_unicode_class_error(
828 &ast_class.span,
829 unicode::class(query),
830 );
831 if let Ok(ref mut class) = result {
832 self.unicode_fold_and_negate(
833 &ast_class.span,
834 ast_class.negated,
835 class,
836 )?;
837 if class.ranges().is_empty() {
838 let err = self
839 .error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
840 return Err(err);
841 }
842 }
843 result
844 }
845
hir_ascii_unicode_class( &self, ast: &ast::ClassAscii, ) -> Result<hir::ClassUnicode>846 fn hir_ascii_unicode_class(
847 &self,
848 ast: &ast::ClassAscii,
849 ) -> Result<hir::ClassUnicode> {
850 let mut cls = hir::ClassUnicode::new(
851 ascii_class(&ast.kind)
852 .iter()
853 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)),
854 );
855 self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
856 Ok(cls)
857 }
858
hir_ascii_byte_class( &self, ast: &ast::ClassAscii, ) -> Result<hir::ClassBytes>859 fn hir_ascii_byte_class(
860 &self,
861 ast: &ast::ClassAscii,
862 ) -> Result<hir::ClassBytes> {
863 let mut cls = hir::ClassBytes::new(
864 ascii_class(&ast.kind)
865 .iter()
866 .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)),
867 );
868 self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
869 Ok(cls)
870 }
871
hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, ) -> Result<hir::ClassUnicode>872 fn hir_perl_unicode_class(
873 &self,
874 ast_class: &ast::ClassPerl,
875 ) -> Result<hir::ClassUnicode> {
876 use crate::ast::ClassPerlKind::*;
877
878 assert!(self.flags().unicode());
879 let result = match ast_class.kind {
880 Digit => unicode::perl_digit(),
881 Space => unicode::perl_space(),
882 Word => unicode::perl_word(),
883 };
884 let mut class =
885 self.convert_unicode_class_error(&ast_class.span, result)?;
886 // We needn't apply case folding here because the Perl Unicode classes
887 // are already closed under Unicode simple case folding.
888 if ast_class.negated {
889 class.negate();
890 }
891 Ok(class)
892 }
893
hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, ) -> hir::ClassBytes894 fn hir_perl_byte_class(
895 &self,
896 ast_class: &ast::ClassPerl,
897 ) -> hir::ClassBytes {
898 use crate::ast::ClassPerlKind::*;
899
900 assert!(!self.flags().unicode());
901 let mut class = match ast_class.kind {
902 Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
903 Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
904 Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
905 };
906 // We needn't apply case folding here because the Perl ASCII classes
907 // are already closed (under ASCII case folding).
908 if ast_class.negated {
909 class.negate();
910 }
911 class
912 }
913
914 /// Converts the given Unicode specific error to an HIR translation error.
915 ///
916 /// The span given should approximate the position at which an error would
917 /// occur.
convert_unicode_class_error( &self, span: &Span, result: unicode::Result<hir::ClassUnicode>, ) -> Result<hir::ClassUnicode>918 fn convert_unicode_class_error(
919 &self,
920 span: &Span,
921 result: unicode::Result<hir::ClassUnicode>,
922 ) -> Result<hir::ClassUnicode> {
923 result.map_err(|err| {
924 let sp = span.clone();
925 match err {
926 unicode::Error::PropertyNotFound => {
927 self.error(sp, ErrorKind::UnicodePropertyNotFound)
928 }
929 unicode::Error::PropertyValueNotFound => {
930 self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
931 }
932 unicode::Error::PerlClassNotFound => {
933 self.error(sp, ErrorKind::UnicodePerlClassNotFound)
934 }
935 }
936 })
937 }
938
unicode_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassUnicode, ) -> Result<()>939 fn unicode_fold_and_negate(
940 &self,
941 span: &Span,
942 negated: bool,
943 class: &mut hir::ClassUnicode,
944 ) -> Result<()> {
945 // Note that we must apply case folding before negation!
946 // Consider `(?i)[^x]`. If we applied negation field, then
947 // the result would be the character class that matched any
948 // Unicode scalar value.
949 if self.flags().case_insensitive() {
950 class.try_case_fold_simple().map_err(|_| {
951 self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
952 })?;
953 }
954 if negated {
955 class.negate();
956 }
957 Ok(())
958 }
959
bytes_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassBytes, ) -> Result<()>960 fn bytes_fold_and_negate(
961 &self,
962 span: &Span,
963 negated: bool,
964 class: &mut hir::ClassBytes,
965 ) -> Result<()> {
966 // Note that we must apply case folding before negation!
967 // Consider `(?i)[^x]`. If we applied negation first, then
968 // the result would be the character class that matched any
969 // Unicode scalar value.
970 if self.flags().case_insensitive() {
971 class.case_fold_simple();
972 }
973 if negated {
974 class.negate();
975 }
976 if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
977 return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
978 }
979 Ok(())
980 }
981
982 /// Return a scalar byte value suitable for use as a literal in a byte
983 /// character class.
class_literal_byte(&self, ast: &ast::Literal) -> Result<u8>984 fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
985 match self.literal_to_char(ast)? {
986 hir::Literal::Byte(byte) => Ok(byte),
987 hir::Literal::Unicode(ch) => {
988 if ch <= 0x7F as char {
989 Ok(ch as u8)
990 } else {
991 // We can't feasibly support Unicode in
992 // byte oriented classes. Byte classes don't
993 // do Unicode case folding.
994 Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
995 }
996 }
997 }
998 }
999 }
1000
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled. These map to `None`, `Some(false)` and `Some(true)`
/// respectively.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// State of `ast::Flag::CaseInsensitive`; read as `false` when absent.
    case_insensitive: Option<bool>,
    /// State of `ast::Flag::MultiLine`; read as `false` when absent.
    multi_line: Option<bool>,
    /// State of `ast::Flag::DotMatchesNewLine`; read as `false` when absent.
    dot_matches_new_line: Option<bool>,
    /// State of `ast::Flag::SwapGreed`; read as `false` when absent.
    swap_greed: Option<bool>,
    /// State of `ast::Flag::Unicode`; read as `true` when absent.
    unicode: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1016
1017 impl Flags {
from_ast(ast: &ast::Flags) -> Flags1018 fn from_ast(ast: &ast::Flags) -> Flags {
1019 let mut flags = Flags::default();
1020 let mut enable = true;
1021 for item in &ast.items {
1022 match item.kind {
1023 ast::FlagsItemKind::Negation => {
1024 enable = false;
1025 }
1026 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1027 flags.case_insensitive = Some(enable);
1028 }
1029 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1030 flags.multi_line = Some(enable);
1031 }
1032 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1033 flags.dot_matches_new_line = Some(enable);
1034 }
1035 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1036 flags.swap_greed = Some(enable);
1037 }
1038 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1039 flags.unicode = Some(enable);
1040 }
1041 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1042 }
1043 }
1044 flags
1045 }
1046
merge(&mut self, previous: &Flags)1047 fn merge(&mut self, previous: &Flags) {
1048 if self.case_insensitive.is_none() {
1049 self.case_insensitive = previous.case_insensitive;
1050 }
1051 if self.multi_line.is_none() {
1052 self.multi_line = previous.multi_line;
1053 }
1054 if self.dot_matches_new_line.is_none() {
1055 self.dot_matches_new_line = previous.dot_matches_new_line;
1056 }
1057 if self.swap_greed.is_none() {
1058 self.swap_greed = previous.swap_greed;
1059 }
1060 if self.unicode.is_none() {
1061 self.unicode = previous.unicode;
1062 }
1063 }
1064
case_insensitive(&self) -> bool1065 fn case_insensitive(&self) -> bool {
1066 self.case_insensitive.unwrap_or(false)
1067 }
1068
multi_line(&self) -> bool1069 fn multi_line(&self) -> bool {
1070 self.multi_line.unwrap_or(false)
1071 }
1072
dot_matches_new_line(&self) -> bool1073 fn dot_matches_new_line(&self) -> bool {
1074 self.dot_matches_new_line.unwrap_or(false)
1075 }
1076
swap_greed(&self) -> bool1077 fn swap_greed(&self) -> bool {
1078 self.swap_greed.unwrap_or(false)
1079 }
1080
unicode(&self) -> bool1081 fn unicode(&self) -> bool {
1082 self.unicode.unwrap_or(true)
1083 }
1084 }
1085
hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes1086 fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1087 let ranges: Vec<_> = ascii_class(kind)
1088 .iter()
1089 .cloned()
1090 .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8))
1091 .collect();
1092 hir::ClassBytes::new(ranges)
1093 }
1094
ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)]1095 fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
1096 use crate::ast::ClassAsciiKind::*;
1097 match *kind {
1098 Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
1099 Alpha => &[('A', 'Z'), ('a', 'z')],
1100 Ascii => &[('\x00', '\x7F')],
1101 Blank => &[('\t', '\t'), (' ', ' ')],
1102 Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')],
1103 Digit => &[('0', '9')],
1104 Graph => &[('!', '~')],
1105 Lower => &[('a', 'z')],
1106 Print => &[(' ', '~')],
1107 Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
1108 Space => &[
1109 ('\t', '\t'),
1110 ('\n', '\n'),
1111 ('\x0B', '\x0B'),
1112 ('\x0C', '\x0C'),
1113 ('\r', '\r'),
1114 (' ', ' '),
1115 ],
1116 Upper => &[('A', 'Z')],
1117 Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')],
1118 Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')],
1119 }
1120 }
1121
1122 #[cfg(test)]
1123 mod tests {
1124 use crate::ast::parse::ParserBuilder;
1125 use crate::ast::{self, Ast, Position, Span};
1126 use crate::hir::{self, Hir, HirKind};
1127 use crate::unicode::{self, ClassQuery};
1128
1129 use super::{ascii_class, TranslatorBuilder};
1130
// We create these errors to compare with real hir::Errors in the tests.
// We define equality between TestError and hir::Error to disregard the
// pattern string in hir::Error, which is annoying to provide in tests.
#[derive(Clone, Debug)]
struct TestError {
    // Expected location of the error within the pattern.
    span: Span,
    // Expected kind of translation error.
    kind: hir::ErrorKind,
}
1139
1140 impl PartialEq<hir::Error> for TestError {
eq(&self, other: &hir::Error) -> bool1141 fn eq(&self, other: &hir::Error) -> bool {
1142 self.span == other.span && self.kind == other.kind
1143 }
1144 }
1145
1146 impl PartialEq<TestError> for hir::Error {
eq(&self, other: &TestError) -> bool1147 fn eq(&self, other: &TestError) -> bool {
1148 self.span == other.span && self.kind == other.kind
1149 }
1150 }
1151
/// Test helper: parses `pattern` into an `Ast` using a parser built with
/// the `octal` option set to `true`.
///
/// Panics (via `unwrap`) if the pattern fails to parse.
fn parse(pattern: &str) -> Ast {
    ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
}
1155
t(pattern: &str) -> Hir1156 fn t(pattern: &str) -> Hir {
1157 TranslatorBuilder::new()
1158 .allow_invalid_utf8(false)
1159 .build()
1160 .translate(pattern, &parse(pattern))
1161 .unwrap()
1162 }
1163
t_err(pattern: &str) -> hir::Error1164 fn t_err(pattern: &str) -> hir::Error {
1165 TranslatorBuilder::new()
1166 .allow_invalid_utf8(false)
1167 .build()
1168 .translate(pattern, &parse(pattern))
1169 .unwrap_err()
1170 }
1171
t_bytes(pattern: &str) -> Hir1172 fn t_bytes(pattern: &str) -> Hir {
1173 TranslatorBuilder::new()
1174 .allow_invalid_utf8(true)
1175 .build()
1176 .translate(pattern, &parse(pattern))
1177 .unwrap()
1178 }
1179
hir_lit(s: &str) -> Hir1180 fn hir_lit(s: &str) -> Hir {
1181 match s.len() {
1182 0 => Hir::empty(),
1183 _ => {
1184 let lits = s
1185 .chars()
1186 .map(hir::Literal::Unicode)
1187 .map(Hir::literal)
1188 .collect();
1189 Hir::concat(lits)
1190 }
1191 }
1192 }
1193
hir_blit(s: &[u8]) -> Hir1194 fn hir_blit(s: &[u8]) -> Hir {
1195 match s.len() {
1196 0 => Hir::empty(),
1197 1 => Hir::literal(hir::Literal::Byte(s[0])),
1198 _ => {
1199 let lits = s
1200 .iter()
1201 .cloned()
1202 .map(hir::Literal::Byte)
1203 .map(Hir::literal)
1204 .collect();
1205 Hir::concat(lits)
1206 }
1207 }
1208 }
1209
hir_group(i: u32, expr: Hir) -> Hir1210 fn hir_group(i: u32, expr: Hir) -> Hir {
1211 Hir::group(hir::Group {
1212 kind: hir::GroupKind::CaptureIndex(i),
1213 hir: Box::new(expr),
1214 })
1215 }
1216
hir_group_name(i: u32, name: &str, expr: Hir) -> Hir1217 fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
1218 Hir::group(hir::Group {
1219 kind: hir::GroupKind::CaptureName {
1220 name: name.to_string(),
1221 index: i,
1222 },
1223 hir: Box::new(expr),
1224 })
1225 }
1226
hir_group_nocap(expr: Hir) -> Hir1227 fn hir_group_nocap(expr: Hir) -> Hir {
1228 Hir::group(hir::Group {
1229 kind: hir::GroupKind::NonCapturing,
1230 hir: Box::new(expr),
1231 })
1232 }
1233
hir_quest(greedy: bool, expr: Hir) -> Hir1234 fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1235 Hir::repetition(hir::Repetition {
1236 kind: hir::RepetitionKind::ZeroOrOne,
1237 greedy,
1238 hir: Box::new(expr),
1239 })
1240 }
1241
hir_star(greedy: bool, expr: Hir) -> Hir1242 fn hir_star(greedy: bool, expr: Hir) -> Hir {
1243 Hir::repetition(hir::Repetition {
1244 kind: hir::RepetitionKind::ZeroOrMore,
1245 greedy,
1246 hir: Box::new(expr),
1247 })
1248 }
1249
hir_plus(greedy: bool, expr: Hir) -> Hir1250 fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1251 Hir::repetition(hir::Repetition {
1252 kind: hir::RepetitionKind::OneOrMore,
1253 greedy,
1254 hir: Box::new(expr),
1255 })
1256 }
1257
hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir1258 fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
1259 Hir::repetition(hir::Repetition {
1260 kind: hir::RepetitionKind::Range(range),
1261 greedy,
1262 hir: Box::new(expr),
1263 })
1264 }
1265
/// Test helper: shorthand for `Hir::alternation` over the given branches.
fn hir_alt(alts: Vec<Hir>) -> Hir {
    Hir::alternation(alts)
}
1269
/// Test helper: shorthand for `Hir::concat` over the given expressions.
fn hir_cat(exprs: Vec<Hir>) -> Hir {
    Hir::concat(exprs)
}
1273
1274 #[allow(dead_code)]
hir_uclass_query(query: ClassQuery<'_>) -> Hir1275 fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1276 Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1277 }
1278
1279 #[allow(dead_code)]
hir_uclass_perl_word() -> Hir1280 fn hir_uclass_perl_word() -> Hir {
1281 Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1282 }
1283
hir_uclass(ranges: &[(char, char)]) -> Hir1284 fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1285 let ranges: Vec<hir::ClassUnicodeRange> = ranges
1286 .iter()
1287 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1288 .collect();
1289 Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
1290 }
1291
hir_bclass(ranges: &[(u8, u8)]) -> Hir1292 fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1293 let ranges: Vec<hir::ClassBytesRange> = ranges
1294 .iter()
1295 .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1296 .collect();
1297 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1298 }
1299
hir_bclass_from_char(ranges: &[(char, char)]) -> Hir1300 fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
1301 let ranges: Vec<hir::ClassBytesRange> = ranges
1302 .iter()
1303 .map(|&(s, e)| {
1304 assert!(s as u32 <= 0x7F);
1305 assert!(e as u32 <= 0x7F);
1306 hir::ClassBytesRange::new(s as u8, e as u8)
1307 })
1308 .collect();
1309 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1310 }
1311
hir_case_fold(expr: Hir) -> Hir1312 fn hir_case_fold(expr: Hir) -> Hir {
1313 match expr.into_kind() {
1314 HirKind::Class(mut cls) => {
1315 cls.case_fold_simple();
1316 Hir::class(cls)
1317 }
1318 _ => panic!("cannot case fold non-class Hir expr"),
1319 }
1320 }
1321
hir_negate(expr: Hir) -> Hir1322 fn hir_negate(expr: Hir) -> Hir {
1323 match expr.into_kind() {
1324 HirKind::Class(mut cls) => {
1325 cls.negate();
1326 Hir::class(cls)
1327 }
1328 _ => panic!("cannot negate non-class Hir expr"),
1329 }
1330 }
1331
1332 #[allow(dead_code)]
hir_union(expr1: Hir, expr2: Hir) -> Hir1333 fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1334 use crate::hir::Class::{Bytes, Unicode};
1335
1336 match (expr1.into_kind(), expr2.into_kind()) {
1337 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1338 c1.union(&c2);
1339 Hir::class(hir::Class::Unicode(c1))
1340 }
1341 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1342 c1.union(&c2);
1343 Hir::class(hir::Class::Bytes(c1))
1344 }
1345 _ => panic!("cannot union non-class Hir exprs"),
1346 }
1347 }
1348
1349 #[allow(dead_code)]
hir_difference(expr1: Hir, expr2: Hir) -> Hir1350 fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1351 use crate::hir::Class::{Bytes, Unicode};
1352
1353 match (expr1.into_kind(), expr2.into_kind()) {
1354 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1355 c1.difference(&c2);
1356 Hir::class(hir::Class::Unicode(c1))
1357 }
1358 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1359 c1.difference(&c2);
1360 Hir::class(hir::Class::Bytes(c1))
1361 }
1362 _ => panic!("cannot difference non-class Hir exprs"),
1363 }
1364 }
1365
/// Test helper: shorthand for `Hir::anchor`.
fn hir_anchor(anchor: hir::Anchor) -> Hir {
    Hir::anchor(anchor)
}
1369
/// Test helper: shorthand for `Hir::word_boundary`.
fn hir_word(wb: hir::WordBoundary) -> Hir {
    Hir::word_boundary(wb)
}
1373
/// Patterns made only of empty sub-expressions translate to `Hir::empty()`
/// in the corresponding positions.
#[test]
fn empty() {
    let cases = vec![
        ("", Hir::empty()),
        ("(?i)", Hir::empty()),
        ("()", hir_group(1, Hir::empty())),
        ("(?:)", hir_group_nocap(Hir::empty())),
        ("(?P<wat>)", hir_group_name(1, "wat", Hir::empty())),
        ("|", hir_alt(vec![Hir::empty(), Hir::empty()])),
        (
            "()|()",
            hir_alt(vec![
                hir_group(1, Hir::empty()),
                hir_group(2, Hir::empty()),
            ]),
        ),
        ("(|b)", hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b")]))),
        ("(a|)", hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty()]))),
        (
            "(a||c)",
            hir_group(
                1,
                hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c")]),
            ),
        ),
        (
            "(||)",
            hir_group(
                1,
                hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty()]),
            ),
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
1412
/// Literal translation in Unicode mode, in byte mode, and the errors
/// raised when a literal is not permitted.
#[test]
fn literal() {
    // Every pattern here is a single line, so positions only vary in
    // their (offset, column) pair.
    let span = |start: (usize, usize), end: (usize, usize)| {
        Span::new(
            Position::new(start.0, 1, start.1),
            Position::new(end.0, 1, end.1),
        )
    };

    let unicode_cases = vec![
        ("a", "a"),
        ("(?-u)a", "a"),
        ("☃", "☃"),
        ("abcd", "abcd"),
    ];
    for (pattern, expected) in unicode_cases {
        assert_eq!(t(pattern), hir_lit(expected));
    }

    for pattern in vec!["(?-u)a", "(?-u)\x61", r"(?-u)\x61"] {
        assert_eq!(t_bytes(pattern), hir_lit("a"));
    }
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    let snowman_err = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: span((5, 6), (8, 7)),
    };
    assert_eq!(t_err("(?-u)☃"), snowman_err);

    let invalid_utf8_err = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: span((5, 6), (9, 10)),
    };
    assert_eq!(t_err(r"(?-u)\xFF"), invalid_utf8_err);
}
1446
/// Case insensitive literals: in Unicode mode, in byte mode (`-u`), and
/// the error for a non-ASCII literal when Unicode mode is disabled.
#[test]
fn literal_case_insensitive() {
    // Unicode mode. These assertions are only compiled when the
    // `unicode-case` feature is enabled.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i:a)"),
        hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_uclass(&[('B', 'B'), ('b', 'b')]),
            hir_lit("@"),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );

    // Unicode mode disabled: folded literals become byte classes.
    assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("a"),
        ])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
            hir_lit("@"),
            hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
        ])
    );

    // Same patterns through the translator that allows invalid UTF-8.
    assert_eq!(
        t_bytes("(?i-u)a"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes("(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes(r"(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    // A non-ASCII literal is rejected when Unicode mode is disabled.
    assert_eq!(
        t_err("(?i-u)β"),
        TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 8),
            ),
        }
    );
}
1526
/// Translation of `.` with and without the `s` flag, in Unicode and byte
/// modes.
#[test]
fn dot() {
    let unicode_no_newline =
        hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]);
    assert_eq!(t("."), unicode_no_newline);

    let unicode_any = hir_uclass(&[('\0', '\u{10FFFF}')]);
    assert_eq!(t("(?s)."), unicode_any);

    let bytes_no_newline =
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]);
    assert_eq!(t_bytes("(?-u)."), bytes_no_newline);

    let bytes_any = hir_bclass(&[(b'\0', b'\xFF')]);
    assert_eq!(t_bytes("(?s-u)."), bytes_any);

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    let plain_err = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(6, 1, 7)),
    };
    assert_eq!(t_err("(?-u)."), plain_err);

    let dotall_err = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(6, 1, 7), Position::new(7, 1, 8)),
    };
    assert_eq!(t_err("(?s-u)."), dotall_err);
}
1562
/// Translation of anchors and word boundaries, including the error for a
/// negated ASCII word boundary when invalid UTF-8 is disallowed.
#[test]
fn assertions() {
    let anchors = vec![
        ("^", hir::Anchor::StartText),
        ("$", hir::Anchor::EndText),
        (r"\A", hir::Anchor::StartText),
        (r"\z", hir::Anchor::EndText),
        ("(?m)^", hir::Anchor::StartLine),
        ("(?m)$", hir::Anchor::EndLine),
        (r"(?m)\A", hir::Anchor::StartText),
        (r"(?m)\z", hir::Anchor::EndText),
    ];
    for (pattern, anchor) in anchors {
        assert_eq!(t(pattern), hir_anchor(anchor));
    }

    let boundaries = vec![
        (r"\b", hir::WordBoundary::Unicode),
        (r"\B", hir::WordBoundary::UnicodeNegate),
        (r"(?-u)\b", hir::WordBoundary::Ascii),
    ];
    for (pattern, boundary) in boundaries {
        assert_eq!(t(pattern), hir_word(boundary));
    }
    assert_eq!(
        t_bytes(r"(?-u)\B"),
        hir_word(hir::WordBoundary::AsciiNegate)
    );

    let negated_ascii_err = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
    };
    assert_eq!(t_err(r"(?-u)\B"), negated_ascii_err);
}
1593
/// Translation of capturing, named and non-capturing groups, including
/// how capture indices are assigned.
#[test]
fn group() {
    let cases = vec![
        ("(a)", hir_group(1, hir_lit("a"))),
        (
            "(a)(b)",
            hir_cat(vec![
                hir_group(1, hir_lit("a")),
                hir_group(2, hir_lit("b")),
            ]),
        ),
        (
            "(a)|(b)",
            hir_alt(vec![
                hir_group(1, hir_lit("a")),
                hir_group(2, hir_lit("b")),
            ]),
        ),
        ("(?P<foo>)", hir_group_name(1, "foo", Hir::empty())),
        ("(?P<foo>a)", hir_group_name(1, "foo", hir_lit("a"))),
        (
            "(?P<foo>a)(?P<bar>b)",
            hir_cat(vec![
                hir_group_name(1, "foo", hir_lit("a")),
                hir_group_name(2, "bar", hir_lit("b")),
            ]),
        ),
        ("(?:)", hir_group_nocap(Hir::empty())),
        ("(?:a)", hir_group_nocap(hir_lit("a"))),
        (
            "(?:a)(b)",
            hir_cat(vec![
                hir_group_nocap(hir_lit("a")),
                hir_group(1, hir_lit("b")),
            ]),
        ),
        (
            "(a)(?:b)(c)",
            hir_cat(vec![
                hir_group(1, hir_lit("a")),
                hir_group_nocap(hir_lit("b")),
                hir_group(2, hir_lit("c")),
            ]),
        ),
        (
            "(a)(?P<foo>b)(c)",
            hir_cat(vec![
                hir_group(1, hir_lit("a")),
                hir_group_name(2, "foo", hir_lit("b")),
                hir_group(3, hir_lit("c")),
            ]),
        ),
        ("()", hir_group(1, Hir::empty())),
        ("((?i))", hir_group(1, Hir::empty())),
        ("((?x))", hir_group(1, Hir::empty())),
        ("(((?x)))", hir_group(1, hir_group(2, Hir::empty()))),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
1650
/// Flag scoping: flags set inside a group stay inside it, flags set
/// inline apply to what follows, and later flag items override earlier
/// ones.
#[test]
fn flags() {
    // Assertions marked with `unicode-case` are only compiled when that
    // feature is enabled.
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i:a)a"),
        hir_cat(vec![
            hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
            hir_lit("a"),
        ])
    );
    assert_eq!(
        t("(?i-u:a)β"),
        hir_cat(vec![
            hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
            hir_lit("β"),
        ])
    );
    assert_eq!(
        t("(?:(?i-u)a)b"),
        hir_cat(vec![
            hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
            hir_lit("b"),
        ])
    );
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![
            hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
            hir_lit("b"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?-i:a)a"),
        hir_cat(vec![
            hir_group_nocap(hir_lit("a")),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_anchor(hir::Anchor::StartLine),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_anchor(hir::Anchor::StartLine),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_anchor(hir::Anchor::StartText),
        ])
    );
    // With `U` set, `*` is non-greedy and `*?` is greedy; `-U` restores
    // the usual meaning.
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(false, hir_lit("a")),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![
            hir_group_nocap(hir_cat(vec![
                hir_lit("a"),
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
            ])),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![
            hir_group_nocap(hir_cat(vec![
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
                hir_lit("a"),
            ])),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
        ])
    );
}
1740
/// Escaped meta characters translate to their literal characters.
#[test]
fn escape() {
    let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
    let expected = hir_lit(r"\.+*?()|[]{}^$#");
    assert_eq!(t(pattern), expected);
}
1748
/// Translation of the repetition operators, greedy and non-greedy, and
/// how they bind relative to concatenation, groups and alternation.
#[test]
fn repetition() {
    use crate::hir::RepetitionRange::{AtLeast, Bounded, Exactly};

    let cases = vec![
        ("a?", hir_quest(true, hir_lit("a"))),
        ("a*", hir_star(true, hir_lit("a"))),
        ("a+", hir_plus(true, hir_lit("a"))),
        ("a??", hir_quest(false, hir_lit("a"))),
        ("a*?", hir_star(false, hir_lit("a"))),
        ("a+?", hir_plus(false, hir_lit("a"))),
        ("a{1}", hir_range(true, Exactly(1), hir_lit("a"))),
        ("a{1,}", hir_range(true, AtLeast(1), hir_lit("a"))),
        ("a{1,2}", hir_range(true, Bounded(1, 2), hir_lit("a"))),
        ("a{1}?", hir_range(false, Exactly(1), hir_lit("a"))),
        ("a{1,}?", hir_range(false, AtLeast(1), hir_lit("a"))),
        ("a{1,2}?", hir_range(false, Bounded(1, 2), hir_lit("a"))),
        ("ab?", hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b"))])),
        (
            "(ab)?",
            hir_quest(
                true,
                hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b")])),
            ),
        ),
        ("a|b?", hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b"))])),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
1803
/// Translation of concatenations and alternations, bare and nested
/// inside capturing groups.
#[test]
fn cat_alt() {
    let cases = vec![
        ("(ab)", hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b")]))),
        ("a|b", hir_alt(vec![hir_lit("a"), hir_lit("b")])),
        ("a|b|c", hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c")])),
        (
            "ab|bc|cd",
            hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd")]),
        ),
        ("(a|b)", hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b")]))),
        (
            "(a|b|c)",
            hir_group(
                1,
                hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c")]),
            ),
        ),
        (
            "(ab|bc|cd)",
            hir_group(
                1,
                hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd")]),
            ),
        ),
        (
            "(ab|(bc|(cd)))",
            hir_group(
                1,
                hir_alt(vec![
                    hir_lit("ab"),
                    hir_group(
                        2,
                        hir_alt(vec![
                            hir_lit("bc"),
                            hir_group(3, hir_lit("cd")),
                        ]),
                    ),
                ]),
            ),
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(t(pattern), expected);
    }
}
1854
/// Translation of the bracketed ASCII classes (`[[:alnum:]]` and
/// friends), plus negation, case folding and byte-mode behavior using
/// `[[:lower:]]`.
#[test]
fn class_ascii() {
    use crate::ast::ClassAsciiKind as Kind;

    // In Unicode mode every named class is a Unicode class holding the
    // ranges from `ascii_class`.
    let named = vec![
        ("[[:alnum:]]", Kind::Alnum),
        ("[[:alpha:]]", Kind::Alpha),
        ("[[:ascii:]]", Kind::Ascii),
        ("[[:blank:]]", Kind::Blank),
        ("[[:cntrl:]]", Kind::Cntrl),
        ("[[:digit:]]", Kind::Digit),
        ("[[:graph:]]", Kind::Graph),
        ("[[:lower:]]", Kind::Lower),
        ("[[:print:]]", Kind::Print),
        ("[[:punct:]]", Kind::Punct),
        ("[[:space:]]", Kind::Space),
        ("[[:upper:]]", Kind::Upper),
        ("[[:word:]]", Kind::Word),
        ("[[:xdigit:]]", Kind::Xdigit),
    ];
    for (pattern, kind) in named {
        assert_eq!(t(pattern), hir_uclass(ascii_class(&kind)));
    }

    let lower = ascii_class(&Kind::Lower);

    assert_eq!(t("[[:^lower:]]"), hir_negate(hir_uclass(lower)));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ])
    );

    assert_eq!(t("(?-u)[[:lower:]]"), hir_bclass_from_char(lower));
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_bclass_from_char(lower))
    );

    let negated_err = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(6, 1, 7), Position::new(16, 1, 17)),
    };
    assert_eq!(t_err("(?-u)[[:^lower:]]"), negated_err);

    let negated_folded_err = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(7, 1, 8), Position::new(17, 1, 18)),
    };
    assert_eq!(t_err("(?i-u)[[:^lower:]]"), negated_folded_err);
}
1961
/// Several ASCII classes inside one bracketed class are unioned, in both
/// Unicode and byte modes.
#[test]
fn class_ascii_multiple() {
    // See: https://github.com/rust-lang/regex/issues/680
    let alnum = ascii_class(&ast::ClassAsciiKind::Alnum);

    let unicode_expected =
        hir_union(hir_uclass(alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]));
    assert_eq!(t("[[:alnum:][:^ascii:]]"), unicode_expected);

    let bytes_expected =
        hir_union(hir_bclass_from_char(alnum), hir_bclass(&[(0x80, 0xFF)]));
    assert_eq!(t_bytes("(?-u)[[:alnum:][:^ascii:]]"), bytes_expected);
}
1980
/// Translation of the Perl classes (`\d`, `\s`, `\w`) and their negated
/// forms, in Unicode mode and in ASCII-only (`-u`) mode, with and without
/// the case insensitive flag.
///
/// Only compiled when the `unicode-perl` feature is enabled.
#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl() {
    // Unicode
    assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
    assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
    assert_eq!(t(r"\w"), hir_uclass_perl_word());
    // The expected classes are the same with `(?i)` as without it.
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\d"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\s"),
        hir_uclass_query(ClassQuery::Binary("space"))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());

    // Unicode, negated
    assert_eq!(
        t(r"\D"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    assert_eq!(
        t(r"\S"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
    );
    assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\D"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\S"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));

    // ASCII only
    assert_eq!(
        t(r"(?-u)\d"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t(r"(?-u)\s"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
    );
    assert_eq!(
        t(r"(?-u)\w"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
    );
    assert_eq!(
        t(r"(?i-u)\d"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t(r"(?i-u)\s"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
    );
    assert_eq!(
        t(r"(?i-u)\w"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
    );

    // ASCII only, negated
    assert_eq!(
        t(r"(?-u)\D"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Digit
        )))
    );
    assert_eq!(
        t(r"(?-u)\S"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Space
        )))
    );
    assert_eq!(
        t(r"(?-u)\W"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Word
        )))
    );
    assert_eq!(
        t(r"(?i-u)\D"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Digit
        )))
    );
    assert_eq!(
        t(r"(?i-u)\S"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Space
        )))
    );
    assert_eq!(
        t(r"(?i-u)\W"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Word
        )))
    );
}
2088
/// Without the `unicode-perl` feature, `\w` in Unicode mode is rejected
/// with `UnicodePerlClassNotFound`.
#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3)),
    };
    assert_eq!(t_err(r"\w"), expected);
}
2103
/// Without both the `unicode-perl` and `unicode-bool` features, `\s` in
/// Unicode mode is rejected with `UnicodePerlClassNotFound`.
#[test]
#[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
fn class_perl_space_disabled() {
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3)),
    };
    assert_eq!(t_err(r"\s"), expected);
}
2118
// With both `unicode-perl` and `unicode-gencat` compiled out, `\d` is
// expected to be rejected with `UnicodePerlClassNotFound`.
#[test]
#[cfg(all(
    not(feature = "unicode-perl"),
    not(feature = "unicode-gencat")
))]
fn class_perl_digit_disabled() {
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3)),
    };
    assert_eq!(t_err(r"\d"), expected);
}
2136
// Translation of Unicode general-category classes (`\pZ`, `\p{...}`), their
// negations, and the errors expected for disallowed or unknown properties.
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_gencat() {
    // Builds a span on line 1 between two positions; every expected span in
    // this test has a column equal to its first argument plus one.
    let span = |start: usize, end: usize| {
        Span::new(
            Position::new(start, 1, start + 1),
            Position::new(end, 1, end + 1),
        )
    };
    let assert_err =
        |pattern: &str, kind: hir::ErrorKind, start: usize, end: usize| {
            assert_eq!(
                t_err(pattern),
                TestError { kind, span: span(start, end) }
            );
        };

    // Every spelling of the separator category yields the same class.
    let separator_spellings = [
        r"\pZ",
        r"\pz",
        r"\p{Separator}",
        r"\p{se PaRa ToR}",
        r"\p{gc:Separator}",
        r"\p{gc=Separator}",
    ];
    for &pattern in separator_spellings.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass_query(ClassQuery::Binary("Z")),
            "pattern: {:?}",
            pattern
        );
    }
    for &pattern in [r"\p{Other}", r"\pC"].iter() {
        assert_eq!(
            t(pattern),
            hir_uclass_query(ClassQuery::Binary("Other")),
            "pattern: {:?}",
            pattern
        );
    }

    // Negated forms.
    // NOTE(review): `\P{gc!=separator}` stacks `\P` on top of `!=`, yet the
    // expected class is still the negated one -- confirm this is intended.
    for &pattern in [r"\PZ", r"\P{separator}", r"\P{gc!=separator}"].iter() {
        assert_eq!(
            t(pattern),
            hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))),
            "pattern: {:?}",
            pattern
        );
    }

    // `any`, `assigned` and `ascii`, with and without the `gc:` prefix.
    let special = [
        (r"\p{any}", "Any"),
        (r"\p{assigned}", "Assigned"),
        (r"\p{ascii}", "ASCII"),
        (r"\p{gc:any}", "Any"),
        (r"\p{gc:assigned}", "Assigned"),
        (r"\p{gc:ascii}", "ASCII"),
    ];
    for &(pattern, name) in special.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass_query(ClassQuery::Binary(name)),
            "pattern: {:?}",
            pattern
        );
    }

    // Error cases.
    assert_err(r"(?-u)\pZ", hir::ErrorKind::UnicodeNotAllowed, 5, 8);
    assert_err(
        r"(?-u)\p{Separator}",
        hir::ErrorKind::UnicodeNotAllowed,
        5,
        18,
    );
    assert_err(r"\pE", hir::ErrorKind::UnicodePropertyNotFound, 0, 3);
    assert_err(r"\p{Foo}", hir::ErrorKind::UnicodePropertyNotFound, 0, 7);
    assert_err(
        r"\p{gc:Foo}",
        hir::ErrorKind::UnicodePropertyValueNotFound,
        0,
        10,
    );
}
2250
// With `unicode-gencat` compiled out, general-category queries are expected
// to fail with `UnicodePropertyNotFound`.
#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
    let not_found = |pattern: &str, end: usize| {
        let expected = TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(end, 1, end + 1),
            ),
        };
        assert_eq!(t_err(pattern), expected);
    };

    not_found(r"\p{Separator}", 13);
    not_found(r"\p{Any}", 7);
}
2276
// Translation of script classes, plus the error expected for unknown
// `sc`/`scx` values.
#[test]
#[cfg(feature = "unicode-script")]
fn class_unicode_script() {
    let greek = || hir_uclass_query(ClassQuery::Binary("Greek"));
    let value_not_found = |pattern: &str, end: usize| {
        let expected = TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(end, 1, end + 1),
            ),
        };
        assert_eq!(t_err(pattern), expected);
    };

    assert_eq!(t(r"\p{Greek}"), greek());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\p{Greek}"), hir_case_fold(greek()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\P{Greek}"), hir_negate(hir_case_fold(greek())));

    value_not_found(r"\p{sc:Foo}", 10);
    value_not_found(r"\p{scx:Foo}", 11);
}
2318
// With `unicode-script` compiled out, script queries are expected to fail
// with `UnicodePropertyNotFound`.
#[test]
#[cfg(not(feature = "unicode-script"))]
fn class_unicode_script_disabled() {
    let not_found = |pattern: &str, end: usize| {
        let expected = TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(end, 1, end + 1),
            ),
        };
        assert_eq!(t_err(pattern), expected);
    };

    not_found(r"\p{Greek}", 9);
    not_found(r"\p{scx:Greek}", 13);
}
2344
// An unknown `age` value is expected to fail with
// `UnicodePropertyValueNotFound`.
#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyValueNotFound,
        span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)),
    };
    assert_eq!(t_err(r"\p{age:Foo}"), expected);
}
2359
// `\P{any}` is expected to be rejected with `EmptyClassNotAllowed`.
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
    let expected = TestError {
        kind: hir::ErrorKind::EmptyClassNotAllowed,
        span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)),
    };
    assert_eq!(t_err(r"\P{any}"), expected);
}
2374
// With `unicode-age` compiled out, an `age` query is expected to fail with
// `UnicodePropertyNotFound`.
#[test]
#[cfg(not(feature = "unicode-age"))]
fn class_unicode_age_disabled() {
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyNotFound,
        span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)),
    };
    assert_eq!(t_err(r"\p{age:3.0}"), expected);
}
2389
// Translation of bracketed classes: plain ranges, negation, byte classes,
// case folding, escaped punctuation, and the expected error cases.
#[test]
fn class_bracketed() {
    // Builds a span on line 1 between two positions; every expected span in
    // this test has a column equal to its first argument plus one.
    let span = |start: usize, end: usize| {
        Span::new(
            Position::new(start, 1, start + 1),
            Position::new(end, 1, end + 1),
        )
    };

    // Classes that translate to exactly one Unicode range. The second half
    // of the table covers the "weird" spellings of `[`, `&`, `~` and `-`.
    let single_range = [
        ("[a]", 'a', 'a'),
        ("[a-z]", 'a', 'z'),
        ("[a-fd-h]", 'a', 'h'),
        ("[a-fg-m]", 'a', 'm'),
        (r"[\x00]", '\0', '\0'),
        (r"[\n]", '\n', '\n'),
        ("[\n]", '\n', '\n'),
        (r"[\[]", '[', '['),
        (r"[&]", '&', '&'),
        (r"[\&]", '&', '&'),
        (r"[\&\&]", '&', '&'),
        (r"[\x00-&]", '\0', '&'),
        (r"[&-\xFF]", '&', '\u{FF}'),
        (r"[~]", '~', '~'),
        (r"[\~]", '~', '~'),
        (r"[\~\~]", '~', '~'),
        (r"[\x00-~]", '\0', '~'),
        (r"[~-\xFF]", '~', '\u{FF}'),
        (r"[-]", '-', '-'),
        (r"[\-]", '-', '-'),
        (r"[\-\-]", '-', '-'),
        (r"[\x00-\-]", '\0', '-'),
        (r"[\--\xFF]", '-', '\u{FF}'),
    ];
    for &(pattern, lo, hi) in single_range.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass(&[(lo, hi)]),
            "pattern: {:?}",
            pattern
        );
    }

    // Negations of a single-character Unicode class.
    let negated_single = [("[^[a]]", 'a'), ("[^a]", 'a'), (r"[^\x00]", '\0')];
    for &(pattern, ch) in negated_single.iter() {
        assert_eq!(
            t(pattern),
            hir_negate(hir_uclass(&[(ch, ch)])),
            "pattern: {:?}",
            pattern
        );
    }

    // Perl digit class inside brackets.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    {
        let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
        assert_eq!(t(r"[\d]"), digit());
        assert_eq!(t(r"[^\D]"), digit());
        assert_eq!(t(r"[^\d]"), hir_negate(digit()));
        #[cfg(feature = "unicode-case")]
        assert_eq!(t(r"(?i)[^\D]"), digit());
    }

    // General category inside brackets.
    #[cfg(feature = "unicode-gencat")]
    {
        let separator = || hir_uclass_query(ClassQuery::Binary("separator"));
        let positive =
            [r"[\pZ]", r"[\p{separator}]", r"[^\PZ]", r"[^\P{separator}]"];
        for &pattern in positive.iter() {
            assert_eq!(t(pattern), separator(), "pattern: {:?}", pattern);
        }
        for &pattern in [r"[^\pZ]", r"[^\p{separator}]"].iter() {
            assert_eq!(
                t(pattern),
                hir_negate(separator()),
                "pattern: {:?}",
                pattern
            );
        }
    }

    // Case-insensitive script classes inside brackets.
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    {
        let folded_greek =
            || hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")));
        assert_eq!(t(r"(?i)[^\P{greek}]"), folded_greek());
        assert_eq!(t(r"(?i)[^\p{greek}]"), hir_negate(folded_greek()));
        assert_eq!(t(r"(?i)[\P{greek}]"), hir_negate(folded_greek()));
    }

    // Byte-oriented classes.
    assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
    assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
    assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
    assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k')]));
    assert_eq!(
        t_bytes("(?-u)[^a]"),
        hir_negate(hir_bclass(&[(b'a', b'a')]))
    );

    // Unicode case folding of single characters.
    #[cfg(feature = "unicode-case")]
    {
        assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
        assert_eq!(
            t("(?i)[k]"),
            hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')])
        );
        assert_eq!(
            t("(?i)[β]"),
            hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
        );
    }

    // Error cases.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    {
        assert_eq!(
            t_err(r"[^\s\S]"),
            TestError {
                kind: hir::ErrorKind::EmptyClassNotAllowed,
                span: span(0, 7),
            }
        );
        assert_eq!(
            t_err(r"(?-u)[^\s\S]"),
            TestError {
                kind: hir::ErrorKind::EmptyClassNotAllowed,
                span: span(5, 12),
            }
        );
    }
    assert_eq!(
        t_err("(?-u)[^a]"),
        TestError { kind: hir::ErrorKind::InvalidUtf8, span: span(5, 9) }
    );
}
2546
// Unions of items inside a bracketed class, including nested brackets,
// negation and case folding.
#[test]
fn class_bracketed_union() {
    #[cfg(feature = "unicode-gencat")]
    {
        let expected = hir_union(
            hir_uclass(&[('a', 'b')]),
            hir_uclass_query(ClassQuery::Binary("separator")),
        );
        assert_eq!(t(r"[a\pZb]"), expected);
    }

    #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
    {
        let expected = hir_union(
            hir_uclass_query(ClassQuery::Binary("greek")),
            hir_uclass_query(ClassQuery::Binary("separator")),
        );
        assert_eq!(t(r"[\pZ\p{Greek}]"), expected);
    }

    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    {
        // Building blocks shared by the expectations below.
        let age_3_0 = || {
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            })
        };
        let greek_or_separator = || {
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator")),
            )
        };
        let age_greek_separator =
            || hir_union(age_3_0(), greek_or_separator());

        assert_eq!(t(r"[\p{age:3.0}\pZ\p{Greek}]"), age_greek_separator());
        assert_eq!(
            t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
            hir_union(
                age_3_0(),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("cyrillic")),
                    greek_or_separator()
                )
            )
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
            hir_case_fold(age_greek_separator())
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_case_fold(age_greek_separator()))
        );
        assert_eq!(
            t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(age_greek_separator())
        );
    }

    // Needs no Unicode tables, so it runs in every feature configuration.
    assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
}
2663
// Nested bracketed classes combined with negation, with and without case
// folding.
#[test]
fn class_bracketed_nested() {
    // Builds a span on line 1 between two positions; every expected span in
    // this test has a column equal to its first argument plus one.
    let span = |start: usize, end: usize| {
        Span::new(
            Position::new(start, 1, start + 1),
            Position::new(end, 1, end + 1),
        )
    };
    let just_c = || hir_uclass(&[('c', 'c')]);

    for &pattern in [r"[a[^c]]", r"[a-b[^c]]"].iter() {
        assert_eq!(t(pattern), hir_negate(just_c()), "pattern: {:?}", pattern);
    }
    assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));

    for &pattern in [r"[^a[^c]]", r"[^a-b[^c]]"].iter() {
        assert_eq!(t(pattern), just_c(), "pattern: {:?}", pattern);
    }

    #[cfg(feature = "unicode-case")]
    {
        for &pattern in [r"(?i)[a[^c]]", r"(?i)[a-b[^c]]"].iter() {
            assert_eq!(
                t(pattern),
                hir_negate(hir_case_fold(just_c())),
                "pattern: {:?}",
                pattern
            );
        }
        for &pattern in [r"(?i)[^a[^c]]", r"(?i)[^a-b[^c]]"].iter() {
            assert_eq!(
                t(pattern),
                hir_uclass(&[('C', 'C'), ('c', 'c')]),
                "pattern: {:?}",
                pattern
            );
        }
        assert_eq!(
            t_err(r"(?i)[^a-c[^c]]"),
            TestError {
                kind: hir::ErrorKind::EmptyClassNotAllowed,
                span: span(4, 14),
            }
        );
    }

    assert_eq!(
        t_err(r"[^a-c[^c]]"),
        TestError {
            kind: hir::ErrorKind::EmptyClassNotAllowed,
            span: span(0, 10),
        }
    );
}
2714
// The `&&` intersection operator in Unicode, byte and case-insensitive
// modes, plus escaping rules and precedence.
#[test]
fn class_bracketed_intersect() {
    // Each class is tried bare and with the `(?-u)`, `(?i)` and `(?i-u)`
    // prefixes; the expected result is always the single range `lo..=hi`.
    let basic = [
        ("[abc&&b-c]", 'b', 'c'),
        ("[abc&&[b-c]]", 'b', 'c'),
        ("[[abc]&&[b-c]]", 'b', 'c'),
        ("[a-z&&b-y&&c-x]", 'c', 'x'),
        ("[c-da-b&&a-d]", 'a', 'd'),
        ("[a-d&&c-da-b]", 'a', 'd'),
    ];
    for &(class, lo, hi) in basic.iter() {
        // All bounds in the table are ASCII, so the casts are lossless.
        let (byte_lo, byte_hi) = (lo as u8, hi as u8);

        assert_eq!(t(class), hir_uclass(&[(lo, hi)]), "class: {:?}", class);
        assert_eq!(
            t(&format!("(?-u){}", class)),
            hir_bclass(&[(byte_lo, byte_hi)]),
            "class: {:?}",
            class
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(&format!("(?i){}", class)),
            hir_case_fold(hir_uclass(&[(lo, hi)])),
            "class: {:?}",
            class
        );
        assert_eq!(
            t(&format!("(?i-u){}", class)),
            hir_case_fold(hir_bclass(&[(byte_lo, byte_hi)])),
            "class: {:?}",
            class
        );
    }

    for &pattern in [r"[a-z&&a-c]", r"[[a-z&&a-c]]"].iter() {
        assert_eq!(
            t(pattern),
            hir_uclass(&[('a', 'c')]),
            "pattern: {:?}",
            pattern
        );
    }
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

    let punctuation = [
        // In `[a^]`, `^` does not need to be escaped, so it makes sense that
        // `^` is also allowed to be unescaped after `&&`.
        (r"[\^&&^]", '^'),
        // `]` needs to be escaped after `&&` since it's not at start of
        // class.
        (r"[]&&\]]", ']'),
        (r"[-&&-]", '-'),
        (r"[\&&&&]", '&'),
        (r"[\&&&\&]", '&'),
    ];
    for &(pattern, ch) in punctuation.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass(&[(ch, ch)]),
            "pattern: {:?}",
            pattern
        );
    }

    // Test precedence.
    assert_eq!(
        t(r"[a-w&&[^c-g]z]"),
        hir_uclass(&[('a', 'b'), ('h', 'w')])
    );
}
2804
// Intersections combined with negation, in Unicode mode and in byte
// (`(?-u)`) mode.
#[test]
fn class_bracketed_intersect_negate() {
    // Unicode mode: `\w` and `\d` are gated on the `unicode-perl` feature.
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^\w&&\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^[\w&&\d]]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^[^\w&&\d]]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(feature = "unicode-perl")]
    assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));

    // Byte mode: the expected values are built from the ASCII class tables,
    // so none of these assertions is feature-gated. This includes the first
    // one, which was previously gated on `unicode-perl` unlike its siblings.
    assert_eq!(
        t_bytes(r"(?-u)[^\w&&\d]"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Digit
        )))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[a-z&&a-c]]"),
        hir_negate(hir_bclass(&[(b'a', b'c')]))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[\w&&\d]]"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Digit
        )))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[^\w&&\d]]"),
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
        hir_negate(hir_bclass_from_char(ascii_class(
            &ast::ClassAsciiKind::Word
        )))
    );
}
2854
// The `--` difference operator in Unicode and byte modes.
#[test]
fn class_bracketed_difference() {
    #[cfg(feature = "unicode-gencat")]
    {
        let letters = hir_uclass_query(ClassQuery::Binary("letter"));
        let ascii = hir_uclass(&[('\0', '\x7F')]);
        assert_eq!(t(r"[\pL--[:ascii:]]"), hir_difference(letters, ascii));
    }

    let upper_ascii = hir_bclass(&[(b'A', b'Z')]);
    assert_eq!(t(r"(?-u)[[:alpha:]--[:lower:]]"), upper_ascii);
}
2871
// The `~~` symmetric-difference operator in Unicode and byte modes.
#[test]
fn class_bracketed_symmetric_difference() {
    #[cfg(feature = "unicode-script")]
    {
        let expected = hir_uclass(&[
            ('\u{0342}', '\u{0342}'),
            ('\u{0345}', '\u{0345}'),
            ('\u{1DC0}', '\u{1DC1}'),
        ]);
        assert_eq!(t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), expected);
    }

    let unicode_expected = hir_uclass(&[('a', 'b'), ('h', 'j')]);
    assert_eq!(t(r"[a-g~~c-j]"), unicode_expected);

    let byte_expected = hir_bclass(&[(b'a', b'b'), (b'h', b'j')]);
    assert_eq!(t(r"(?-u)[a-g~~c-j]"), byte_expected);
}
2890
// Patterns written with the `x` flag: whitespace and `#` comments inside
// escapes, class names and counted repetitions.
#[test]
fn ignore_whitespace() {
    // Single-line patterns paired with the literal they should produce.
    let literals = [
        (r"(?x)\12 3", "\n3"),
        (r"(?x)\x { 53 }", "S"),
        (r"(?x)\x 53", "S"),
        (r"(?x)\x5 3", "S"),
        (r"(?x)a\ # hi there", "a "),
    ];
    for &(pattern, literal) in literals.iter() {
        assert_eq!(t(pattern), hir_lit(literal), "pattern: {:?}", pattern);
    }

    // The same hex escapes spread over several commented lines.
    let braced_hex = r"(?x)\x # comment
                       { # comment
                           53 # comment
                       } #comment";
    assert_eq!(t(braced_hex), hir_lit("S"));

    let bare_hex = r"(?x)\x # comment
                     53 # comment";
    assert_eq!(t(bare_hex), hir_lit("S"));

    #[cfg(feature = "unicode-gencat")]
    {
        let class_name = r"(?x)\p # comment
                           { # comment
                               Separator # comment
                           } # comment";
        assert_eq!(
            t(class_name),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );
    }

    let counted = r"(?x)a # comment
                    { # comment
                        5 # comment
                        , # comment
                        10 # comment
                    } # comment";
    let expected = hir_range(
        true,
        hir::RepetitionRange::Bounded(5, 10),
        hir_lit("a"),
    );
    assert_eq!(t(counted), expected);
}
2936
// Checks `is_always_utf8` on patterns translated with `t_bytes`.
#[test]
fn analysis_is_always_utf8() {
    // Positive examples.
    let always_utf8 = [
        r"a",
        r"ab",
        r"(?-u)a",
        r"(?-u)ab",
        r"\xFF",
        r"\xFF\xFF",
        r"[^a]",
        r"[^a][^a]",
        r"\b",
        r"\B",
        r"(?-u)\b",
    ];
    for &pattern in always_utf8.iter() {
        assert!(
            t_bytes(pattern).is_always_utf8(),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples.
    let maybe_invalid = [
        r"(?-u)\xFF",
        r"(?-u)\xFF\xFF",
        r"(?-u)[^a]",
        r"(?-u)[^a][^a]",
        r"(?-u)\B",
    ];
    for &pattern in maybe_invalid.iter() {
        assert!(
            !t_bytes(pattern).is_always_utf8(),
            "pattern: {:?}",
            pattern
        );
    }
}
2959
// Checks `is_all_assertions` on translated patterns.
#[test]
fn analysis_is_all_assertions() {
    // Positive examples.
    let only_assertions = [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ];
    for &pattern in only_assertions.iter() {
        assert!(t(pattern).is_all_assertions(), "pattern: {:?}", pattern);
    }

    // Negative examples.
    assert!(!t(r"^a").is_all_assertions());
}
2977
// Checks the `is_anchored_{start,end}` and `is_line_anchored_{start,end}`
// predicates on translated patterns.
//
// The cases are table-driven so that every pattern pair gets all four
// checks. The hand-written `$^` group asserted `is_anchored_start` and
// `is_line_anchored_end` twice each, leaving `is_anchored_end` and
// `is_line_anchored_start` unchecked for that pattern.
#[test]
fn analysis_is_anchored() {
    // Positive examples. Each row is (pattern for the start predicates,
    // pattern for the end predicates).
    let anchored = [
        (r"^", r"$"),
        (r"^^", r"$$"),
        (r"^$", r"^$"),
        (r"^foo", r"foo$"),
        (r"^foo|^bar", r"foo$|bar$"),
        (r"^(foo|bar)", r"(foo|bar)$"),
        (r"^+", r"$+"),
        (r"^++", r"$++"),
        (r"(^)+", r"($)+"),
        (r"$^", r"$^"),
        (r"$^|^$", r"$^|^$"),
        (r"\b^", r"$\b"),
        (r"^(?m:^)", r"(?m:$)$"),
        (r"(?m:^)^", r"$(?m:$)"),
    ];
    for &(start, end) in anchored.iter() {
        assert!(t(start).is_anchored_start(), "pattern: {:?}", start);
        assert!(t(end).is_anchored_end(), "pattern: {:?}", end);
        assert!(t(start).is_line_anchored_start(), "pattern: {:?}", start);
        assert!(t(end).is_line_anchored_end(), "pattern: {:?}", end);
    }

    // Negative examples.

    // Patterns involving multi-line anchors are not text-anchored.
    for &pattern in [r"(?m)^", r"(?m:^$)|$^", r"$^|(?m:^$)"].iter() {
        assert!(!t(pattern).is_anchored_start(), "pattern: {:?}", pattern);
    }
    for &pattern in [r"(?m)$", r"(?m:^$)|$^", r"$^|(?m:^$)"].iter() {
        assert!(!t(pattern).is_anchored_end(), "pattern: {:?}", pattern);
    }

    // A literal on the outer side of the anchor: no predicate holds.
    for &pattern in [r"a^", r"$a"].iter() {
        let hir = t(pattern);
        assert!(!hir.is_anchored_start(), "pattern: {:?}", pattern);
        assert!(!hir.is_line_anchored_start(), "pattern: {:?}", pattern);
        assert!(!hir.is_anchored_end(), "pattern: {:?}", pattern);
        assert!(!hir.is_line_anchored_end(), "pattern: {:?}", pattern);
    }

    // Unanchored alternation branches and anchors under `*`. Rows have the
    // same layout as the positive table.
    let unanchored = [
        (r"^foo|bar", r"foo|bar$"),
        (r"^*", r"$*"),
        (r"^*+", r"$*+"),
        (r"^+*", r"$+*"),
        (r"(^)*", r"($)*"),
    ];
    for &(start, end) in unanchored.iter() {
        assert!(!t(start).is_anchored_start(), "pattern: {:?}", start);
        assert!(!t(end).is_anchored_end(), "pattern: {:?}", end);
        assert!(!t(start).is_line_anchored_start(), "pattern: {:?}", start);
        assert!(!t(end).is_line_anchored_end(), "pattern: {:?}", end);
    }
}
3086
// Checks the line-anchor predicates on patterns that use multi-line mode.
#[test]
fn analysis_is_line_anchored() {
    // Each row is (pattern for `is_line_anchored_start`, pattern for
    // `is_line_anchored_end`).
    let line_anchored = [
        (r"(?m)^(foo|bar)", r"(?m)(foo|bar)$"),
        (r"(?m)^foo|^bar", r"(?m)foo$|bar$"),
        (r"(?m)^", r"(?m)$"),
        (r"(?m:^$)|$^", r"(?m:^$)|$^"),
        (r"$^|(?m:^$)", r"$^|(?m:^$)"),
    ];
    for &(start, end) in line_anchored.iter() {
        assert!(t(start).is_line_anchored_start(), "pattern: {:?}", start);
        assert!(t(end).is_line_anchored_end(), "pattern: {:?}", end);
    }
}
3104
// Checks `is_any_anchored_start` and `is_any_anchored_end`.
#[test]
fn analysis_is_any_anchored() {
    // Positive examples.
    for &pattern in [r"^", r"\A"].iter() {
        assert!(t(pattern).is_any_anchored_start(), "pattern: {:?}", pattern);
    }
    for &pattern in [r"$", r"\z"].iter() {
        assert!(t(pattern).is_any_anchored_end(), "pattern: {:?}", pattern);
    }

    // Negative examples.
    for &pattern in [r"(?m)^", r"$"].iter() {
        assert!(
            !t(pattern).is_any_anchored_start(),
            "pattern: {:?}",
            pattern
        );
    }
    for &pattern in [r"(?m)$", r"^"].iter() {
        assert!(!t(pattern).is_any_anchored_end(), "pattern: {:?}", pattern);
    }
}
3119
// Checks `is_match_empty` on translated patterns.
#[test]
fn analysis_is_match_empty() {
    // Positive examples.
    let matches_empty = [
        r"",
        r"()",
        r"()*",
        r"()+",
        r"()?",
        r"a*",
        r"a?",
        r"a{0}",
        r"a{0,}",
        r"a{0,1}",
        r"a{0,10}",
        r"a*|b",
        r"b|a*",
        r"a|",
        r"|a",
        r"a||b",
        r"a*a?(abcd)*",
        r"^",
        r"$",
        r"(?m)^",
        r"(?m)$",
        r"\A",
        r"\z",
        r"\B",
        r"\b",
        r"(?-u)\b",
    ];
    for &pattern in matches_empty.iter() {
        assert!(t(pattern).is_match_empty(), "pattern: {:?}", pattern);
    }
    #[cfg(feature = "unicode-gencat")]
    assert!(t(r"\pL*").is_match_empty());
    // This one is the only case here translated with `t_bytes`.
    assert!(t_bytes(r"(?-u)\B").is_match_empty());

    // Negative examples.
    let never_empty = [
        r"a+",
        r"a{1}",
        r"a{1,}",
        r"a{1,2}",
        r"a{1,10}",
        r"b|a",
        r"a*a+(abcd)*",
    ];
    for &pattern in never_empty.iter() {
        assert!(!t(pattern).is_match_empty(), "pattern: {:?}", pattern);
    }
}
3162
// Checks `is_literal` on translated patterns.
#[test]
fn analysis_is_literal() {
    // Positive examples.
    for &pattern in [r"a", r"ab", r"abc", r"(?m)abc"].iter() {
        assert!(t(pattern).is_literal(), "pattern: {:?}", pattern);
    }

    // Negative examples.
    let not_literal = [
        r"",
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[a]",
    ];
    for &pattern in not_literal.iter() {
        assert!(!t(pattern).is_literal(), "pattern: {:?}", pattern);
    }
}
3181
// Checks `is_alternation_literal` on translated patterns.
#[test]
fn analysis_is_alternation_literal() {
    // Positive examples.
    let alternation_literals = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"a|b",
        r"a|b|c",
        r"foo|bar",
        r"foo|bar|baz",
    ];
    for &pattern in alternation_literals.iter() {
        assert!(
            t(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples.
    let others = [
        r"",
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[a]",
        r"[a]|b",
        r"a|[b]",
        r"(a)|b",
        r"a|(b)",
    ];
    for &pattern in others.iter() {
        assert!(
            !t(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }
}
3207 }
3208