• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*!
2 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
3 */
4 
5 use std::cmp;
6 use std::fmt;
7 use std::iter;
8 use std::mem;
9 use std::ops;
10 
11 use crate::hir::{self, Hir, HirKind};
12 
13 /// A set of literal byte strings extracted from a regular expression.
14 ///
15 /// Every member of the set is a `Literal`, which is represented by a
16 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
17 /// said to be either *complete* or *cut*. A complete literal means that
18 /// it extends until the beginning (or end) of the regular expression. In
19 /// some circumstances, this can be used to indicate a match in the regular
20 /// expression.
21 ///
22 /// A key aspect of literal extraction is knowing when to stop. It is not
23 /// feasible to blindly extract all literals from a regular expression, even if
24 /// there are finitely many. For example, the regular expression `[0-9]{10}`
25 /// has `10^10` distinct literals. For this reason, literal extraction is
26 /// bounded to some low number by default using heuristics, but the limits can
27 /// be tweaked.
28 ///
29 /// **WARNING**: Literal extraction uses stack space proportional to the size
30 /// of the `Hir` expression. At some point, this drawback will be eliminated.
31 /// To protect yourself, set a reasonable
32 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
33 /// This is done for you by default.
34 #[derive(Clone, Eq, PartialEq)]
35 pub struct Literals {
36     lits: Vec<Literal>,
37     limit_size: usize,
38     limit_class: usize,
39 }
40 
41 /// A single member of a set of literals extracted from a regular expression.
42 ///
43 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
44 /// and `Vec` operations are available.
45 #[derive(Clone, Eq, Ord)]
46 pub struct Literal {
47     v: Vec<u8>,
48     cut: bool,
49 }
50 
51 impl Literals {
52     /// Returns a new empty set of literals using default limits.
empty() -> Literals53     pub fn empty() -> Literals {
54         Literals { lits: vec![], limit_size: 250, limit_class: 10 }
55     }
56 
57     /// Returns a set of literal prefixes extracted from the given `Hir`.
prefixes(expr: &Hir) -> Literals58     pub fn prefixes(expr: &Hir) -> Literals {
59         let mut lits = Literals::empty();
60         lits.union_prefixes(expr);
61         lits
62     }
63 
64     /// Returns a set of literal suffixes extracted from the given `Hir`.
suffixes(expr: &Hir) -> Literals65     pub fn suffixes(expr: &Hir) -> Literals {
66         let mut lits = Literals::empty();
67         lits.union_suffixes(expr);
68         lits
69     }
70 
71     /// Get the approximate size limit (in bytes) of this set.
limit_size(&self) -> usize72     pub fn limit_size(&self) -> usize {
73         self.limit_size
74     }
75 
76     /// Set the approximate size limit (in bytes) of this set.
77     ///
78     /// If extracting a literal would put the set over this limit, then
79     /// extraction stops.
80     ///
81     /// The new limits will only apply to additions to this set. Existing
82     /// members remain unchanged, even if the set exceeds the new limit.
set_limit_size(&mut self, size: usize) -> &mut Literals83     pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
84         self.limit_size = size;
85         self
86     }
87 
88     /// Get the character class size limit for this set.
limit_class(&self) -> usize89     pub fn limit_class(&self) -> usize {
90         self.limit_class
91     }
92 
93     /// Limits the size of character(or byte) classes considered.
94     ///
95     /// A value of `0` prevents all character classes from being considered.
96     ///
97     /// This limit also applies to case insensitive literals, since each
98     /// character in the case insensitive literal is converted to a class, and
99     /// then case folded.
100     ///
101     /// The new limits will only apply to additions to this set. Existing
102     /// members remain unchanged, even if the set exceeds the new limit.
set_limit_class(&mut self, size: usize) -> &mut Literals103     pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
104         self.limit_class = size;
105         self
106     }
107 
108     /// Returns the set of literals as a slice. Its order is unspecified.
literals(&self) -> &[Literal]109     pub fn literals(&self) -> &[Literal] {
110         &self.lits
111     }
112 
113     /// Returns the length of the smallest literal.
114     ///
115     /// Returns None is there are no literals in the set.
min_len(&self) -> Option<usize>116     pub fn min_len(&self) -> Option<usize> {
117         let mut min = None;
118         for lit in &self.lits {
119             match min {
120                 None => min = Some(lit.len()),
121                 Some(m) if lit.len() < m => min = Some(lit.len()),
122                 _ => {}
123             }
124         }
125         min
126     }
127 
128     /// Returns true if all members in this set are complete.
all_complete(&self) -> bool129     pub fn all_complete(&self) -> bool {
130         !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
131     }
132 
133     /// Returns true if any member in this set is complete.
any_complete(&self) -> bool134     pub fn any_complete(&self) -> bool {
135         self.lits.iter().any(|lit| !lit.is_cut())
136     }
137 
138     /// Returns true if this set contains an empty literal.
contains_empty(&self) -> bool139     pub fn contains_empty(&self) -> bool {
140         self.lits.iter().any(|lit| lit.is_empty())
141     }
142 
143     /// Returns true if this set is empty or if all of its members is empty.
is_empty(&self) -> bool144     pub fn is_empty(&self) -> bool {
145         self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
146     }
147 
148     /// Returns a new empty set of literals using this set's limits.
to_empty(&self) -> Literals149     pub fn to_empty(&self) -> Literals {
150         let mut lits = Literals::empty();
151         lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
152         lits
153     }
154 
155     /// Returns the longest common prefix of all members in this set.
longest_common_prefix(&self) -> &[u8]156     pub fn longest_common_prefix(&self) -> &[u8] {
157         if self.is_empty() {
158             return &[];
159         }
160         let lit0 = &*self.lits[0];
161         let mut len = lit0.len();
162         for lit in &self.lits[1..] {
163             len = cmp::min(
164                 len,
165                 lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
166             );
167         }
168         &self.lits[0][..len]
169     }
170 
171     /// Returns the longest common suffix of all members in this set.
longest_common_suffix(&self) -> &[u8]172     pub fn longest_common_suffix(&self) -> &[u8] {
173         if self.is_empty() {
174             return &[];
175         }
176         let lit0 = &*self.lits[0];
177         let mut len = lit0.len();
178         for lit in &self.lits[1..] {
179             len = cmp::min(
180                 len,
181                 lit.iter()
182                     .rev()
183                     .zip(lit0.iter().rev())
184                     .take_while(|&(a, b)| a == b)
185                     .count(),
186             );
187         }
188         &self.lits[0][self.lits[0].len() - len..]
189     }
190 
191     /// Returns a new set of literals with the given number of bytes trimmed
192     /// from the suffix of each literal.
193     ///
194     /// If any literal would be cut out completely by trimming, then None is
195     /// returned.
196     ///
197     /// Any duplicates that are created as a result of this transformation are
198     /// removed.
trim_suffix(&self, num_bytes: usize) -> Option<Literals>199     pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
200         if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
201             return None;
202         }
203         let mut new = self.to_empty();
204         for mut lit in self.lits.iter().cloned() {
205             let new_len = lit.len() - num_bytes;
206             lit.truncate(new_len);
207             lit.cut();
208             new.lits.push(lit);
209         }
210         new.lits.sort();
211         new.lits.dedup();
212         Some(new)
213     }
214 
215     /// Returns a new set of prefixes of this set of literals that are
216     /// guaranteed to be unambiguous.
217     ///
218     /// Any substring match with a member of the set is returned is guaranteed
219     /// to never overlap with a substring match of another member of the set
220     /// at the same starting position.
221     ///
222     /// Given any two members of the returned set, neither is a substring of
223     /// the other.
unambiguous_prefixes(&self) -> Literals224     pub fn unambiguous_prefixes(&self) -> Literals {
225         if self.lits.is_empty() {
226             return self.to_empty();
227         }
228         let mut old = self.lits.to_vec();
229         let mut new = self.to_empty();
230         'OUTER: while let Some(mut candidate) = old.pop() {
231             if candidate.is_empty() {
232                 continue;
233             }
234             if new.lits.is_empty() {
235                 new.lits.push(candidate);
236                 continue;
237             }
238             for lit2 in &mut new.lits {
239                 if lit2.is_empty() {
240                     continue;
241                 }
242                 if &candidate == lit2 {
243                     // If the literal is already in the set, then we can
244                     // just drop it. But make sure that cut literals are
245                     // infectious!
246                     candidate.cut = candidate.cut || lit2.cut;
247                     lit2.cut = candidate.cut;
248                     continue 'OUTER;
249                 }
250                 if candidate.len() < lit2.len() {
251                     if let Some(i) = position(&candidate, &lit2) {
252                         candidate.cut();
253                         let mut lit3 = lit2.clone();
254                         lit3.truncate(i);
255                         lit3.cut();
256                         old.push(lit3);
257                         lit2.clear();
258                     }
259                 } else if let Some(i) = position(&lit2, &candidate) {
260                     lit2.cut();
261                     let mut new_candidate = candidate.clone();
262                     new_candidate.truncate(i);
263                     new_candidate.cut();
264                     old.push(new_candidate);
265                     candidate.clear();
266                 }
267                 // Oops, the candidate is already represented in the set.
268                 if candidate.is_empty() {
269                     continue 'OUTER;
270                 }
271             }
272             new.lits.push(candidate);
273         }
274         new.lits.retain(|lit| !lit.is_empty());
275         new.lits.sort();
276         new.lits.dedup();
277         new
278     }
279 
280     /// Returns a new set of suffixes of this set of literals that are
281     /// guaranteed to be unambiguous.
282     ///
283     /// Any substring match with a member of the set is returned is guaranteed
284     /// to never overlap with a substring match of another member of the set
285     /// at the same ending position.
286     ///
287     /// Given any two members of the returned set, neither is a substring of
288     /// the other.
unambiguous_suffixes(&self) -> Literals289     pub fn unambiguous_suffixes(&self) -> Literals {
290         // This is a touch wasteful...
291         let mut lits = self.clone();
292         lits.reverse();
293         let mut unamb = lits.unambiguous_prefixes();
294         unamb.reverse();
295         unamb
296     }
297 
298     /// Unions the prefixes from the given expression to this set.
299     ///
300     /// If prefixes could not be added (for example, this set would exceed its
301     /// size limits or the set of prefixes from `expr` includes the empty
302     /// string), then false is returned.
303     ///
304     /// Note that prefix literals extracted from `expr` are said to be complete
305     /// if and only if the literal extends from the beginning of `expr` to the
306     /// end of `expr`.
union_prefixes(&mut self, expr: &Hir) -> bool307     pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
308         let mut lits = self.to_empty();
309         prefixes(expr, &mut lits);
310         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
311     }
312 
313     /// Unions the suffixes from the given expression to this set.
314     ///
315     /// If suffixes could not be added (for example, this set would exceed its
316     /// size limits or the set of suffixes from `expr` includes the empty
317     /// string), then false is returned.
318     ///
319     /// Note that prefix literals extracted from `expr` are said to be complete
320     /// if and only if the literal extends from the end of `expr` to the
321     /// beginning of `expr`.
union_suffixes(&mut self, expr: &Hir) -> bool322     pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
323         let mut lits = self.to_empty();
324         suffixes(expr, &mut lits);
325         lits.reverse();
326         !lits.is_empty() && !lits.contains_empty() && self.union(lits)
327     }
328 
329     /// Unions this set with another set.
330     ///
331     /// If the union would cause the set to exceed its limits, then the union
332     /// is skipped and it returns false. Otherwise, if the union succeeds, it
333     /// returns true.
union(&mut self, lits: Literals) -> bool334     pub fn union(&mut self, lits: Literals) -> bool {
335         if self.num_bytes() + lits.num_bytes() > self.limit_size {
336             return false;
337         }
338         if lits.is_empty() {
339             self.lits.push(Literal::empty());
340         } else {
341             self.lits.extend(lits.lits);
342         }
343         true
344     }
345 
346     /// Extends this set with another set.
347     ///
348     /// The set of literals is extended via a cross product.
349     ///
350     /// If a cross product would cause this set to exceed its limits, then the
351     /// cross product is skipped and it returns false. Otherwise, if the cross
352     /// product succeeds, it returns true.
cross_product(&mut self, lits: &Literals) -> bool353     pub fn cross_product(&mut self, lits: &Literals) -> bool {
354         if lits.is_empty() {
355             return true;
356         }
357         // Check that we make sure we stay in our limits.
358         let mut size_after;
359         if self.is_empty() || !self.any_complete() {
360             size_after = self.num_bytes();
361             for lits_lit in lits.literals() {
362                 size_after += lits_lit.len();
363             }
364         } else {
365             size_after = self.lits.iter().fold(0, |accum, lit| {
366                 accum + if lit.is_cut() { lit.len() } else { 0 }
367             });
368             for lits_lit in lits.literals() {
369                 for self_lit in self.literals() {
370                     if !self_lit.is_cut() {
371                         size_after += self_lit.len() + lits_lit.len();
372                     }
373                 }
374             }
375         }
376         if size_after > self.limit_size {
377             return false;
378         }
379 
380         let mut base = self.remove_complete();
381         if base.is_empty() {
382             base = vec![Literal::empty()];
383         }
384         for lits_lit in lits.literals() {
385             for mut self_lit in base.clone() {
386                 self_lit.extend(&**lits_lit);
387                 self_lit.cut = lits_lit.cut;
388                 self.lits.push(self_lit);
389             }
390         }
391         true
392     }
393 
394     /// Extends each literal in this set with the bytes given.
395     ///
396     /// If the set is empty, then the given literal is added to the set.
397     ///
398     /// If adding any number of bytes to all members of this set causes a limit
399     /// to be exceeded, then no bytes are added and false is returned. If a
400     /// prefix of `bytes` can be fit into this set, then it is used and all
401     /// resulting literals are cut.
cross_add(&mut self, bytes: &[u8]) -> bool402     pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
403         // N.B. This could be implemented by simply calling cross_product with
404         // a literal set containing just `bytes`, but we can be smarter about
405         // taking shorter prefixes of `bytes` if they'll fit.
406         if bytes.is_empty() {
407             return true;
408         }
409         if self.lits.is_empty() {
410             let i = cmp::min(self.limit_size, bytes.len());
411             self.lits.push(Literal::new(bytes[..i].to_owned()));
412             self.lits[0].cut = i < bytes.len();
413             return !self.lits[0].is_cut();
414         }
415         let size = self.num_bytes();
416         if size + self.lits.len() >= self.limit_size {
417             return false;
418         }
419         let mut i = 1;
420         while size + (i * self.lits.len()) <= self.limit_size
421             && i < bytes.len()
422         {
423             i += 1;
424         }
425         for lit in &mut self.lits {
426             if !lit.is_cut() {
427                 lit.extend(&bytes[..i]);
428                 if i < bytes.len() {
429                     lit.cut();
430                 }
431             }
432         }
433         true
434     }
435 
436     /// Adds the given literal to this set.
437     ///
438     /// Returns false if adding this literal would cause the class to be too
439     /// big.
add(&mut self, lit: Literal) -> bool440     pub fn add(&mut self, lit: Literal) -> bool {
441         if self.num_bytes() + lit.len() > self.limit_size {
442             return false;
443         }
444         self.lits.push(lit);
445         true
446     }
447 
448     /// Extends each literal in this set with the character class given.
449     ///
450     /// Returns false if the character class was too big to add.
add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool451     pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
452         self._add_char_class(cls, false)
453     }
454 
455     /// Extends each literal in this set with the character class given,
456     /// writing the bytes of each character in reverse.
457     ///
458     /// Returns false if the character class was too big to add.
add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool459     fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
460         self._add_char_class(cls, true)
461     }
462 
_add_char_class( &mut self, cls: &hir::ClassUnicode, reverse: bool, ) -> bool463     fn _add_char_class(
464         &mut self,
465         cls: &hir::ClassUnicode,
466         reverse: bool,
467     ) -> bool {
468         use std::char;
469 
470         if self.class_exceeds_limits(cls_char_count(cls)) {
471             return false;
472         }
473         let mut base = self.remove_complete();
474         if base.is_empty() {
475             base = vec![Literal::empty()];
476         }
477         for r in cls.iter() {
478             let (s, e) = (r.start as u32, r.end as u32 + 1);
479             for c in (s..e).filter_map(char::from_u32) {
480                 for mut lit in base.clone() {
481                     let mut bytes = c.to_string().into_bytes();
482                     if reverse {
483                         bytes.reverse();
484                     }
485                     lit.extend(&bytes);
486                     self.lits.push(lit);
487                 }
488             }
489         }
490         true
491     }
492 
493     /// Extends each literal in this set with the byte class given.
494     ///
495     /// Returns false if the byte class was too big to add.
add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool496     pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
497         if self.class_exceeds_limits(cls_byte_count(cls)) {
498             return false;
499         }
500         let mut base = self.remove_complete();
501         if base.is_empty() {
502             base = vec![Literal::empty()];
503         }
504         for r in cls.iter() {
505             let (s, e) = (r.start as u32, r.end as u32 + 1);
506             for b in (s..e).map(|b| b as u8) {
507                 for mut lit in base.clone() {
508                     lit.push(b);
509                     self.lits.push(lit);
510                 }
511             }
512         }
513         true
514     }
515 
516     /// Cuts every member of this set. When a member is cut, it can never
517     /// be extended.
cut(&mut self)518     pub fn cut(&mut self) {
519         for lit in &mut self.lits {
520             lit.cut();
521         }
522     }
523 
524     /// Reverses all members in place.
reverse(&mut self)525     pub fn reverse(&mut self) {
526         for lit in &mut self.lits {
527             lit.reverse();
528         }
529     }
530 
531     /// Clears this set of all members.
clear(&mut self)532     pub fn clear(&mut self) {
533         self.lits.clear();
534     }
535 
536     /// Pops all complete literals out of this set.
remove_complete(&mut self) -> Vec<Literal>537     fn remove_complete(&mut self) -> Vec<Literal> {
538         let mut base = vec![];
539         for lit in mem::replace(&mut self.lits, vec![]) {
540             if lit.is_cut() {
541                 self.lits.push(lit);
542             } else {
543                 base.push(lit);
544             }
545         }
546         base
547     }
548 
549     /// Returns the total number of bytes in this set.
num_bytes(&self) -> usize550     fn num_bytes(&self) -> usize {
551         self.lits.iter().fold(0, |accum, lit| accum + lit.len())
552     }
553 
554     /// Returns true if a character class with the given size would cause this
555     /// set to exceed its limits.
556     ///
557     /// The size given should correspond to the number of items in the class.
class_exceeds_limits(&self, size: usize) -> bool558     fn class_exceeds_limits(&self, size: usize) -> bool {
559         if size > self.limit_class {
560             return true;
561         }
562         // This is an approximation since codepoints in a char class can encode
563         // to 1-4 bytes.
564         let new_byte_count = if self.lits.is_empty() {
565             size
566         } else {
567             self.lits.iter().fold(0, |accum, lit| {
568                 accum
569                     + if lit.is_cut() {
570                         // If the literal is cut, then we'll never add
571                         // anything to it, so don't count it.
572                         0
573                     } else {
574                         (lit.len() + 1) * size
575                     }
576             })
577         };
578         new_byte_count > self.limit_size
579     }
580 }
581 
prefixes(expr: &Hir, lits: &mut Literals)582 fn prefixes(expr: &Hir, lits: &mut Literals) {
583     match *expr.kind() {
584         HirKind::Literal(hir::Literal::Unicode(c)) => {
585             let mut buf = [0; 4];
586             lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
587         }
588         HirKind::Literal(hir::Literal::Byte(b)) => {
589             lits.cross_add(&[b]);
590         }
591         HirKind::Class(hir::Class::Unicode(ref cls)) => {
592             if !lits.add_char_class(cls) {
593                 lits.cut();
594             }
595         }
596         HirKind::Class(hir::Class::Bytes(ref cls)) => {
597             if !lits.add_byte_class(cls) {
598                 lits.cut();
599             }
600         }
601         HirKind::Group(hir::Group { ref hir, .. }) => {
602             prefixes(&**hir, lits);
603         }
604         HirKind::Repetition(ref x) => match x.kind {
605             hir::RepetitionKind::ZeroOrOne => {
606                 repeat_zero_or_one_literals(&x.hir, lits, prefixes);
607             }
608             hir::RepetitionKind::ZeroOrMore => {
609                 repeat_zero_or_more_literals(&x.hir, lits, prefixes);
610             }
611             hir::RepetitionKind::OneOrMore => {
612                 repeat_one_or_more_literals(&x.hir, lits, prefixes);
613             }
614             hir::RepetitionKind::Range(ref rng) => {
615                 let (min, max) = match *rng {
616                     hir::RepetitionRange::Exactly(m) => (m, Some(m)),
617                     hir::RepetitionRange::AtLeast(m) => (m, None),
618                     hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
619                 };
620                 repeat_range_literals(
621                     &x.hir, min, max, x.greedy, lits, prefixes,
622                 )
623             }
624         },
625         HirKind::Concat(ref es) if es.is_empty() => {}
626         HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
627         HirKind::Concat(ref es) => {
628             for e in es {
629                 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
630                     if !lits.is_empty() {
631                         lits.cut();
632                         break;
633                     }
634                     lits.add(Literal::empty());
635                     continue;
636                 }
637                 let mut lits2 = lits.to_empty();
638                 prefixes(e, &mut lits2);
639                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
640                     // If this expression couldn't yield any literal that
641                     // could be extended, then we need to quit. Since we're
642                     // short-circuiting, we also need to freeze every member.
643                     lits.cut();
644                     break;
645                 }
646             }
647         }
648         HirKind::Alternation(ref es) => {
649             alternate_literals(es, lits, prefixes);
650         }
651         _ => lits.cut(),
652     }
653 }
654 
suffixes(expr: &Hir, lits: &mut Literals)655 fn suffixes(expr: &Hir, lits: &mut Literals) {
656     match *expr.kind() {
657         HirKind::Literal(hir::Literal::Unicode(c)) => {
658             let mut buf = [0u8; 4];
659             let i = c.encode_utf8(&mut buf).len();
660             let buf = &mut buf[..i];
661             buf.reverse();
662             lits.cross_add(buf);
663         }
664         HirKind::Literal(hir::Literal::Byte(b)) => {
665             lits.cross_add(&[b]);
666         }
667         HirKind::Class(hir::Class::Unicode(ref cls)) => {
668             if !lits.add_char_class_reverse(cls) {
669                 lits.cut();
670             }
671         }
672         HirKind::Class(hir::Class::Bytes(ref cls)) => {
673             if !lits.add_byte_class(cls) {
674                 lits.cut();
675             }
676         }
677         HirKind::Group(hir::Group { ref hir, .. }) => {
678             suffixes(&**hir, lits);
679         }
680         HirKind::Repetition(ref x) => match x.kind {
681             hir::RepetitionKind::ZeroOrOne => {
682                 repeat_zero_or_one_literals(&x.hir, lits, suffixes);
683             }
684             hir::RepetitionKind::ZeroOrMore => {
685                 repeat_zero_or_more_literals(&x.hir, lits, suffixes);
686             }
687             hir::RepetitionKind::OneOrMore => {
688                 repeat_one_or_more_literals(&x.hir, lits, suffixes);
689             }
690             hir::RepetitionKind::Range(ref rng) => {
691                 let (min, max) = match *rng {
692                     hir::RepetitionRange::Exactly(m) => (m, Some(m)),
693                     hir::RepetitionRange::AtLeast(m) => (m, None),
694                     hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
695                 };
696                 repeat_range_literals(
697                     &x.hir, min, max, x.greedy, lits, suffixes,
698                 )
699             }
700         },
701         HirKind::Concat(ref es) if es.is_empty() => {}
702         HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
703         HirKind::Concat(ref es) => {
704             for e in es.iter().rev() {
705                 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
706                     if !lits.is_empty() {
707                         lits.cut();
708                         break;
709                     }
710                     lits.add(Literal::empty());
711                     continue;
712                 }
713                 let mut lits2 = lits.to_empty();
714                 suffixes(e, &mut lits2);
715                 if !lits.cross_product(&lits2) || !lits2.any_complete() {
716                     // If this expression couldn't yield any literal that
717                     // could be extended, then we need to quit. Since we're
718                     // short-circuiting, we also need to freeze every member.
719                     lits.cut();
720                     break;
721                 }
722             }
723         }
724         HirKind::Alternation(ref es) => {
725             alternate_literals(es, lits, suffixes);
726         }
727         _ => lits.cut(),
728     }
729 }
730 
repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )731 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
732     e: &Hir,
733     lits: &mut Literals,
734     mut f: F,
735 ) {
736     f(
737         &Hir::repetition(hir::Repetition {
738             kind: hir::RepetitionKind::ZeroOrMore,
739             // FIXME: Our literal extraction doesn't care about greediness.
740             // Which is partially why we're treating 'e?' as 'e*'. Namely,
741             // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
742             // [Complete(a), Complete(ab)] because of the non-greediness.
743             greedy: true,
744             hir: Box::new(e.clone()),
745         }),
746         lits,
747     );
748 }
749 
repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )750 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
751     e: &Hir,
752     lits: &mut Literals,
753     mut f: F,
754 ) {
755     let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
756     lits3.set_limit_size(lits.limit_size() / 2);
757     f(e, &mut lits3);
758 
759     if lits3.is_empty() || !lits2.cross_product(&lits3) {
760         lits.cut();
761         return;
762     }
763     lits2.cut();
764     lits2.add(Literal::empty());
765     if !lits.union(lits2) {
766         lits.cut();
767     }
768 }
769 
repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )770 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
771     e: &Hir,
772     lits: &mut Literals,
773     mut f: F,
774 ) {
775     f(e, lits);
776     lits.cut();
777 }
778 
repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, min: u32, max: Option<u32>, greedy: bool, lits: &mut Literals, mut f: F, )779 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
780     e: &Hir,
781     min: u32,
782     max: Option<u32>,
783     greedy: bool,
784     lits: &mut Literals,
785     mut f: F,
786 ) {
787     if min == 0 {
788         // This is a bit conservative. If `max` is set, then we could
789         // treat this as a finite set of alternations. For now, we
790         // just treat it as `e*`.
791         f(
792             &Hir::repetition(hir::Repetition {
793                 kind: hir::RepetitionKind::ZeroOrMore,
794                 greedy,
795                 hir: Box::new(e.clone()),
796             }),
797             lits,
798         );
799     } else {
800         if min > 0 {
801             let n = cmp::min(lits.limit_size, min as usize);
802             let es = iter::repeat(e.clone()).take(n).collect();
803             f(&Hir::concat(es), lits);
804             if n < min as usize || lits.contains_empty() {
805                 lits.cut();
806             }
807         }
808         if max.map_or(true, |max| min < max) {
809             lits.cut();
810         }
811     }
812 }
813 
alternate_literals<F: FnMut(&Hir, &mut Literals)>( es: &[Hir], lits: &mut Literals, mut f: F, )814 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
815     es: &[Hir],
816     lits: &mut Literals,
817     mut f: F,
818 ) {
819     let mut lits2 = lits.to_empty();
820     for e in es {
821         let mut lits3 = lits.to_empty();
822         lits3.set_limit_size(lits.limit_size() / 5);
823         f(e, &mut lits3);
824         if lits3.is_empty() || !lits2.union(lits3) {
825             // If we couldn't find suffixes for *any* of the
826             // alternates, then the entire alternation has to be thrown
827             // away and any existing members must be frozen. Similarly,
828             // if the union couldn't complete, stop and freeze.
829             lits.cut();
830             return;
831         }
832     }
833     if !lits.cross_product(&lits2) {
834         lits.cut();
835     }
836 }
837 
838 impl fmt::Debug for Literals {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result839     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
840         f.debug_struct("Literals")
841             .field("lits", &self.lits)
842             .field("limit_size", &self.limit_size)
843             .field("limit_class", &self.limit_class)
844             .finish()
845     }
846 }
847 
848 impl Literal {
849     /// Returns a new complete literal with the bytes given.
new(bytes: Vec<u8>) -> Literal850     pub fn new(bytes: Vec<u8>) -> Literal {
851         Literal { v: bytes, cut: false }
852     }
853 
854     /// Returns a new complete empty literal.
empty() -> Literal855     pub fn empty() -> Literal {
856         Literal { v: vec![], cut: false }
857     }
858 
859     /// Returns true if this literal was "cut."
is_cut(&self) -> bool860     pub fn is_cut(&self) -> bool {
861         self.cut
862     }
863 
864     /// Cuts this literal.
cut(&mut self)865     pub fn cut(&mut self) {
866         self.cut = true;
867     }
868 }
869 
870 impl PartialEq for Literal {
eq(&self, other: &Literal) -> bool871     fn eq(&self, other: &Literal) -> bool {
872         self.v == other.v
873     }
874 }
875 
876 impl PartialOrd for Literal {
partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering>877     fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
878         self.v.partial_cmp(&other.v)
879     }
880 }
881 
882 impl fmt::Debug for Literal {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result883     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
884         if self.is_cut() {
885             write!(f, "Cut({})", escape_unicode(&self.v))
886         } else {
887             write!(f, "Complete({})", escape_unicode(&self.v))
888         }
889     }
890 }
891 
892 impl AsRef<[u8]> for Literal {
as_ref(&self) -> &[u8]893     fn as_ref(&self) -> &[u8] {
894         &self.v
895     }
896 }
897 
898 impl ops::Deref for Literal {
899     type Target = Vec<u8>;
deref(&self) -> &Vec<u8>900     fn deref(&self) -> &Vec<u8> {
901         &self.v
902     }
903 }
904 
905 impl ops::DerefMut for Literal {
deref_mut(&mut self) -> &mut Vec<u8>906     fn deref_mut(&mut self) -> &mut Vec<u8> {
907         &mut self.v
908     }
909 }
910 
position(needle: &[u8], mut haystack: &[u8]) -> Option<usize>911 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
912     let mut i = 0;
913     while haystack.len() >= needle.len() {
914         if needle == &haystack[..needle.len()] {
915             return Some(i);
916         }
917         i += 1;
918         haystack = &haystack[1..];
919     }
920     None
921 }
922 
escape_unicode(bytes: &[u8]) -> String923 fn escape_unicode(bytes: &[u8]) -> String {
924     let show = match ::std::str::from_utf8(bytes) {
925         Ok(v) => v.to_string(),
926         Err(_) => escape_bytes(bytes),
927     };
928     let mut space_escaped = String::new();
929     for c in show.chars() {
930         if c.is_whitespace() {
931             let escaped = if c as u32 <= 0x7F {
932                 escape_byte(c as u8)
933             } else if c as u32 <= 0xFFFF {
934                 format!(r"\u{{{:04x}}}", c as u32)
935             } else {
936                 format!(r"\U{{{:08x}}}", c as u32)
937             };
938             space_escaped.push_str(&escaped);
939         } else {
940             space_escaped.push(c);
941         }
942     }
943     space_escaped
944 }
945 
escape_bytes(bytes: &[u8]) -> String946 fn escape_bytes(bytes: &[u8]) -> String {
947     let mut s = String::new();
948     for &b in bytes {
949         s.push_str(&escape_byte(b));
950     }
951     s
952 }
953 
escape_byte(byte: u8) -> String954 fn escape_byte(byte: u8) -> String {
955     use std::ascii::escape_default;
956 
957     let escaped: Vec<u8> = escape_default(byte).collect();
958     String::from_utf8_lossy(&escaped).into_owned()
959 }
960 
cls_char_count(cls: &hir::ClassUnicode) -> usize961 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
962     cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
963         as usize
964 }
965 
cls_byte_count(cls: &hir::ClassBytes) -> usize966 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
967     cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
968         as usize
969 }
970 
971 #[cfg(test)]
972 mod tests {
973     use std::fmt;
974 
975     use super::{escape_bytes, Literal, Literals};
976     use crate::hir::Hir;
977     use crate::ParserBuilder;
978 
979     // To make test failures easier to read.
980     #[derive(Debug, Eq, PartialEq)]
981     struct Bytes(Vec<ULiteral>);
982     #[derive(Debug, Eq, PartialEq)]
983     struct Unicode(Vec<ULiteral>);
984 
escape_lits(blits: &[Literal]) -> Vec<ULiteral>985     fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
986         let mut ulits = vec![];
987         for blit in blits {
988             ulits
989                 .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
990         }
991         ulits
992     }
993 
create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals994     fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
995         Literals {
996             lits: it.into_iter().collect(),
997             limit_size: 0,
998             limit_class: 0,
999         }
1000     }
1001 
1002     // Needs to be pub for 1.3?
1003     #[derive(Clone, Eq, PartialEq)]
1004     pub struct ULiteral {
1005         v: String,
1006         cut: bool,
1007     }
1008 
1009     impl ULiteral {
is_cut(&self) -> bool1010         fn is_cut(&self) -> bool {
1011             self.cut
1012         }
1013     }
1014 
1015     impl fmt::Debug for ULiteral {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1016         fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1017             if self.is_cut() {
1018                 write!(f, "Cut({})", self.v)
1019             } else {
1020                 write!(f, "Complete({})", self.v)
1021             }
1022         }
1023     }
1024 
1025     impl PartialEq<Literal> for ULiteral {
eq(&self, other: &Literal) -> bool1026         fn eq(&self, other: &Literal) -> bool {
1027             self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1028         }
1029     }
1030 
1031     impl PartialEq<ULiteral> for Literal {
eq(&self, other: &ULiteral) -> bool1032         fn eq(&self, other: &ULiteral) -> bool {
1033             &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1034         }
1035     }
1036 
1037     #[allow(non_snake_case)]
C(s: &'static str) -> ULiteral1038     fn C(s: &'static str) -> ULiteral {
1039         ULiteral { v: s.to_owned(), cut: true }
1040     }
1041     #[allow(non_snake_case)]
M(s: &'static str) -> ULiteral1042     fn M(s: &'static str) -> ULiteral {
1043         ULiteral { v: s.to_owned(), cut: false }
1044     }
1045 
prefixes(lits: &mut Literals, expr: &Hir)1046     fn prefixes(lits: &mut Literals, expr: &Hir) {
1047         lits.union_prefixes(expr);
1048     }
1049 
suffixes(lits: &mut Literals, expr: &Hir)1050     fn suffixes(lits: &mut Literals, expr: &Hir) {
1051         lits.union_suffixes(expr);
1052     }
1053 
1054     macro_rules! assert_lit_eq {
1055         ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1056             let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1057             let lits = $got_lits;
1058             assert_eq!(
1059                 $which(expected.clone()),
1060                 $which(escape_lits(lits.literals())));
1061             assert_eq!(
1062                 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1063                 lits.all_complete());
1064             assert_eq!(
1065                 expected.iter().any(|l| !l.is_cut()),
1066                 lits.any_complete());
1067         }};
1068     }
1069 
1070     macro_rules! test_lit {
1071         ($name:ident, $which:ident, $re:expr) => {
1072             test_lit!($name, $which, $re,);
1073         };
1074         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1075             #[test]
1076             fn $name() {
1077                 let expr = ParserBuilder::new()
1078                     .build()
1079                     .parse($re)
1080                     .unwrap();
1081                 let lits = Literals::$which(&expr);
1082                 assert_lit_eq!(Unicode, lits, $($lit),*);
1083 
1084                 let expr = ParserBuilder::new()
1085                     .allow_invalid_utf8(true)
1086                     .unicode(false)
1087                     .build()
1088                     .parse($re)
1089                     .unwrap();
1090                 let lits = Literals::$which(&expr);
1091                 assert_lit_eq!(Bytes, lits, $($lit),*);
1092             }
1093         };
1094     }
1095 
1096     // ************************************************************************
1097     // Tests for prefix literal extraction.
1098     // ************************************************************************
1099 
1100     // Elementary tests.
1101     test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1102     test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1103     test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1104     #[cfg(feature = "unicode-case")]
1105     test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1106     test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1107     test_lit!(
1108         pfx_class2,
1109         prefixes,
1110         "(?u)[☃Ⅰ]",
1111         M("\\xe2\\x85\\xa0"),
1112         M("\\xe2\\x98\\x83")
1113     );
1114     #[cfg(feature = "unicode-case")]
1115     test_lit!(
1116         pfx_class3,
1117         prefixes,
1118         "(?ui)[☃Ⅰ]",
1119         M("\\xe2\\x85\\xa0"),
1120         M("\\xe2\\x85\\xb0"),
1121         M("\\xe2\\x98\\x83")
1122     );
1123     test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1124     test_lit!(
1125         pfx_one_lit_casei2,
1126         prefixes,
1127         "(?i-u)abc",
1128         M("ABC"),
1129         M("aBC"),
1130         M("AbC"),
1131         M("abC"),
1132         M("ABc"),
1133         M("aBc"),
1134         M("Abc"),
1135         M("abc")
1136     );
1137     test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1138     test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1139     test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1140     test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1141     // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1142     // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
1143     // a cut literal.
1144     test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
1145     test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1146     test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1147     test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1148     test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1149     test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1150     test_lit!(pfx_rep_range1, prefixes, "a{0}");
1151     test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1152     test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1153     test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1154     test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1155     test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1156     test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1157 
1158     // Test regexes with concatenations.
1159     test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1160     test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1161     test_lit!(
1162         pfx_cat3,
1163         prefixes,
1164         "(?i-u)[ab]z",
1165         M("AZ"),
1166         M("BZ"),
1167         M("aZ"),
1168         M("bZ"),
1169         M("Az"),
1170         M("Bz"),
1171         M("az"),
1172         M("bz")
1173     );
1174     test_lit!(
1175         pfx_cat4,
1176         prefixes,
1177         "[ab][yz]",
1178         M("ay"),
1179         M("by"),
1180         M("az"),
1181         M("bz")
1182     );
1183     test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1184     test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1185     test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1186     test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1187     test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1188     test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1189     test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1190     test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1191     test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1192     test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1193     test_lit!(pfx_cat15, prefixes, "$a");
1194     test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1195     test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1196     test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1197     test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1198 
1199     // Test regexes with alternations.
1200     test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1201     test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1202     test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1203     test_lit!(pfx_alt4, prefixes, "a|b*");
1204     test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1205     test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1206     test_lit!(
1207         pfx_alt7,
1208         prefixes,
1209         "(a|b)*c|(a|ab)*c",
1210         C("a"),
1211         C("b"),
1212         M("c"),
1213         C("a"),
1214         C("ab"),
1215         M("c")
1216     );
1217     test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1218 
1219     // Test regexes with empty assertions.
1220     test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1221     test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1222     test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1223     test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1224 
1225     // Make sure some curious regexes have no prefixes.
1226     test_lit!(pfx_nothing1, prefixes, ".");
1227     test_lit!(pfx_nothing2, prefixes, "(?s).");
1228     test_lit!(pfx_nothing3, prefixes, "^");
1229     test_lit!(pfx_nothing4, prefixes, "$");
1230     test_lit!(pfx_nothing6, prefixes, "(?m)$");
1231     test_lit!(pfx_nothing7, prefixes, r"\b");
1232     test_lit!(pfx_nothing8, prefixes, r"\B");
1233 
1234     // Test a few regexes that defeat any prefix literal detection.
1235     test_lit!(pfx_defeated1, prefixes, ".a");
1236     test_lit!(pfx_defeated2, prefixes, "(?s).a");
1237     test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1238     test_lit!(pfx_defeated4, prefixes, "a|.");
1239     test_lit!(pfx_defeated5, prefixes, ".|a");
1240     test_lit!(pfx_defeated6, prefixes, "a|^");
1241     test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1242     test_lit!(pfx_defeated8, prefixes, "$a");
1243     test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1244     test_lit!(pfx_defeated10, prefixes, r"\ba");
1245     test_lit!(pfx_defeated11, prefixes, r"\Ba");
1246     test_lit!(pfx_defeated12, prefixes, "^*a");
1247     test_lit!(pfx_defeated13, prefixes, "^+a");
1248 
1249     test_lit!(
1250         pfx_crazy1,
1251         prefixes,
1252         r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1253         C("Mo\\'"),
1254         C("Mu\\'"),
1255         C("Moam"),
1256         C("Muam")
1257     );
1258 
1259     // ************************************************************************
1260     // Tests for quiting prefix literal search.
1261     // ************************************************************************
1262 
1263     macro_rules! test_exhausted {
1264         ($name:ident, $which:ident, $re:expr) => {
1265             test_exhausted!($name, $which, $re,);
1266         };
1267         ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1268             #[test]
1269             fn $name() {
1270                 let expr = ParserBuilder::new()
1271                     .build()
1272                     .parse($re)
1273                     .unwrap();
1274                 let mut lits = Literals::empty();
1275                 lits.set_limit_size(20).set_limit_class(10);
1276                 $which(&mut lits, &expr);
1277                 assert_lit_eq!(Unicode, lits, $($lit),*);
1278 
1279                 let expr = ParserBuilder::new()
1280                     .allow_invalid_utf8(true)
1281                     .unicode(false)
1282                     .build()
1283                     .parse($re)
1284                     .unwrap();
1285                 let mut lits = Literals::empty();
1286                 lits.set_limit_size(20).set_limit_class(10);
1287                 $which(&mut lits, &expr);
1288                 assert_lit_eq!(Bytes, lits, $($lit),*);
1289             }
1290         };
1291     }
1292 
1293     // These test use a much lower limit than the default so that we can
1294     // write test cases of reasonable size.
1295     test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1296     test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1297     test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1298     test_exhausted!(
1299         pfx_exhausted4,
1300         prefixes,
1301         "(?i-u)foobar",
1302         C("FO"),
1303         C("fO"),
1304         C("Fo"),
1305         C("fo")
1306     );
1307     test_exhausted!(
1308         pfx_exhausted5,
1309         prefixes,
1310         "(?:ab){100}",
1311         C("abababababababababab")
1312     );
1313     test_exhausted!(
1314         pfx_exhausted6,
1315         prefixes,
1316         "(?:(?:ab){100})*cd",
1317         C("ababababab"),
1318         M("cd")
1319     );
1320     test_exhausted!(
1321         pfx_exhausted7,
1322         prefixes,
1323         "z(?:(?:ab){100})*cd",
1324         C("zababababab"),
1325         M("zcd")
1326     );
1327     test_exhausted!(
1328         pfx_exhausted8,
1329         prefixes,
1330         "aaaaaaaaaaaaaaaaaaaaz",
1331         C("aaaaaaaaaaaaaaaaaaaa")
1332     );
1333 
1334     // ************************************************************************
1335     // Tests for suffix literal extraction.
1336     // ************************************************************************
1337 
1338     // Elementary tests.
1339     test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1340     test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1341     test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1342     #[cfg(feature = "unicode-case")]
1343     test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1344     test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1345     test_lit!(
1346         sfx_class2,
1347         suffixes,
1348         "(?u)[☃Ⅰ]",
1349         M("\\xe2\\x85\\xa0"),
1350         M("\\xe2\\x98\\x83")
1351     );
1352     #[cfg(feature = "unicode-case")]
1353     test_lit!(
1354         sfx_class3,
1355         suffixes,
1356         "(?ui)[☃Ⅰ]",
1357         M("\\xe2\\x85\\xa0"),
1358         M("\\xe2\\x85\\xb0"),
1359         M("\\xe2\\x98\\x83")
1360     );
1361     test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1362     test_lit!(
1363         sfx_one_lit_casei2,
1364         suffixes,
1365         "(?i-u)abc",
1366         M("ABC"),
1367         M("ABc"),
1368         M("AbC"),
1369         M("Abc"),
1370         M("aBC"),
1371         M("aBc"),
1372         M("abC"),
1373         M("abc")
1374     );
1375     test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1376     test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1377     test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1378     test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1379     test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1380     test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1381     test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1382     test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1383     test_lit!(sfx_rep_range1, suffixes, "a{0}");
1384     test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1385     test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1386     test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1387     test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1388     test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1389     test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1390 
1391     // Test regexes with concatenations.
1392     test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1393     test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1394     test_lit!(
1395         sfx_cat3,
1396         suffixes,
1397         "(?i-u)[ab]z",
1398         M("AZ"),
1399         M("Az"),
1400         M("BZ"),
1401         M("Bz"),
1402         M("aZ"),
1403         M("az"),
1404         M("bZ"),
1405         M("bz")
1406     );
1407     test_lit!(
1408         sfx_cat4,
1409         suffixes,
1410         "[ab][yz]",
1411         M("ay"),
1412         M("az"),
1413         M("by"),
1414         M("bz")
1415     );
1416     test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1417     test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1418     test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1419     test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1420     test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1421     test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1422     test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1423     test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1424     test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1425     test_lit!(sfx_cat14, suffixes, "a^");
1426     test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1427     test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1428     test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1429     test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1430     test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1431 
1432     // Test regexes with alternations.
1433     test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1434     test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1435     test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1436     test_lit!(sfx_alt4, suffixes, "a|b*");
1437     test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1438     test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1439     test_lit!(
1440         sfx_alt7,
1441         suffixes,
1442         "(a|b)*c|(a|ab)*c",
1443         C("ac"),
1444         C("bc"),
1445         M("c"),
1446         C("ac"),
1447         C("abc"),
1448         M("c")
1449     );
1450     test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1451 
1452     // Test regexes with empty assertions.
1453     test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1454     test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1455 
1456     // Make sure some curious regexes have no suffixes.
1457     test_lit!(sfx_nothing1, suffixes, ".");
1458     test_lit!(sfx_nothing2, suffixes, "(?s).");
1459     test_lit!(sfx_nothing3, suffixes, "^");
1460     test_lit!(sfx_nothing4, suffixes, "$");
1461     test_lit!(sfx_nothing6, suffixes, "(?m)$");
1462     test_lit!(sfx_nothing7, suffixes, r"\b");
1463     test_lit!(sfx_nothing8, suffixes, r"\B");
1464 
1465     // Test a few regexes that defeat any suffix literal detection.
1466     test_lit!(sfx_defeated1, suffixes, "a.");
1467     test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1468     test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1469     test_lit!(sfx_defeated4, suffixes, "a|.");
1470     test_lit!(sfx_defeated5, suffixes, ".|a");
1471     test_lit!(sfx_defeated6, suffixes, "a|^");
1472     test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1473     test_lit!(sfx_defeated8, suffixes, "a^");
1474     test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1475     test_lit!(sfx_defeated10, suffixes, r"a\b");
1476     test_lit!(sfx_defeated11, suffixes, r"a\B");
1477     test_lit!(sfx_defeated12, suffixes, "a^*");
1478     test_lit!(sfx_defeated13, suffixes, "a^+");
1479 
1480     // These test use a much lower limit than the default so that we can
1481     // write test cases of reasonable size.
1482     test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1483     test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1484     test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1485     test_exhausted!(
1486         sfx_exhausted4,
1487         suffixes,
1488         "(?i-u)foobar",
1489         C("AR"),
1490         C("Ar"),
1491         C("aR"),
1492         C("ar")
1493     );
1494     test_exhausted!(
1495         sfx_exhausted5,
1496         suffixes,
1497         "(?:ab){100}",
1498         C("abababababababababab")
1499     );
1500     test_exhausted!(
1501         sfx_exhausted6,
1502         suffixes,
1503         "cd(?:(?:ab){100})*",
1504         C("ababababab"),
1505         M("cd")
1506     );
1507     test_exhausted!(
1508         sfx_exhausted7,
1509         suffixes,
1510         "cd(?:(?:ab){100})*z",
1511         C("abababababz"),
1512         M("cdz")
1513     );
1514     test_exhausted!(
1515         sfx_exhausted8,
1516         suffixes,
1517         "zaaaaaaaaaaaaaaaaaaaa",
1518         C("aaaaaaaaaaaaaaaaaaaa")
1519     );
1520 
1521     // ************************************************************************
1522     // Tests for generating unambiguous literal sets.
1523     // ************************************************************************
1524 
1525     macro_rules! test_unamb {
1526         ($name:ident, $given:expr, $expected:expr) => {
1527             #[test]
1528             fn $name() {
1529                 let given: Vec<Literal> = $given
1530                     .into_iter()
1531                     .map(|ul| {
1532                         let cut = ul.is_cut();
1533                         Literal { v: ul.v.into_bytes(), cut: cut }
1534                     })
1535                     .collect();
1536                 let lits = create_lits(given);
1537                 let got = lits.unambiguous_prefixes();
1538                 assert_eq!($expected, escape_lits(got.literals()));
1539             }
1540         };
1541     }
1542 
1543     test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1544     test_unamb!(
1545         unambiguous2,
1546         vec![M("zaaaaaa"), M("aa")],
1547         vec![C("aa"), C("z")]
1548     );
1549     test_unamb!(
1550         unambiguous3,
1551         vec![M("Sherlock"), M("Watson")],
1552         vec![M("Sherlock"), M("Watson")]
1553     );
1554     test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1555     test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1556     test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1557     test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1558     test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1559     test_unamb!(
1560         unambiguous9,
1561         vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1562         vec![C("a"), C("b"), C("c")]
1563     );
1564     test_unamb!(
1565         unambiguous10,
1566         vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1567         vec![C("Mo"), C("Mu")]
1568     );
1569     test_unamb!(
1570         unambiguous11,
1571         vec![M("zazb"), M("azb")],
1572         vec![C("a"), C("z")]
1573     );
1574     test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1575     test_unamb!(
1576         unambiguous13,
1577         vec![M("ABCX"), M("CDAX"), M("BCX")],
1578         vec![C("A"), C("BCX"), C("CD")]
1579     );
1580     test_unamb!(
1581         unambiguous14,
1582         vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1583         vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1584     );
1585     test_unamb!(
1586         unambiguous15,
1587         vec![M("IMG_"), M("MG_"), M("CIMG")],
1588         vec![C("C"), C("I"), C("MG_")]
1589     );
1590 
1591     // ************************************************************************
1592     // Tests for suffix trimming.
1593     // ************************************************************************
1594     macro_rules! test_trim {
1595         ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1596             #[test]
1597             fn $name() {
1598                 let given: Vec<Literal> = $given
1599                     .into_iter()
1600                     .map(|ul| {
1601                         let cut = ul.is_cut();
1602                         Literal { v: ul.v.into_bytes(), cut: cut }
1603                     })
1604                     .collect();
1605                 let lits = create_lits(given);
1606                 let got = lits.trim_suffix($trim).unwrap();
1607                 assert_eq!($expected, escape_lits(got.literals()));
1608             }
1609         };
1610     }
1611 
1612     test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1613     test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1614     test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1615     test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1616 
1617     // ************************************************************************
1618     // Tests for longest common prefix.
1619     // ************************************************************************
1620 
1621     macro_rules! test_lcp {
1622         ($name:ident, $given:expr, $expected:expr) => {
1623             #[test]
1624             fn $name() {
1625                 let given: Vec<Literal> = $given
1626                     .into_iter()
1627                     .map(|s: &str| Literal {
1628                         v: s.to_owned().into_bytes(),
1629                         cut: false,
1630                     })
1631                     .collect();
1632                 let lits = create_lits(given);
1633                 let got = lits.longest_common_prefix();
1634                 assert_eq!($expected, escape_bytes(got));
1635             }
1636         };
1637     }
1638 
1639     test_lcp!(lcp1, vec!["a"], "a");
1640     test_lcp!(lcp2, vec![], "");
1641     test_lcp!(lcp3, vec!["a", "b"], "");
1642     test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1643     test_lcp!(lcp5, vec!["ab", "a"], "a");
1644     test_lcp!(lcp6, vec!["a", "ab"], "a");
1645     test_lcp!(lcp7, vec!["ab", "b"], "");
1646     test_lcp!(lcp8, vec!["b", "ab"], "");
1647     test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1648     test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1649     test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1650     test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1651 
1652     // ************************************************************************
1653     // Tests for longest common suffix.
1654     // ************************************************************************
1655 
1656     macro_rules! test_lcs {
1657         ($name:ident, $given:expr, $expected:expr) => {
1658             #[test]
1659             fn $name() {
1660                 let given: Vec<Literal> = $given
1661                     .into_iter()
1662                     .map(|s: &str| Literal {
1663                         v: s.to_owned().into_bytes(),
1664                         cut: false,
1665                     })
1666                     .collect();
1667                 let lits = create_lits(given);
1668                 let got = lits.longest_common_suffix();
1669                 assert_eq!($expected, escape_bytes(got));
1670             }
1671         };
1672     }
1673 
1674     test_lcs!(lcs1, vec!["a"], "a");
1675     test_lcs!(lcs2, vec![], "");
1676     test_lcs!(lcs3, vec!["a", "b"], "");
1677     test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1678     test_lcs!(lcs5, vec!["ab", "a"], "");
1679     test_lcs!(lcs6, vec!["a", "ab"], "");
1680     test_lcs!(lcs7, vec!["ab", "b"], "b");
1681     test_lcs!(lcs8, vec!["b", "ab"], "b");
1682     test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1683     test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1684     test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1685     test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1686 }
1687