1 /*!
2 Provides routines for extracting literal prefixes and suffixes from an `Hir`.
3 */
4
5 use std::cmp;
6 use std::fmt;
7 use std::iter;
8 use std::mem;
9 use std::ops;
10
11 use crate::hir::{self, Hir, HirKind};
12
13 /// A set of literal byte strings extracted from a regular expression.
14 ///
15 /// Every member of the set is a `Literal`, which is represented by a
16 /// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
17 /// said to be either *complete* or *cut*. A complete literal means that
18 /// it extends until the beginning (or end) of the regular expression. In
19 /// some circumstances, this can be used to indicate a match in the regular
20 /// expression.
21 ///
22 /// A key aspect of literal extraction is knowing when to stop. It is not
23 /// feasible to blindly extract all literals from a regular expression, even if
24 /// there are finitely many. For example, the regular expression `[0-9]{10}`
25 /// has `10^10` distinct literals. For this reason, literal extraction is
26 /// bounded to some low number by default using heuristics, but the limits can
27 /// be tweaked.
28 ///
29 /// **WARNING**: Literal extraction uses stack space proportional to the size
30 /// of the `Hir` expression. At some point, this drawback will be eliminated.
31 /// To protect yourself, set a reasonable
32 /// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
33 /// This is done for you by default.
34 #[derive(Clone, Eq, PartialEq)]
35 pub struct Literals {
36 lits: Vec<Literal>,
37 limit_size: usize,
38 limit_class: usize,
39 }
40
41 /// A single member of a set of literals extracted from a regular expression.
42 ///
43 /// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
44 /// and `Vec` operations are available.
45 #[derive(Clone, Eq, Ord)]
46 pub struct Literal {
47 v: Vec<u8>,
48 cut: bool,
49 }
50
51 impl Literals {
52 /// Returns a new empty set of literals using default limits.
empty() -> Literals53 pub fn empty() -> Literals {
54 Literals { lits: vec![], limit_size: 250, limit_class: 10 }
55 }
56
57 /// Returns a set of literal prefixes extracted from the given `Hir`.
prefixes(expr: &Hir) -> Literals58 pub fn prefixes(expr: &Hir) -> Literals {
59 let mut lits = Literals::empty();
60 lits.union_prefixes(expr);
61 lits
62 }
63
64 /// Returns a set of literal suffixes extracted from the given `Hir`.
suffixes(expr: &Hir) -> Literals65 pub fn suffixes(expr: &Hir) -> Literals {
66 let mut lits = Literals::empty();
67 lits.union_suffixes(expr);
68 lits
69 }
70
71 /// Get the approximate size limit (in bytes) of this set.
limit_size(&self) -> usize72 pub fn limit_size(&self) -> usize {
73 self.limit_size
74 }
75
76 /// Set the approximate size limit (in bytes) of this set.
77 ///
78 /// If extracting a literal would put the set over this limit, then
79 /// extraction stops.
80 ///
81 /// The new limits will only apply to additions to this set. Existing
82 /// members remain unchanged, even if the set exceeds the new limit.
set_limit_size(&mut self, size: usize) -> &mut Literals83 pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
84 self.limit_size = size;
85 self
86 }
87
88 /// Get the character class size limit for this set.
limit_class(&self) -> usize89 pub fn limit_class(&self) -> usize {
90 self.limit_class
91 }
92
93 /// Limits the size of character(or byte) classes considered.
94 ///
95 /// A value of `0` prevents all character classes from being considered.
96 ///
97 /// This limit also applies to case insensitive literals, since each
98 /// character in the case insensitive literal is converted to a class, and
99 /// then case folded.
100 ///
101 /// The new limits will only apply to additions to this set. Existing
102 /// members remain unchanged, even if the set exceeds the new limit.
set_limit_class(&mut self, size: usize) -> &mut Literals103 pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
104 self.limit_class = size;
105 self
106 }
107
108 /// Returns the set of literals as a slice. Its order is unspecified.
literals(&self) -> &[Literal]109 pub fn literals(&self) -> &[Literal] {
110 &self.lits
111 }
112
113 /// Returns the length of the smallest literal.
114 ///
115 /// Returns None is there are no literals in the set.
min_len(&self) -> Option<usize>116 pub fn min_len(&self) -> Option<usize> {
117 let mut min = None;
118 for lit in &self.lits {
119 match min {
120 None => min = Some(lit.len()),
121 Some(m) if lit.len() < m => min = Some(lit.len()),
122 _ => {}
123 }
124 }
125 min
126 }
127
128 /// Returns true if all members in this set are complete.
all_complete(&self) -> bool129 pub fn all_complete(&self) -> bool {
130 !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
131 }
132
133 /// Returns true if any member in this set is complete.
any_complete(&self) -> bool134 pub fn any_complete(&self) -> bool {
135 self.lits.iter().any(|lit| !lit.is_cut())
136 }
137
138 /// Returns true if this set contains an empty literal.
contains_empty(&self) -> bool139 pub fn contains_empty(&self) -> bool {
140 self.lits.iter().any(|lit| lit.is_empty())
141 }
142
143 /// Returns true if this set is empty or if all of its members is empty.
is_empty(&self) -> bool144 pub fn is_empty(&self) -> bool {
145 self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
146 }
147
148 /// Returns a new empty set of literals using this set's limits.
to_empty(&self) -> Literals149 pub fn to_empty(&self) -> Literals {
150 let mut lits = Literals::empty();
151 lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
152 lits
153 }
154
155 /// Returns the longest common prefix of all members in this set.
longest_common_prefix(&self) -> &[u8]156 pub fn longest_common_prefix(&self) -> &[u8] {
157 if self.is_empty() {
158 return &[];
159 }
160 let lit0 = &*self.lits[0];
161 let mut len = lit0.len();
162 for lit in &self.lits[1..] {
163 len = cmp::min(
164 len,
165 lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
166 );
167 }
168 &self.lits[0][..len]
169 }
170
171 /// Returns the longest common suffix of all members in this set.
longest_common_suffix(&self) -> &[u8]172 pub fn longest_common_suffix(&self) -> &[u8] {
173 if self.is_empty() {
174 return &[];
175 }
176 let lit0 = &*self.lits[0];
177 let mut len = lit0.len();
178 for lit in &self.lits[1..] {
179 len = cmp::min(
180 len,
181 lit.iter()
182 .rev()
183 .zip(lit0.iter().rev())
184 .take_while(|&(a, b)| a == b)
185 .count(),
186 );
187 }
188 &self.lits[0][self.lits[0].len() - len..]
189 }
190
191 /// Returns a new set of literals with the given number of bytes trimmed
192 /// from the suffix of each literal.
193 ///
194 /// If any literal would be cut out completely by trimming, then None is
195 /// returned.
196 ///
197 /// Any duplicates that are created as a result of this transformation are
198 /// removed.
trim_suffix(&self, num_bytes: usize) -> Option<Literals>199 pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
200 if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
201 return None;
202 }
203 let mut new = self.to_empty();
204 for mut lit in self.lits.iter().cloned() {
205 let new_len = lit.len() - num_bytes;
206 lit.truncate(new_len);
207 lit.cut();
208 new.lits.push(lit);
209 }
210 new.lits.sort();
211 new.lits.dedup();
212 Some(new)
213 }
214
215 /// Returns a new set of prefixes of this set of literals that are
216 /// guaranteed to be unambiguous.
217 ///
218 /// Any substring match with a member of the set is returned is guaranteed
219 /// to never overlap with a substring match of another member of the set
220 /// at the same starting position.
221 ///
222 /// Given any two members of the returned set, neither is a substring of
223 /// the other.
unambiguous_prefixes(&self) -> Literals224 pub fn unambiguous_prefixes(&self) -> Literals {
225 if self.lits.is_empty() {
226 return self.to_empty();
227 }
228 let mut old = self.lits.to_vec();
229 let mut new = self.to_empty();
230 'OUTER: while let Some(mut candidate) = old.pop() {
231 if candidate.is_empty() {
232 continue;
233 }
234 if new.lits.is_empty() {
235 new.lits.push(candidate);
236 continue;
237 }
238 for lit2 in &mut new.lits {
239 if lit2.is_empty() {
240 continue;
241 }
242 if &candidate == lit2 {
243 // If the literal is already in the set, then we can
244 // just drop it. But make sure that cut literals are
245 // infectious!
246 candidate.cut = candidate.cut || lit2.cut;
247 lit2.cut = candidate.cut;
248 continue 'OUTER;
249 }
250 if candidate.len() < lit2.len() {
251 if let Some(i) = position(&candidate, &lit2) {
252 candidate.cut();
253 let mut lit3 = lit2.clone();
254 lit3.truncate(i);
255 lit3.cut();
256 old.push(lit3);
257 lit2.clear();
258 }
259 } else if let Some(i) = position(&lit2, &candidate) {
260 lit2.cut();
261 let mut new_candidate = candidate.clone();
262 new_candidate.truncate(i);
263 new_candidate.cut();
264 old.push(new_candidate);
265 candidate.clear();
266 }
267 // Oops, the candidate is already represented in the set.
268 if candidate.is_empty() {
269 continue 'OUTER;
270 }
271 }
272 new.lits.push(candidate);
273 }
274 new.lits.retain(|lit| !lit.is_empty());
275 new.lits.sort();
276 new.lits.dedup();
277 new
278 }
279
280 /// Returns a new set of suffixes of this set of literals that are
281 /// guaranteed to be unambiguous.
282 ///
283 /// Any substring match with a member of the set is returned is guaranteed
284 /// to never overlap with a substring match of another member of the set
285 /// at the same ending position.
286 ///
287 /// Given any two members of the returned set, neither is a substring of
288 /// the other.
unambiguous_suffixes(&self) -> Literals289 pub fn unambiguous_suffixes(&self) -> Literals {
290 // This is a touch wasteful...
291 let mut lits = self.clone();
292 lits.reverse();
293 let mut unamb = lits.unambiguous_prefixes();
294 unamb.reverse();
295 unamb
296 }
297
298 /// Unions the prefixes from the given expression to this set.
299 ///
300 /// If prefixes could not be added (for example, this set would exceed its
301 /// size limits or the set of prefixes from `expr` includes the empty
302 /// string), then false is returned.
303 ///
304 /// Note that prefix literals extracted from `expr` are said to be complete
305 /// if and only if the literal extends from the beginning of `expr` to the
306 /// end of `expr`.
union_prefixes(&mut self, expr: &Hir) -> bool307 pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
308 let mut lits = self.to_empty();
309 prefixes(expr, &mut lits);
310 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
311 }
312
313 /// Unions the suffixes from the given expression to this set.
314 ///
315 /// If suffixes could not be added (for example, this set would exceed its
316 /// size limits or the set of suffixes from `expr` includes the empty
317 /// string), then false is returned.
318 ///
319 /// Note that prefix literals extracted from `expr` are said to be complete
320 /// if and only if the literal extends from the end of `expr` to the
321 /// beginning of `expr`.
union_suffixes(&mut self, expr: &Hir) -> bool322 pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
323 let mut lits = self.to_empty();
324 suffixes(expr, &mut lits);
325 lits.reverse();
326 !lits.is_empty() && !lits.contains_empty() && self.union(lits)
327 }
328
329 /// Unions this set with another set.
330 ///
331 /// If the union would cause the set to exceed its limits, then the union
332 /// is skipped and it returns false. Otherwise, if the union succeeds, it
333 /// returns true.
union(&mut self, lits: Literals) -> bool334 pub fn union(&mut self, lits: Literals) -> bool {
335 if self.num_bytes() + lits.num_bytes() > self.limit_size {
336 return false;
337 }
338 if lits.is_empty() {
339 self.lits.push(Literal::empty());
340 } else {
341 self.lits.extend(lits.lits);
342 }
343 true
344 }
345
346 /// Extends this set with another set.
347 ///
348 /// The set of literals is extended via a cross product.
349 ///
350 /// If a cross product would cause this set to exceed its limits, then the
351 /// cross product is skipped and it returns false. Otherwise, if the cross
352 /// product succeeds, it returns true.
cross_product(&mut self, lits: &Literals) -> bool353 pub fn cross_product(&mut self, lits: &Literals) -> bool {
354 if lits.is_empty() {
355 return true;
356 }
357 // Check that we make sure we stay in our limits.
358 let mut size_after;
359 if self.is_empty() || !self.any_complete() {
360 size_after = self.num_bytes();
361 for lits_lit in lits.literals() {
362 size_after += lits_lit.len();
363 }
364 } else {
365 size_after = self.lits.iter().fold(0, |accum, lit| {
366 accum + if lit.is_cut() { lit.len() } else { 0 }
367 });
368 for lits_lit in lits.literals() {
369 for self_lit in self.literals() {
370 if !self_lit.is_cut() {
371 size_after += self_lit.len() + lits_lit.len();
372 }
373 }
374 }
375 }
376 if size_after > self.limit_size {
377 return false;
378 }
379
380 let mut base = self.remove_complete();
381 if base.is_empty() {
382 base = vec![Literal::empty()];
383 }
384 for lits_lit in lits.literals() {
385 for mut self_lit in base.clone() {
386 self_lit.extend(&**lits_lit);
387 self_lit.cut = lits_lit.cut;
388 self.lits.push(self_lit);
389 }
390 }
391 true
392 }
393
394 /// Extends each literal in this set with the bytes given.
395 ///
396 /// If the set is empty, then the given literal is added to the set.
397 ///
398 /// If adding any number of bytes to all members of this set causes a limit
399 /// to be exceeded, then no bytes are added and false is returned. If a
400 /// prefix of `bytes` can be fit into this set, then it is used and all
401 /// resulting literals are cut.
cross_add(&mut self, bytes: &[u8]) -> bool402 pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
403 // N.B. This could be implemented by simply calling cross_product with
404 // a literal set containing just `bytes`, but we can be smarter about
405 // taking shorter prefixes of `bytes` if they'll fit.
406 if bytes.is_empty() {
407 return true;
408 }
409 if self.lits.is_empty() {
410 let i = cmp::min(self.limit_size, bytes.len());
411 self.lits.push(Literal::new(bytes[..i].to_owned()));
412 self.lits[0].cut = i < bytes.len();
413 return !self.lits[0].is_cut();
414 }
415 let size = self.num_bytes();
416 if size + self.lits.len() >= self.limit_size {
417 return false;
418 }
419 let mut i = 1;
420 while size + (i * self.lits.len()) <= self.limit_size
421 && i < bytes.len()
422 {
423 i += 1;
424 }
425 for lit in &mut self.lits {
426 if !lit.is_cut() {
427 lit.extend(&bytes[..i]);
428 if i < bytes.len() {
429 lit.cut();
430 }
431 }
432 }
433 true
434 }
435
436 /// Adds the given literal to this set.
437 ///
438 /// Returns false if adding this literal would cause the class to be too
439 /// big.
add(&mut self, lit: Literal) -> bool440 pub fn add(&mut self, lit: Literal) -> bool {
441 if self.num_bytes() + lit.len() > self.limit_size {
442 return false;
443 }
444 self.lits.push(lit);
445 true
446 }
447
448 /// Extends each literal in this set with the character class given.
449 ///
450 /// Returns false if the character class was too big to add.
add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool451 pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
452 self._add_char_class(cls, false)
453 }
454
455 /// Extends each literal in this set with the character class given,
456 /// writing the bytes of each character in reverse.
457 ///
458 /// Returns false if the character class was too big to add.
add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool459 fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
460 self._add_char_class(cls, true)
461 }
462
_add_char_class( &mut self, cls: &hir::ClassUnicode, reverse: bool, ) -> bool463 fn _add_char_class(
464 &mut self,
465 cls: &hir::ClassUnicode,
466 reverse: bool,
467 ) -> bool {
468 use std::char;
469
470 if self.class_exceeds_limits(cls_char_count(cls)) {
471 return false;
472 }
473 let mut base = self.remove_complete();
474 if base.is_empty() {
475 base = vec![Literal::empty()];
476 }
477 for r in cls.iter() {
478 let (s, e) = (r.start as u32, r.end as u32 + 1);
479 for c in (s..e).filter_map(char::from_u32) {
480 for mut lit in base.clone() {
481 let mut bytes = c.to_string().into_bytes();
482 if reverse {
483 bytes.reverse();
484 }
485 lit.extend(&bytes);
486 self.lits.push(lit);
487 }
488 }
489 }
490 true
491 }
492
493 /// Extends each literal in this set with the byte class given.
494 ///
495 /// Returns false if the byte class was too big to add.
add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool496 pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
497 if self.class_exceeds_limits(cls_byte_count(cls)) {
498 return false;
499 }
500 let mut base = self.remove_complete();
501 if base.is_empty() {
502 base = vec![Literal::empty()];
503 }
504 for r in cls.iter() {
505 let (s, e) = (r.start as u32, r.end as u32 + 1);
506 for b in (s..e).map(|b| b as u8) {
507 for mut lit in base.clone() {
508 lit.push(b);
509 self.lits.push(lit);
510 }
511 }
512 }
513 true
514 }
515
516 /// Cuts every member of this set. When a member is cut, it can never
517 /// be extended.
cut(&mut self)518 pub fn cut(&mut self) {
519 for lit in &mut self.lits {
520 lit.cut();
521 }
522 }
523
524 /// Reverses all members in place.
reverse(&mut self)525 pub fn reverse(&mut self) {
526 for lit in &mut self.lits {
527 lit.reverse();
528 }
529 }
530
531 /// Clears this set of all members.
clear(&mut self)532 pub fn clear(&mut self) {
533 self.lits.clear();
534 }
535
536 /// Pops all complete literals out of this set.
remove_complete(&mut self) -> Vec<Literal>537 fn remove_complete(&mut self) -> Vec<Literal> {
538 let mut base = vec![];
539 for lit in mem::replace(&mut self.lits, vec![]) {
540 if lit.is_cut() {
541 self.lits.push(lit);
542 } else {
543 base.push(lit);
544 }
545 }
546 base
547 }
548
549 /// Returns the total number of bytes in this set.
num_bytes(&self) -> usize550 fn num_bytes(&self) -> usize {
551 self.lits.iter().fold(0, |accum, lit| accum + lit.len())
552 }
553
554 /// Returns true if a character class with the given size would cause this
555 /// set to exceed its limits.
556 ///
557 /// The size given should correspond to the number of items in the class.
class_exceeds_limits(&self, size: usize) -> bool558 fn class_exceeds_limits(&self, size: usize) -> bool {
559 if size > self.limit_class {
560 return true;
561 }
562 // This is an approximation since codepoints in a char class can encode
563 // to 1-4 bytes.
564 let new_byte_count = if self.lits.is_empty() {
565 size
566 } else {
567 self.lits.iter().fold(0, |accum, lit| {
568 accum
569 + if lit.is_cut() {
570 // If the literal is cut, then we'll never add
571 // anything to it, so don't count it.
572 0
573 } else {
574 (lit.len() + 1) * size
575 }
576 })
577 };
578 new_byte_count > self.limit_size
579 }
580 }
581
prefixes(expr: &Hir, lits: &mut Literals)582 fn prefixes(expr: &Hir, lits: &mut Literals) {
583 match *expr.kind() {
584 HirKind::Literal(hir::Literal::Unicode(c)) => {
585 let mut buf = [0; 4];
586 lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
587 }
588 HirKind::Literal(hir::Literal::Byte(b)) => {
589 lits.cross_add(&[b]);
590 }
591 HirKind::Class(hir::Class::Unicode(ref cls)) => {
592 if !lits.add_char_class(cls) {
593 lits.cut();
594 }
595 }
596 HirKind::Class(hir::Class::Bytes(ref cls)) => {
597 if !lits.add_byte_class(cls) {
598 lits.cut();
599 }
600 }
601 HirKind::Group(hir::Group { ref hir, .. }) => {
602 prefixes(&**hir, lits);
603 }
604 HirKind::Repetition(ref x) => match x.kind {
605 hir::RepetitionKind::ZeroOrOne => {
606 repeat_zero_or_one_literals(&x.hir, lits, prefixes);
607 }
608 hir::RepetitionKind::ZeroOrMore => {
609 repeat_zero_or_more_literals(&x.hir, lits, prefixes);
610 }
611 hir::RepetitionKind::OneOrMore => {
612 repeat_one_or_more_literals(&x.hir, lits, prefixes);
613 }
614 hir::RepetitionKind::Range(ref rng) => {
615 let (min, max) = match *rng {
616 hir::RepetitionRange::Exactly(m) => (m, Some(m)),
617 hir::RepetitionRange::AtLeast(m) => (m, None),
618 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
619 };
620 repeat_range_literals(
621 &x.hir, min, max, x.greedy, lits, prefixes,
622 )
623 }
624 },
625 HirKind::Concat(ref es) if es.is_empty() => {}
626 HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
627 HirKind::Concat(ref es) => {
628 for e in es {
629 if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
630 if !lits.is_empty() {
631 lits.cut();
632 break;
633 }
634 lits.add(Literal::empty());
635 continue;
636 }
637 let mut lits2 = lits.to_empty();
638 prefixes(e, &mut lits2);
639 if !lits.cross_product(&lits2) || !lits2.any_complete() {
640 // If this expression couldn't yield any literal that
641 // could be extended, then we need to quit. Since we're
642 // short-circuiting, we also need to freeze every member.
643 lits.cut();
644 break;
645 }
646 }
647 }
648 HirKind::Alternation(ref es) => {
649 alternate_literals(es, lits, prefixes);
650 }
651 _ => lits.cut(),
652 }
653 }
654
suffixes(expr: &Hir, lits: &mut Literals)655 fn suffixes(expr: &Hir, lits: &mut Literals) {
656 match *expr.kind() {
657 HirKind::Literal(hir::Literal::Unicode(c)) => {
658 let mut buf = [0u8; 4];
659 let i = c.encode_utf8(&mut buf).len();
660 let buf = &mut buf[..i];
661 buf.reverse();
662 lits.cross_add(buf);
663 }
664 HirKind::Literal(hir::Literal::Byte(b)) => {
665 lits.cross_add(&[b]);
666 }
667 HirKind::Class(hir::Class::Unicode(ref cls)) => {
668 if !lits.add_char_class_reverse(cls) {
669 lits.cut();
670 }
671 }
672 HirKind::Class(hir::Class::Bytes(ref cls)) => {
673 if !lits.add_byte_class(cls) {
674 lits.cut();
675 }
676 }
677 HirKind::Group(hir::Group { ref hir, .. }) => {
678 suffixes(&**hir, lits);
679 }
680 HirKind::Repetition(ref x) => match x.kind {
681 hir::RepetitionKind::ZeroOrOne => {
682 repeat_zero_or_one_literals(&x.hir, lits, suffixes);
683 }
684 hir::RepetitionKind::ZeroOrMore => {
685 repeat_zero_or_more_literals(&x.hir, lits, suffixes);
686 }
687 hir::RepetitionKind::OneOrMore => {
688 repeat_one_or_more_literals(&x.hir, lits, suffixes);
689 }
690 hir::RepetitionKind::Range(ref rng) => {
691 let (min, max) = match *rng {
692 hir::RepetitionRange::Exactly(m) => (m, Some(m)),
693 hir::RepetitionRange::AtLeast(m) => (m, None),
694 hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
695 };
696 repeat_range_literals(
697 &x.hir, min, max, x.greedy, lits, suffixes,
698 )
699 }
700 },
701 HirKind::Concat(ref es) if es.is_empty() => {}
702 HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
703 HirKind::Concat(ref es) => {
704 for e in es.iter().rev() {
705 if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
706 if !lits.is_empty() {
707 lits.cut();
708 break;
709 }
710 lits.add(Literal::empty());
711 continue;
712 }
713 let mut lits2 = lits.to_empty();
714 suffixes(e, &mut lits2);
715 if !lits.cross_product(&lits2) || !lits2.any_complete() {
716 // If this expression couldn't yield any literal that
717 // could be extended, then we need to quit. Since we're
718 // short-circuiting, we also need to freeze every member.
719 lits.cut();
720 break;
721 }
722 }
723 }
724 HirKind::Alternation(ref es) => {
725 alternate_literals(es, lits, suffixes);
726 }
727 _ => lits.cut(),
728 }
729 }
730
repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )731 fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
732 e: &Hir,
733 lits: &mut Literals,
734 mut f: F,
735 ) {
736 f(
737 &Hir::repetition(hir::Repetition {
738 kind: hir::RepetitionKind::ZeroOrMore,
739 // FIXME: Our literal extraction doesn't care about greediness.
740 // Which is partially why we're treating 'e?' as 'e*'. Namely,
741 // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
742 // [Complete(a), Complete(ab)] because of the non-greediness.
743 greedy: true,
744 hir: Box::new(e.clone()),
745 }),
746 lits,
747 );
748 }
749
repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )750 fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
751 e: &Hir,
752 lits: &mut Literals,
753 mut f: F,
754 ) {
755 let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
756 lits3.set_limit_size(lits.limit_size() / 2);
757 f(e, &mut lits3);
758
759 if lits3.is_empty() || !lits2.cross_product(&lits3) {
760 lits.cut();
761 return;
762 }
763 lits2.cut();
764 lits2.add(Literal::empty());
765 if !lits.union(lits2) {
766 lits.cut();
767 }
768 }
769
repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, lits: &mut Literals, mut f: F, )770 fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
771 e: &Hir,
772 lits: &mut Literals,
773 mut f: F,
774 ) {
775 f(e, lits);
776 lits.cut();
777 }
778
repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( e: &Hir, min: u32, max: Option<u32>, greedy: bool, lits: &mut Literals, mut f: F, )779 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
780 e: &Hir,
781 min: u32,
782 max: Option<u32>,
783 greedy: bool,
784 lits: &mut Literals,
785 mut f: F,
786 ) {
787 if min == 0 {
788 // This is a bit conservative. If `max` is set, then we could
789 // treat this as a finite set of alternations. For now, we
790 // just treat it as `e*`.
791 f(
792 &Hir::repetition(hir::Repetition {
793 kind: hir::RepetitionKind::ZeroOrMore,
794 greedy,
795 hir: Box::new(e.clone()),
796 }),
797 lits,
798 );
799 } else {
800 if min > 0 {
801 let n = cmp::min(lits.limit_size, min as usize);
802 let es = iter::repeat(e.clone()).take(n).collect();
803 f(&Hir::concat(es), lits);
804 if n < min as usize || lits.contains_empty() {
805 lits.cut();
806 }
807 }
808 if max.map_or(true, |max| min < max) {
809 lits.cut();
810 }
811 }
812 }
813
alternate_literals<F: FnMut(&Hir, &mut Literals)>( es: &[Hir], lits: &mut Literals, mut f: F, )814 fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
815 es: &[Hir],
816 lits: &mut Literals,
817 mut f: F,
818 ) {
819 let mut lits2 = lits.to_empty();
820 for e in es {
821 let mut lits3 = lits.to_empty();
822 lits3.set_limit_size(lits.limit_size() / 5);
823 f(e, &mut lits3);
824 if lits3.is_empty() || !lits2.union(lits3) {
825 // If we couldn't find suffixes for *any* of the
826 // alternates, then the entire alternation has to be thrown
827 // away and any existing members must be frozen. Similarly,
828 // if the union couldn't complete, stop and freeze.
829 lits.cut();
830 return;
831 }
832 }
833 if !lits.cross_product(&lits2) {
834 lits.cut();
835 }
836 }
837
838 impl fmt::Debug for Literals {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result839 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
840 f.debug_struct("Literals")
841 .field("lits", &self.lits)
842 .field("limit_size", &self.limit_size)
843 .field("limit_class", &self.limit_class)
844 .finish()
845 }
846 }
847
848 impl Literal {
849 /// Returns a new complete literal with the bytes given.
new(bytes: Vec<u8>) -> Literal850 pub fn new(bytes: Vec<u8>) -> Literal {
851 Literal { v: bytes, cut: false }
852 }
853
854 /// Returns a new complete empty literal.
empty() -> Literal855 pub fn empty() -> Literal {
856 Literal { v: vec![], cut: false }
857 }
858
859 /// Returns true if this literal was "cut."
is_cut(&self) -> bool860 pub fn is_cut(&self) -> bool {
861 self.cut
862 }
863
864 /// Cuts this literal.
cut(&mut self)865 pub fn cut(&mut self) {
866 self.cut = true;
867 }
868 }
869
870 impl PartialEq for Literal {
eq(&self, other: &Literal) -> bool871 fn eq(&self, other: &Literal) -> bool {
872 self.v == other.v
873 }
874 }
875
876 impl PartialOrd for Literal {
partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering>877 fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
878 self.v.partial_cmp(&other.v)
879 }
880 }
881
882 impl fmt::Debug for Literal {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result883 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
884 if self.is_cut() {
885 write!(f, "Cut({})", escape_unicode(&self.v))
886 } else {
887 write!(f, "Complete({})", escape_unicode(&self.v))
888 }
889 }
890 }
891
892 impl AsRef<[u8]> for Literal {
as_ref(&self) -> &[u8]893 fn as_ref(&self) -> &[u8] {
894 &self.v
895 }
896 }
897
898 impl ops::Deref for Literal {
899 type Target = Vec<u8>;
deref(&self) -> &Vec<u8>900 fn deref(&self) -> &Vec<u8> {
901 &self.v
902 }
903 }
904
905 impl ops::DerefMut for Literal {
deref_mut(&mut self) -> &mut Vec<u8>906 fn deref_mut(&mut self) -> &mut Vec<u8> {
907 &mut self.v
908 }
909 }
910
position(needle: &[u8], mut haystack: &[u8]) -> Option<usize>911 fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
912 let mut i = 0;
913 while haystack.len() >= needle.len() {
914 if needle == &haystack[..needle.len()] {
915 return Some(i);
916 }
917 i += 1;
918 haystack = &haystack[1..];
919 }
920 None
921 }
922
escape_unicode(bytes: &[u8]) -> String923 fn escape_unicode(bytes: &[u8]) -> String {
924 let show = match ::std::str::from_utf8(bytes) {
925 Ok(v) => v.to_string(),
926 Err(_) => escape_bytes(bytes),
927 };
928 let mut space_escaped = String::new();
929 for c in show.chars() {
930 if c.is_whitespace() {
931 let escaped = if c as u32 <= 0x7F {
932 escape_byte(c as u8)
933 } else if c as u32 <= 0xFFFF {
934 format!(r"\u{{{:04x}}}", c as u32)
935 } else {
936 format!(r"\U{{{:08x}}}", c as u32)
937 };
938 space_escaped.push_str(&escaped);
939 } else {
940 space_escaped.push(c);
941 }
942 }
943 space_escaped
944 }
945
escape_bytes(bytes: &[u8]) -> String946 fn escape_bytes(bytes: &[u8]) -> String {
947 let mut s = String::new();
948 for &b in bytes {
949 s.push_str(&escape_byte(b));
950 }
951 s
952 }
953
escape_byte(byte: u8) -> String954 fn escape_byte(byte: u8) -> String {
955 use std::ascii::escape_default;
956
957 let escaped: Vec<u8> = escape_default(byte).collect();
958 String::from_utf8_lossy(&escaped).into_owned()
959 }
960
cls_char_count(cls: &hir::ClassUnicode) -> usize961 fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
962 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
963 as usize
964 }
965
cls_byte_count(cls: &hir::ClassBytes) -> usize966 fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
967 cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
968 as usize
969 }
970
971 #[cfg(test)]
972 mod tests {
973 use std::fmt;
974
975 use super::{escape_bytes, Literal, Literals};
976 use crate::hir::Hir;
977 use crate::ParserBuilder;
978
979 // To make test failures easier to read.
980 #[derive(Debug, Eq, PartialEq)]
981 struct Bytes(Vec<ULiteral>);
982 #[derive(Debug, Eq, PartialEq)]
983 struct Unicode(Vec<ULiteral>);
984
escape_lits(blits: &[Literal]) -> Vec<ULiteral>985 fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
986 let mut ulits = vec![];
987 for blit in blits {
988 ulits
989 .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
990 }
991 ulits
992 }
993
create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals994 fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
995 Literals {
996 lits: it.into_iter().collect(),
997 limit_size: 0,
998 limit_class: 0,
999 }
1000 }
1001
1002 // Needs to be pub for 1.3?
1003 #[derive(Clone, Eq, PartialEq)]
1004 pub struct ULiteral {
1005 v: String,
1006 cut: bool,
1007 }
1008
1009 impl ULiteral {
is_cut(&self) -> bool1010 fn is_cut(&self) -> bool {
1011 self.cut
1012 }
1013 }
1014
1015 impl fmt::Debug for ULiteral {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1016 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1017 if self.is_cut() {
1018 write!(f, "Cut({})", self.v)
1019 } else {
1020 write!(f, "Complete({})", self.v)
1021 }
1022 }
1023 }
1024
1025 impl PartialEq<Literal> for ULiteral {
eq(&self, other: &Literal) -> bool1026 fn eq(&self, other: &Literal) -> bool {
1027 self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1028 }
1029 }
1030
1031 impl PartialEq<ULiteral> for Literal {
eq(&self, other: &ULiteral) -> bool1032 fn eq(&self, other: &ULiteral) -> bool {
1033 &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1034 }
1035 }
1036
1037 #[allow(non_snake_case)]
C(s: &'static str) -> ULiteral1038 fn C(s: &'static str) -> ULiteral {
1039 ULiteral { v: s.to_owned(), cut: true }
1040 }
1041 #[allow(non_snake_case)]
M(s: &'static str) -> ULiteral1042 fn M(s: &'static str) -> ULiteral {
1043 ULiteral { v: s.to_owned(), cut: false }
1044 }
1045
prefixes(lits: &mut Literals, expr: &Hir)1046 fn prefixes(lits: &mut Literals, expr: &Hir) {
1047 lits.union_prefixes(expr);
1048 }
1049
suffixes(lits: &mut Literals, expr: &Hir)1050 fn suffixes(lits: &mut Literals, expr: &Hir) {
1051 lits.union_suffixes(expr);
1052 }
1053
1054 macro_rules! assert_lit_eq {
1055 ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1056 let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1057 let lits = $got_lits;
1058 assert_eq!(
1059 $which(expected.clone()),
1060 $which(escape_lits(lits.literals())));
1061 assert_eq!(
1062 !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1063 lits.all_complete());
1064 assert_eq!(
1065 expected.iter().any(|l| !l.is_cut()),
1066 lits.any_complete());
1067 }};
1068 }
1069
1070 macro_rules! test_lit {
1071 ($name:ident, $which:ident, $re:expr) => {
1072 test_lit!($name, $which, $re,);
1073 };
1074 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1075 #[test]
1076 fn $name() {
1077 let expr = ParserBuilder::new()
1078 .build()
1079 .parse($re)
1080 .unwrap();
1081 let lits = Literals::$which(&expr);
1082 assert_lit_eq!(Unicode, lits, $($lit),*);
1083
1084 let expr = ParserBuilder::new()
1085 .allow_invalid_utf8(true)
1086 .unicode(false)
1087 .build()
1088 .parse($re)
1089 .unwrap();
1090 let lits = Literals::$which(&expr);
1091 assert_lit_eq!(Bytes, lits, $($lit),*);
1092 }
1093 };
1094 }
1095
1096 // ************************************************************************
1097 // Tests for prefix literal extraction.
1098 // ************************************************************************
1099
1100 // Elementary tests.
1101 test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1102 test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1103 test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1104 #[cfg(feature = "unicode-case")]
1105 test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1106 test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1107 test_lit!(
1108 pfx_class2,
1109 prefixes,
1110 "(?u)[☃Ⅰ]",
1111 M("\\xe2\\x85\\xa0"),
1112 M("\\xe2\\x98\\x83")
1113 );
1114 #[cfg(feature = "unicode-case")]
1115 test_lit!(
1116 pfx_class3,
1117 prefixes,
1118 "(?ui)[☃Ⅰ]",
1119 M("\\xe2\\x85\\xa0"),
1120 M("\\xe2\\x85\\xb0"),
1121 M("\\xe2\\x98\\x83")
1122 );
1123 test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1124 test_lit!(
1125 pfx_one_lit_casei2,
1126 prefixes,
1127 "(?i-u)abc",
1128 M("ABC"),
1129 M("aBC"),
1130 M("AbC"),
1131 M("abC"),
1132 M("ABc"),
1133 M("aBc"),
1134 M("Abc"),
1135 M("abc")
1136 );
1137 test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1138 test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1139 test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1140 test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1141 // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1142 // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
1143 // a cut literal.
1144 test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
1145 test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1146 test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1147 test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1148 test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1149 test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1150 test_lit!(pfx_rep_range1, prefixes, "a{0}");
1151 test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1152 test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1153 test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1154 test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1155 test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1156 test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1157
1158 // Test regexes with concatenations.
1159 test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1160 test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1161 test_lit!(
1162 pfx_cat3,
1163 prefixes,
1164 "(?i-u)[ab]z",
1165 M("AZ"),
1166 M("BZ"),
1167 M("aZ"),
1168 M("bZ"),
1169 M("Az"),
1170 M("Bz"),
1171 M("az"),
1172 M("bz")
1173 );
1174 test_lit!(
1175 pfx_cat4,
1176 prefixes,
1177 "[ab][yz]",
1178 M("ay"),
1179 M("by"),
1180 M("az"),
1181 M("bz")
1182 );
1183 test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1184 test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1185 test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1186 test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1187 test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1188 test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1189 test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1190 test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1191 test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1192 test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1193 test_lit!(pfx_cat15, prefixes, "$a");
1194 test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1195 test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1196 test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1197 test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1198
1199 // Test regexes with alternations.
1200 test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1201 test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1202 test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1203 test_lit!(pfx_alt4, prefixes, "a|b*");
1204 test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1205 test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1206 test_lit!(
1207 pfx_alt7,
1208 prefixes,
1209 "(a|b)*c|(a|ab)*c",
1210 C("a"),
1211 C("b"),
1212 M("c"),
1213 C("a"),
1214 C("ab"),
1215 M("c")
1216 );
1217 test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1218
1219 // Test regexes with empty assertions.
1220 test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1221 test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1222 test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1223 test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1224
1225 // Make sure some curious regexes have no prefixes.
1226 test_lit!(pfx_nothing1, prefixes, ".");
1227 test_lit!(pfx_nothing2, prefixes, "(?s).");
1228 test_lit!(pfx_nothing3, prefixes, "^");
1229 test_lit!(pfx_nothing4, prefixes, "$");
1230 test_lit!(pfx_nothing6, prefixes, "(?m)$");
1231 test_lit!(pfx_nothing7, prefixes, r"\b");
1232 test_lit!(pfx_nothing8, prefixes, r"\B");
1233
1234 // Test a few regexes that defeat any prefix literal detection.
1235 test_lit!(pfx_defeated1, prefixes, ".a");
1236 test_lit!(pfx_defeated2, prefixes, "(?s).a");
1237 test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1238 test_lit!(pfx_defeated4, prefixes, "a|.");
1239 test_lit!(pfx_defeated5, prefixes, ".|a");
1240 test_lit!(pfx_defeated6, prefixes, "a|^");
1241 test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1242 test_lit!(pfx_defeated8, prefixes, "$a");
1243 test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1244 test_lit!(pfx_defeated10, prefixes, r"\ba");
1245 test_lit!(pfx_defeated11, prefixes, r"\Ba");
1246 test_lit!(pfx_defeated12, prefixes, "^*a");
1247 test_lit!(pfx_defeated13, prefixes, "^+a");
1248
1249 test_lit!(
1250 pfx_crazy1,
1251 prefixes,
1252 r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1253 C("Mo\\'"),
1254 C("Mu\\'"),
1255 C("Moam"),
1256 C("Muam")
1257 );
1258
1259 // ************************************************************************
1260 // Tests for quiting prefix literal search.
1261 // ************************************************************************
1262
1263 macro_rules! test_exhausted {
1264 ($name:ident, $which:ident, $re:expr) => {
1265 test_exhausted!($name, $which, $re,);
1266 };
1267 ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1268 #[test]
1269 fn $name() {
1270 let expr = ParserBuilder::new()
1271 .build()
1272 .parse($re)
1273 .unwrap();
1274 let mut lits = Literals::empty();
1275 lits.set_limit_size(20).set_limit_class(10);
1276 $which(&mut lits, &expr);
1277 assert_lit_eq!(Unicode, lits, $($lit),*);
1278
1279 let expr = ParserBuilder::new()
1280 .allow_invalid_utf8(true)
1281 .unicode(false)
1282 .build()
1283 .parse($re)
1284 .unwrap();
1285 let mut lits = Literals::empty();
1286 lits.set_limit_size(20).set_limit_class(10);
1287 $which(&mut lits, &expr);
1288 assert_lit_eq!(Bytes, lits, $($lit),*);
1289 }
1290 };
1291 }
1292
1293 // These test use a much lower limit than the default so that we can
1294 // write test cases of reasonable size.
1295 test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1296 test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1297 test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1298 test_exhausted!(
1299 pfx_exhausted4,
1300 prefixes,
1301 "(?i-u)foobar",
1302 C("FO"),
1303 C("fO"),
1304 C("Fo"),
1305 C("fo")
1306 );
1307 test_exhausted!(
1308 pfx_exhausted5,
1309 prefixes,
1310 "(?:ab){100}",
1311 C("abababababababababab")
1312 );
1313 test_exhausted!(
1314 pfx_exhausted6,
1315 prefixes,
1316 "(?:(?:ab){100})*cd",
1317 C("ababababab"),
1318 M("cd")
1319 );
1320 test_exhausted!(
1321 pfx_exhausted7,
1322 prefixes,
1323 "z(?:(?:ab){100})*cd",
1324 C("zababababab"),
1325 M("zcd")
1326 );
1327 test_exhausted!(
1328 pfx_exhausted8,
1329 prefixes,
1330 "aaaaaaaaaaaaaaaaaaaaz",
1331 C("aaaaaaaaaaaaaaaaaaaa")
1332 );
1333
1334 // ************************************************************************
1335 // Tests for suffix literal extraction.
1336 // ************************************************************************
1337
1338 // Elementary tests.
1339 test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1340 test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1341 test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1342 #[cfg(feature = "unicode-case")]
1343 test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1344 test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1345 test_lit!(
1346 sfx_class2,
1347 suffixes,
1348 "(?u)[☃Ⅰ]",
1349 M("\\xe2\\x85\\xa0"),
1350 M("\\xe2\\x98\\x83")
1351 );
1352 #[cfg(feature = "unicode-case")]
1353 test_lit!(
1354 sfx_class3,
1355 suffixes,
1356 "(?ui)[☃Ⅰ]",
1357 M("\\xe2\\x85\\xa0"),
1358 M("\\xe2\\x85\\xb0"),
1359 M("\\xe2\\x98\\x83")
1360 );
1361 test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1362 test_lit!(
1363 sfx_one_lit_casei2,
1364 suffixes,
1365 "(?i-u)abc",
1366 M("ABC"),
1367 M("ABc"),
1368 M("AbC"),
1369 M("Abc"),
1370 M("aBC"),
1371 M("aBc"),
1372 M("abC"),
1373 M("abc")
1374 );
1375 test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1376 test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1377 test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1378 test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1379 test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1380 test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1381 test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1382 test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1383 test_lit!(sfx_rep_range1, suffixes, "a{0}");
1384 test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1385 test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1386 test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1387 test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1388 test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1389 test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1390
1391 // Test regexes with concatenations.
1392 test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1393 test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1394 test_lit!(
1395 sfx_cat3,
1396 suffixes,
1397 "(?i-u)[ab]z",
1398 M("AZ"),
1399 M("Az"),
1400 M("BZ"),
1401 M("Bz"),
1402 M("aZ"),
1403 M("az"),
1404 M("bZ"),
1405 M("bz")
1406 );
1407 test_lit!(
1408 sfx_cat4,
1409 suffixes,
1410 "[ab][yz]",
1411 M("ay"),
1412 M("az"),
1413 M("by"),
1414 M("bz")
1415 );
1416 test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1417 test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1418 test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1419 test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1420 test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1421 test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1422 test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1423 test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1424 test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1425 test_lit!(sfx_cat14, suffixes, "a^");
1426 test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1427 test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1428 test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1429 test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1430 test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1431
1432 // Test regexes with alternations.
1433 test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1434 test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1435 test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1436 test_lit!(sfx_alt4, suffixes, "a|b*");
1437 test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1438 test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1439 test_lit!(
1440 sfx_alt7,
1441 suffixes,
1442 "(a|b)*c|(a|ab)*c",
1443 C("ac"),
1444 C("bc"),
1445 M("c"),
1446 C("ac"),
1447 C("abc"),
1448 M("c")
1449 );
1450 test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1451
1452 // Test regexes with empty assertions.
1453 test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1454 test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1455
1456 // Make sure some curious regexes have no suffixes.
1457 test_lit!(sfx_nothing1, suffixes, ".");
1458 test_lit!(sfx_nothing2, suffixes, "(?s).");
1459 test_lit!(sfx_nothing3, suffixes, "^");
1460 test_lit!(sfx_nothing4, suffixes, "$");
1461 test_lit!(sfx_nothing6, suffixes, "(?m)$");
1462 test_lit!(sfx_nothing7, suffixes, r"\b");
1463 test_lit!(sfx_nothing8, suffixes, r"\B");
1464
1465 // Test a few regexes that defeat any suffix literal detection.
1466 test_lit!(sfx_defeated1, suffixes, "a.");
1467 test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1468 test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1469 test_lit!(sfx_defeated4, suffixes, "a|.");
1470 test_lit!(sfx_defeated5, suffixes, ".|a");
1471 test_lit!(sfx_defeated6, suffixes, "a|^");
1472 test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1473 test_lit!(sfx_defeated8, suffixes, "a^");
1474 test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1475 test_lit!(sfx_defeated10, suffixes, r"a\b");
1476 test_lit!(sfx_defeated11, suffixes, r"a\B");
1477 test_lit!(sfx_defeated12, suffixes, "a^*");
1478 test_lit!(sfx_defeated13, suffixes, "a^+");
1479
1480 // These test use a much lower limit than the default so that we can
1481 // write test cases of reasonable size.
1482 test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1483 test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1484 test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1485 test_exhausted!(
1486 sfx_exhausted4,
1487 suffixes,
1488 "(?i-u)foobar",
1489 C("AR"),
1490 C("Ar"),
1491 C("aR"),
1492 C("ar")
1493 );
1494 test_exhausted!(
1495 sfx_exhausted5,
1496 suffixes,
1497 "(?:ab){100}",
1498 C("abababababababababab")
1499 );
1500 test_exhausted!(
1501 sfx_exhausted6,
1502 suffixes,
1503 "cd(?:(?:ab){100})*",
1504 C("ababababab"),
1505 M("cd")
1506 );
1507 test_exhausted!(
1508 sfx_exhausted7,
1509 suffixes,
1510 "cd(?:(?:ab){100})*z",
1511 C("abababababz"),
1512 M("cdz")
1513 );
1514 test_exhausted!(
1515 sfx_exhausted8,
1516 suffixes,
1517 "zaaaaaaaaaaaaaaaaaaaa",
1518 C("aaaaaaaaaaaaaaaaaaaa")
1519 );
1520
1521 // ************************************************************************
1522 // Tests for generating unambiguous literal sets.
1523 // ************************************************************************
1524
1525 macro_rules! test_unamb {
1526 ($name:ident, $given:expr, $expected:expr) => {
1527 #[test]
1528 fn $name() {
1529 let given: Vec<Literal> = $given
1530 .into_iter()
1531 .map(|ul| {
1532 let cut = ul.is_cut();
1533 Literal { v: ul.v.into_bytes(), cut: cut }
1534 })
1535 .collect();
1536 let lits = create_lits(given);
1537 let got = lits.unambiguous_prefixes();
1538 assert_eq!($expected, escape_lits(got.literals()));
1539 }
1540 };
1541 }
1542
1543 test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1544 test_unamb!(
1545 unambiguous2,
1546 vec![M("zaaaaaa"), M("aa")],
1547 vec![C("aa"), C("z")]
1548 );
1549 test_unamb!(
1550 unambiguous3,
1551 vec![M("Sherlock"), M("Watson")],
1552 vec![M("Sherlock"), M("Watson")]
1553 );
1554 test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1555 test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1556 test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1557 test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1558 test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1559 test_unamb!(
1560 unambiguous9,
1561 vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1562 vec![C("a"), C("b"), C("c")]
1563 );
1564 test_unamb!(
1565 unambiguous10,
1566 vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1567 vec![C("Mo"), C("Mu")]
1568 );
1569 test_unamb!(
1570 unambiguous11,
1571 vec![M("zazb"), M("azb")],
1572 vec![C("a"), C("z")]
1573 );
1574 test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1575 test_unamb!(
1576 unambiguous13,
1577 vec![M("ABCX"), M("CDAX"), M("BCX")],
1578 vec![C("A"), C("BCX"), C("CD")]
1579 );
1580 test_unamb!(
1581 unambiguous14,
1582 vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1583 vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1584 );
1585 test_unamb!(
1586 unambiguous15,
1587 vec![M("IMG_"), M("MG_"), M("CIMG")],
1588 vec![C("C"), C("I"), C("MG_")]
1589 );
1590
1591 // ************************************************************************
1592 // Tests for suffix trimming.
1593 // ************************************************************************
1594 macro_rules! test_trim {
1595 ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1596 #[test]
1597 fn $name() {
1598 let given: Vec<Literal> = $given
1599 .into_iter()
1600 .map(|ul| {
1601 let cut = ul.is_cut();
1602 Literal { v: ul.v.into_bytes(), cut: cut }
1603 })
1604 .collect();
1605 let lits = create_lits(given);
1606 let got = lits.trim_suffix($trim).unwrap();
1607 assert_eq!($expected, escape_lits(got.literals()));
1608 }
1609 };
1610 }
1611
1612 test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1613 test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1614 test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1615 test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1616
1617 // ************************************************************************
1618 // Tests for longest common prefix.
1619 // ************************************************************************
1620
1621 macro_rules! test_lcp {
1622 ($name:ident, $given:expr, $expected:expr) => {
1623 #[test]
1624 fn $name() {
1625 let given: Vec<Literal> = $given
1626 .into_iter()
1627 .map(|s: &str| Literal {
1628 v: s.to_owned().into_bytes(),
1629 cut: false,
1630 })
1631 .collect();
1632 let lits = create_lits(given);
1633 let got = lits.longest_common_prefix();
1634 assert_eq!($expected, escape_bytes(got));
1635 }
1636 };
1637 }
1638
1639 test_lcp!(lcp1, vec!["a"], "a");
1640 test_lcp!(lcp2, vec![], "");
1641 test_lcp!(lcp3, vec!["a", "b"], "");
1642 test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1643 test_lcp!(lcp5, vec!["ab", "a"], "a");
1644 test_lcp!(lcp6, vec!["a", "ab"], "a");
1645 test_lcp!(lcp7, vec!["ab", "b"], "");
1646 test_lcp!(lcp8, vec!["b", "ab"], "");
1647 test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1648 test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1649 test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1650 test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1651
1652 // ************************************************************************
1653 // Tests for longest common suffix.
1654 // ************************************************************************
1655
1656 macro_rules! test_lcs {
1657 ($name:ident, $given:expr, $expected:expr) => {
1658 #[test]
1659 fn $name() {
1660 let given: Vec<Literal> = $given
1661 .into_iter()
1662 .map(|s: &str| Literal {
1663 v: s.to_owned().into_bytes(),
1664 cut: false,
1665 })
1666 .collect();
1667 let lits = create_lits(given);
1668 let got = lits.longest_common_suffix();
1669 assert_eq!($expected, escape_bytes(got));
1670 }
1671 };
1672 }
1673
1674 test_lcs!(lcs1, vec!["a"], "a");
1675 test_lcs!(lcs2, vec![], "");
1676 test_lcs!(lcs3, vec!["a", "b"], "");
1677 test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1678 test_lcs!(lcs5, vec!["ab", "a"], "");
1679 test_lcs!(lcs6, vec!["a", "ab"], "");
1680 test_lcs!(lcs7, vec!["ab", "b"], "b");
1681 test_lcs!(lcs8, vec!["b", "ab"], "b");
1682 test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1683 test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1684 test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1685 test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1686 }
1687