1 use std::cmp;
2 use std::mem;
3
4 use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
5 use memchr::{memchr, memchr2, memchr3};
6 use syntax::hir::literal::{Literal, Literals};
7
8 use freqs::BYTE_FREQUENCIES;
9
10 /// A prefix extracted from a compiled regular expression.
11 ///
12 /// A regex prefix is a set of literal strings that *must* be matched at the
13 /// beginning of a regex in order for the entire regex to match. Similarly
14 /// for a regex suffix.
15 #[derive(Clone, Debug)]
16 pub struct LiteralSearcher {
17 complete: bool,
18 lcp: FreqyPacked,
19 lcs: FreqyPacked,
20 matcher: Matcher,
21 }
22
23 #[derive(Clone, Debug)]
24 enum Matcher {
25 /// No literals. (Never advances through the input.)
26 Empty,
27 /// A set of four or more single byte literals.
28 Bytes(SingleByteSet),
29 /// A single substring, find using memchr and frequency analysis.
30 FreqyPacked(FreqyPacked),
31 /// A single substring, find using Boyer-Moore.
32 BoyerMoore(BoyerMooreSearch),
33 /// An Aho-Corasick automaton.
34 AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
35 /// A packed multiple substring searcher, using SIMD.
36 ///
37 /// Note that Aho-Corasick will actually use this packed searcher
38 /// internally automatically, however, there is some overhead associated
39 /// with going through the Aho-Corasick machinery. So using the packed
40 /// searcher directly results in some gains.
41 Packed { s: packed::Searcher, lits: Vec<Literal> },
42 }
43
44 impl LiteralSearcher {
45 /// Returns a matcher that never matches and never advances the input.
empty() -> Self46 pub fn empty() -> Self {
47 Self::new(Literals::empty(), Matcher::Empty)
48 }
49
50 /// Returns a matcher for literal prefixes from the given set.
prefixes(lits: Literals) -> Self51 pub fn prefixes(lits: Literals) -> Self {
52 let matcher = Matcher::prefixes(&lits);
53 Self::new(lits, matcher)
54 }
55
56 /// Returns a matcher for literal suffixes from the given set.
suffixes(lits: Literals) -> Self57 pub fn suffixes(lits: Literals) -> Self {
58 let matcher = Matcher::suffixes(&lits);
59 Self::new(lits, matcher)
60 }
61
new(lits: Literals, matcher: Matcher) -> Self62 fn new(lits: Literals, matcher: Matcher) -> Self {
63 let complete = lits.all_complete();
64 LiteralSearcher {
65 complete: complete,
66 lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
67 lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
68 matcher: matcher,
69 }
70 }
71
72 /// Returns true if all matches comprise the entire regular expression.
73 ///
74 /// This does not necessarily mean that a literal match implies a match
75 /// of the regular expression. For example, the regular expression `^a`
76 /// is comprised of a single complete literal `a`, but the regular
77 /// expression demands that it only match at the beginning of a string.
complete(&self) -> bool78 pub fn complete(&self) -> bool {
79 self.complete && !self.is_empty()
80 }
81
82 /// Find the position of a literal in `haystack` if it exists.
83 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<(usize, usize)>84 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
85 use self::Matcher::*;
86 match self.matcher {
87 Empty => Some((0, 0)),
88 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
89 FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
90 BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
91 AC { ref ac, .. } => {
92 ac.find(haystack).map(|m| (m.start(), m.end()))
93 }
94 Packed { ref s, .. } => {
95 s.find(haystack).map(|m| (m.start(), m.end()))
96 }
97 }
98 }
99
100 /// Like find, except matches must start at index `0`.
find_start(&self, haystack: &[u8]) -> Option<(usize, usize)>101 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
102 for lit in self.iter() {
103 if lit.len() > haystack.len() {
104 continue;
105 }
106 if lit == &haystack[0..lit.len()] {
107 return Some((0, lit.len()));
108 }
109 }
110 None
111 }
112
113 /// Like find, except matches must end at index `haystack.len()`.
find_end(&self, haystack: &[u8]) -> Option<(usize, usize)>114 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
115 for lit in self.iter() {
116 if lit.len() > haystack.len() {
117 continue;
118 }
119 if lit == &haystack[haystack.len() - lit.len()..] {
120 return Some((haystack.len() - lit.len(), haystack.len()));
121 }
122 }
123 None
124 }
125
126 /// Returns an iterator over all literals to be matched.
iter(&self) -> LiteralIter127 pub fn iter(&self) -> LiteralIter {
128 match self.matcher {
129 Matcher::Empty => LiteralIter::Empty,
130 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
131 Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
132 Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
133 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
134 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
135 }
136 }
137
138 /// Returns a matcher for the longest common prefix of this matcher.
lcp(&self) -> &FreqyPacked139 pub fn lcp(&self) -> &FreqyPacked {
140 &self.lcp
141 }
142
143 /// Returns a matcher for the longest common suffix of this matcher.
lcs(&self) -> &FreqyPacked144 pub fn lcs(&self) -> &FreqyPacked {
145 &self.lcs
146 }
147
148 /// Returns true iff this prefix is empty.
is_empty(&self) -> bool149 pub fn is_empty(&self) -> bool {
150 self.len() == 0
151 }
152
153 /// Returns the number of prefixes in this machine.
len(&self) -> usize154 pub fn len(&self) -> usize {
155 use self::Matcher::*;
156 match self.matcher {
157 Empty => 0,
158 Bytes(ref sset) => sset.dense.len(),
159 FreqyPacked(_) => 1,
160 BoyerMoore(_) => 1,
161 AC { ref ac, .. } => ac.pattern_count(),
162 Packed { ref lits, .. } => lits.len(),
163 }
164 }
165
166 /// Return the approximate heap usage of literals in bytes.
approximate_size(&self) -> usize167 pub fn approximate_size(&self) -> usize {
168 use self::Matcher::*;
169 match self.matcher {
170 Empty => 0,
171 Bytes(ref sset) => sset.approximate_size(),
172 FreqyPacked(ref single) => single.approximate_size(),
173 BoyerMoore(ref single) => single.approximate_size(),
174 AC { ref ac, .. } => ac.heap_bytes(),
175 Packed { ref s, .. } => s.heap_bytes(),
176 }
177 }
178 }
179
180 impl Matcher {
prefixes(lits: &Literals) -> Self181 fn prefixes(lits: &Literals) -> Self {
182 let sset = SingleByteSet::prefixes(lits);
183 Matcher::new(lits, sset)
184 }
185
suffixes(lits: &Literals) -> Self186 fn suffixes(lits: &Literals) -> Self {
187 let sset = SingleByteSet::suffixes(lits);
188 Matcher::new(lits, sset)
189 }
190
new(lits: &Literals, sset: SingleByteSet) -> Self191 fn new(lits: &Literals, sset: SingleByteSet) -> Self {
192 if lits.literals().is_empty() {
193 return Matcher::Empty;
194 }
195 if sset.dense.len() >= 26 {
196 // Avoid trying to match a large number of single bytes.
197 // This is *very* sensitive to a frequency analysis comparison
198 // between the bytes in sset and the composition of the haystack.
199 // No matter the size of sset, if its members all are rare in the
200 // haystack, then it'd be worth using it. How to tune this... IDK.
201 // ---AG
202 return Matcher::Empty;
203 }
204 if sset.complete {
205 return Matcher::Bytes(sset);
206 }
207 if lits.literals().len() == 1 {
208 let lit = lits.literals()[0].to_vec();
209 if BoyerMooreSearch::should_use(lit.as_slice()) {
210 return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
211 } else {
212 return Matcher::FreqyPacked(FreqyPacked::new(lit));
213 }
214 }
215
216 let pats = lits.literals().to_owned();
217 let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
218 if lits.literals().len() <= 100 && !is_aho_corasick_fast {
219 let mut builder = packed::Config::new()
220 .match_kind(packed::MatchKind::LeftmostFirst)
221 .builder();
222 if let Some(s) = builder.extend(&pats).build() {
223 return Matcher::Packed { s, lits: pats };
224 }
225 }
226 let ac = AhoCorasickBuilder::new()
227 .match_kind(aho_corasick::MatchKind::LeftmostFirst)
228 .dfa(true)
229 .build_with_size::<u32, _, _>(&pats)
230 .unwrap();
231 Matcher::AC { ac, lits: pats }
232 }
233 }
234
235 #[derive(Debug)]
236 pub enum LiteralIter<'a> {
237 Empty,
238 Bytes(&'a [u8]),
239 Single(&'a [u8]),
240 AC(&'a [Literal]),
241 Packed(&'a [Literal]),
242 }
243
244 impl<'a> Iterator for LiteralIter<'a> {
245 type Item = &'a [u8];
246
next(&mut self) -> Option<Self::Item>247 fn next(&mut self) -> Option<Self::Item> {
248 match *self {
249 LiteralIter::Empty => None,
250 LiteralIter::Bytes(ref mut many) => {
251 if many.is_empty() {
252 None
253 } else {
254 let next = &many[0..1];
255 *many = &many[1..];
256 Some(next)
257 }
258 }
259 LiteralIter::Single(ref mut one) => {
260 if one.is_empty() {
261 None
262 } else {
263 let next = &one[..];
264 *one = &[];
265 Some(next)
266 }
267 }
268 LiteralIter::AC(ref mut lits) => {
269 if lits.is_empty() {
270 None
271 } else {
272 let next = &lits[0];
273 *lits = &lits[1..];
274 Some(&**next)
275 }
276 }
277 LiteralIter::Packed(ref mut lits) => {
278 if lits.is_empty() {
279 None
280 } else {
281 let next = &lits[0];
282 *lits = &lits[1..];
283 Some(&**next)
284 }
285 }
286 }
287 }
288 }
289
290 #[derive(Clone, Debug)]
291 struct SingleByteSet {
292 sparse: Vec<bool>,
293 dense: Vec<u8>,
294 complete: bool,
295 all_ascii: bool,
296 }
297
298 impl SingleByteSet {
new() -> SingleByteSet299 fn new() -> SingleByteSet {
300 SingleByteSet {
301 sparse: vec![false; 256],
302 dense: vec![],
303 complete: true,
304 all_ascii: true,
305 }
306 }
307
prefixes(lits: &Literals) -> SingleByteSet308 fn prefixes(lits: &Literals) -> SingleByteSet {
309 let mut sset = SingleByteSet::new();
310 for lit in lits.literals() {
311 sset.complete = sset.complete && lit.len() == 1;
312 if let Some(&b) = lit.get(0) {
313 if !sset.sparse[b as usize] {
314 if b > 0x7F {
315 sset.all_ascii = false;
316 }
317 sset.dense.push(b);
318 sset.sparse[b as usize] = true;
319 }
320 }
321 }
322 sset
323 }
324
suffixes(lits: &Literals) -> SingleByteSet325 fn suffixes(lits: &Literals) -> SingleByteSet {
326 let mut sset = SingleByteSet::new();
327 for lit in lits.literals() {
328 sset.complete = sset.complete && lit.len() == 1;
329 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
330 if !sset.sparse[b as usize] {
331 if b > 0x7F {
332 sset.all_ascii = false;
333 }
334 sset.dense.push(b);
335 sset.sparse[b as usize] = true;
336 }
337 }
338 }
339 sset
340 }
341
342 /// Faster find that special cases certain sizes to use memchr.
343 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, text: &[u8]) -> Option<usize>344 fn find(&self, text: &[u8]) -> Option<usize> {
345 match self.dense.len() {
346 0 => None,
347 1 => memchr(self.dense[0], text),
348 2 => memchr2(self.dense[0], self.dense[1], text),
349 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
350 _ => self._find(text),
351 }
352 }
353
354 /// Generic find that works on any sized set.
_find(&self, haystack: &[u8]) -> Option<usize>355 fn _find(&self, haystack: &[u8]) -> Option<usize> {
356 for (i, &b) in haystack.iter().enumerate() {
357 if self.sparse[b as usize] {
358 return Some(i);
359 }
360 }
361 None
362 }
363
approximate_size(&self) -> usize364 fn approximate_size(&self) -> usize {
365 (self.dense.len() * mem::size_of::<u8>())
366 + (self.sparse.len() * mem::size_of::<bool>())
367 }
368 }
369
370 /// Provides an implementation of fast subtring search using frequency
371 /// analysis.
372 ///
373 /// memchr is so fast that we do everything we can to keep the loop in memchr
374 /// for as long as possible. The easiest way to do this is to intelligently
375 /// pick the byte to send to memchr. The best byte is the byte that occurs
376 /// least frequently in the haystack. Since doing frequency analysis on the
377 /// haystack is far too expensive, we compute a set of fixed frequencies up
378 /// front and hard code them in src/freqs.rs. Frequency analysis is done via
379 /// scripts/frequencies.py.
380 #[derive(Clone, Debug)]
381 pub struct FreqyPacked {
382 /// The pattern.
383 pat: Vec<u8>,
384 /// The number of Unicode characters in the pattern. This is useful for
385 /// determining the effective length of a pattern when deciding which
386 /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
387 /// as one character.
388 char_len: usize,
389 /// The rarest byte in the pattern, according to pre-computed frequency
390 /// analysis.
391 rare1: u8,
392 /// The offset of the rarest byte in `pat`.
393 rare1i: usize,
394 /// The second rarest byte in the pattern, according to pre-computed
395 /// frequency analysis. (This may be equivalent to the rarest byte.)
396 ///
397 /// The second rarest byte is used as a type of guard for quickly detecting
398 /// a mismatch after memchr locates an instance of the rarest byte. This
399 /// is a hedge against pathological cases where the pre-computed frequency
400 /// analysis may be off. (But of course, does not prevent *all*
401 /// pathological cases.)
402 rare2: u8,
403 /// The offset of the second rarest byte in `pat`.
404 rare2i: usize,
405 }
406
407 impl FreqyPacked {
new(pat: Vec<u8>) -> FreqyPacked408 fn new(pat: Vec<u8>) -> FreqyPacked {
409 if pat.is_empty() {
410 return FreqyPacked::empty();
411 }
412
413 // Find the rarest two bytes. Try to make them distinct (but it's not
414 // required).
415 let mut rare1 = pat[0];
416 let mut rare2 = pat[0];
417 for b in pat[1..].iter().cloned() {
418 if freq_rank(b) < freq_rank(rare1) {
419 rare1 = b;
420 }
421 }
422 for &b in &pat {
423 if rare1 == rare2 {
424 rare2 = b
425 } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
426 rare2 = b;
427 }
428 }
429
430 // And find the offsets of their last occurrences.
431 let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
432 let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();
433
434 let char_len = char_len_lossy(&pat);
435 FreqyPacked {
436 pat: pat,
437 char_len: char_len,
438 rare1: rare1,
439 rare1i: rare1i,
440 rare2: rare2,
441 rare2i: rare2i,
442 }
443 }
444
empty() -> FreqyPacked445 fn empty() -> FreqyPacked {
446 FreqyPacked {
447 pat: vec![],
448 char_len: 0,
449 rare1: 0,
450 rare1i: 0,
451 rare2: 0,
452 rare2i: 0,
453 }
454 }
455
456 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<usize>457 pub fn find(&self, haystack: &[u8]) -> Option<usize> {
458 let pat = &*self.pat;
459 if haystack.len() < pat.len() || pat.is_empty() {
460 return None;
461 }
462 let mut i = self.rare1i;
463 while i < haystack.len() {
464 i += match memchr(self.rare1, &haystack[i..]) {
465 None => return None,
466 Some(i) => i,
467 };
468 let start = i - self.rare1i;
469 let end = start + pat.len();
470 if end > haystack.len() {
471 return None;
472 }
473 let aligned = &haystack[start..end];
474 if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
475 return Some(start);
476 }
477 i += 1;
478 }
479 None
480 }
481
482 #[cfg_attr(feature = "perf-inline", inline(always))]
is_suffix(&self, text: &[u8]) -> bool483 pub fn is_suffix(&self, text: &[u8]) -> bool {
484 if text.len() < self.len() {
485 return false;
486 }
487 text[text.len() - self.len()..] == *self.pat
488 }
489
len(&self) -> usize490 pub fn len(&self) -> usize {
491 self.pat.len()
492 }
493
char_len(&self) -> usize494 pub fn char_len(&self) -> usize {
495 self.char_len
496 }
497
approximate_size(&self) -> usize498 fn approximate_size(&self) -> usize {
499 self.pat.len() * mem::size_of::<u8>()
500 }
501 }
502
char_len_lossy(bytes: &[u8]) -> usize503 fn char_len_lossy(bytes: &[u8]) -> usize {
504 String::from_utf8_lossy(bytes).chars().count()
505 }
506
507 /// An implementation of Tuned Boyer-Moore as laid out by
508 /// Andrew Hume and Daniel Sunday in "Fast String Searching".
509 /// O(n) in the size of the input.
510 ///
511 /// Fast string searching algorithms come in many variations,
512 /// but they can generally be described in terms of three main
513 /// components.
514 ///
515 /// The skip loop is where the string searcher wants to spend
516 /// as much time as possible. Exactly which character in the
517 /// pattern the skip loop examines varies from algorithm to
518 /// algorithm, but in the simplest case this loop repeated
519 /// looks at the last character in the pattern and jumps
520 /// forward in the input if it is not in the pattern.
521 /// Robert Boyer and J Moore called this the "fast" loop in
522 /// their original paper.
523 ///
524 /// The match loop is responsible for actually examining the
525 /// whole potentially matching substring. In order to fail
526 /// faster, the match loop sometimes has a guard test attached.
527 /// The guard test uses frequency analysis of the different
528 /// characters in the pattern to choose the least frequency
529 /// occurring character and use it to find match failures
530 /// as quickly as possible.
531 ///
532 /// The shift rule governs how the algorithm will shuffle its
533 /// test window in the event of a failure during the match loop.
534 /// Certain shift rules allow the worst-case run time of the
535 /// algorithm to be shown to be O(n) in the size of the input
536 /// rather than O(nm) in the size of the input and the size
537 /// of the pattern (as naive Boyer-Moore is).
538 ///
539 /// "Fast String Searching", in addition to presenting a tuned
540 /// algorithm, provides a comprehensive taxonomy of the many
541 /// different flavors of string searchers. Under that taxonomy
542 /// TBM, the algorithm implemented here, uses an unrolled fast
543 /// skip loop with memchr fallback, a forward match loop with guard,
544 /// and the mini Sunday's delta shift rule. To unpack that you'll have to
545 /// read the paper.
546 #[derive(Clone, Debug)]
547 pub struct BoyerMooreSearch {
548 /// The pattern we are going to look for in the haystack.
549 pattern: Vec<u8>,
550
551 /// The skip table for the skip loop.
552 ///
553 /// Maps the character at the end of the input
554 /// to a shift.
555 skip_table: Vec<usize>,
556
557 /// The guard character (least frequently occurring char).
558 guard: u8,
559 /// The reverse-index of the guard character in the pattern.
560 guard_reverse_idx: usize,
561
562 /// Daniel Sunday's mini generalized delta2 shift table.
563 ///
564 /// We use a skip loop, so we only have to provide a shift
565 /// for the skip char (last char). This is why it is a mini
566 /// shift rule.
567 md2_shift: usize,
568 }
569
570 impl BoyerMooreSearch {
571 /// Create a new string searcher, performing whatever
572 /// compilation steps are required.
new(pattern: Vec<u8>) -> Self573 fn new(pattern: Vec<u8>) -> Self {
574 debug_assert!(!pattern.is_empty());
575
576 let (g, gi) = Self::select_guard(pattern.as_slice());
577 let skip_table = Self::compile_skip_table(pattern.as_slice());
578 let md2_shift = Self::compile_md2_shift(pattern.as_slice());
579 BoyerMooreSearch {
580 pattern: pattern,
581 skip_table: skip_table,
582 guard: g,
583 guard_reverse_idx: gi,
584 md2_shift: md2_shift,
585 }
586 }
587
588 /// Find the pattern in `haystack`, returning the offset
589 /// of the start of the first occurrence of the pattern
590 /// in `haystack`.
591 #[inline]
find(&self, haystack: &[u8]) -> Option<usize>592 fn find(&self, haystack: &[u8]) -> Option<usize> {
593 if haystack.len() < self.pattern.len() {
594 return None;
595 }
596
597 let mut window_end = self.pattern.len() - 1;
598
599 // Inspired by the grep source. It is a way
600 // to do correct loop unrolling without having to place
601 // a crashpad of terminating charicters at the end in
602 // the way described in the Fast String Searching paper.
603 const NUM_UNROLL: usize = 10;
604 // 1 for the initial position, and 1 for the md2 shift
605 let short_circut = (NUM_UNROLL + 2) * self.pattern.len();
606
607 if haystack.len() > short_circut {
608 // just 1 for the md2 shift
609 let backstop =
610 haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
611 loop {
612 window_end =
613 match self.skip_loop(haystack, window_end, backstop) {
614 Some(i) => i,
615 None => return None,
616 };
617 if window_end >= backstop {
618 break;
619 }
620
621 if self.check_match(haystack, window_end) {
622 return Some(window_end - (self.pattern.len() - 1));
623 } else {
624 let skip = self.skip_table[haystack[window_end] as usize];
625 window_end +=
626 if skip == 0 { self.md2_shift } else { skip };
627 continue;
628 }
629 }
630 }
631
632 // now process the input after the backstop
633 while window_end < haystack.len() {
634 let mut skip = self.skip_table[haystack[window_end] as usize];
635 if skip == 0 {
636 if self.check_match(haystack, window_end) {
637 return Some(window_end - (self.pattern.len() - 1));
638 } else {
639 skip = self.md2_shift;
640 }
641 }
642 window_end += skip;
643 }
644
645 None
646 }
647
len(&self) -> usize648 fn len(&self) -> usize {
649 return self.pattern.len();
650 }
651
652 /// The key heuristic behind which the BoyerMooreSearch lives.
653 ///
654 /// See `rust-lang/regex/issues/408`.
655 ///
656 /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
657 /// platform-specific memchr routine with a bit of frequency
658 /// analysis sprinkled on top actually wins most of the time.
659 /// However, there are a few cases where Tuned Boyer-Moore still
660 /// wins.
661 ///
662 /// If the haystack is random, frequency analysis doesn't help us,
663 /// so Boyer-Moore will win for sufficiently large needles.
664 /// Unfortunately, there is no obvious way to determine this
665 /// ahead of time.
666 ///
667 /// If the pattern itself consists of very common characters,
668 /// frequency analysis won't get us anywhere. The most extreme
669 /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
670 /// this case is wholly determined by the pattern, so we can actually
671 /// implement the heuristic.
672 ///
673 /// A third case is if the pattern is sufficiently long. The idea
674 /// here is that once the pattern gets long enough the Tuned
675 /// Boyer-Moore skip loop will start making strides long enough
676 /// to beat the asm deep magic that is memchr.
should_use(pattern: &[u8]) -> bool677 fn should_use(pattern: &[u8]) -> bool {
678 // The minimum pattern length required to use TBM.
679 const MIN_LEN: usize = 9;
680 // The minimum frequency rank (lower is rarer) that every byte in the
681 // pattern must have in order to use TBM. That is, if the pattern
682 // contains _any_ byte with a lower rank, then TBM won't be used.
683 const MIN_CUTOFF: usize = 150;
684 // The maximum frequency rank for any byte.
685 const MAX_CUTOFF: usize = 255;
686 // The scaling factor used to determine the actual cutoff frequency
687 // to use (keeping in mind that the minimum frequency rank is bounded
688 // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
689 // likely to be used as the pattern grows longer. That is, longer
690 // patterns permit somewhat less frequent bytes than shorter patterns,
691 // under the assumption that TBM gets better as the pattern gets
692 // longer.
693 const LEN_CUTOFF_PROPORTION: usize = 4;
694
695 let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
696 let cutoff = cmp::max(
697 MIN_CUTOFF,
698 MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
699 );
700 // The pattern must be long enough to be worthwhile. e.g., memchr will
701 // be faster on `e` because it is short even though e is quite common.
702 pattern.len() > MIN_LEN
703 // all the bytes must be more common than the cutoff.
704 && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
705 }
706
707 /// Check to see if there is a match at the given position
708 #[inline]
check_match(&self, haystack: &[u8], window_end: usize) -> bool709 fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
710 // guard test
711 if haystack[window_end - self.guard_reverse_idx] != self.guard {
712 return false;
713 }
714
715 // match loop
716 let window_start = window_end - (self.pattern.len() - 1);
717 for i in 0..self.pattern.len() {
718 if self.pattern[i] != haystack[window_start + i] {
719 return false;
720 }
721 }
722
723 true
724 }
725
726 /// Skip forward according to the shift table.
727 ///
728 /// Returns the offset of the next occurrence
729 /// of the last char in the pattern, or the none
730 /// if it never reappears. If `skip_loop` hits the backstop
731 /// it will leave early.
732 #[inline]
skip_loop( &self, haystack: &[u8], mut window_end: usize, backstop: usize, ) -> Option<usize>733 fn skip_loop(
734 &self,
735 haystack: &[u8],
736 mut window_end: usize,
737 backstop: usize,
738 ) -> Option<usize> {
739 let window_end_snapshot = window_end;
740 let skip_of = |we: usize| -> usize {
741 // Unsafe might make this faster, but the benchmarks
742 // were hard to interpret.
743 self.skip_table[haystack[we] as usize]
744 };
745
746 loop {
747 let mut skip = skip_of(window_end);
748 window_end += skip;
749 skip = skip_of(window_end);
750 window_end += skip;
751 if skip != 0 {
752 skip = skip_of(window_end);
753 window_end += skip;
754 skip = skip_of(window_end);
755 window_end += skip;
756 skip = skip_of(window_end);
757 window_end += skip;
758 if skip != 0 {
759 skip = skip_of(window_end);
760 window_end += skip;
761 skip = skip_of(window_end);
762 window_end += skip;
763 skip = skip_of(window_end);
764 window_end += skip;
765 if skip != 0 {
766 skip = skip_of(window_end);
767 window_end += skip;
768 skip = skip_of(window_end);
769 window_end += skip;
770
771 // If ten iterations did not make at least 16 words
772 // worth of progress, we just fall back on memchr.
773 if window_end - window_end_snapshot
774 > 16 * mem::size_of::<usize>()
775 {
776 // Returning a window_end >= backstop will
777 // immediatly break us out of the inner loop in
778 // `find`.
779 if window_end >= backstop {
780 return Some(window_end);
781 }
782
783 continue; // we made enough progress
784 } else {
785 // In case we are already there, and so that
786 // we will catch the guard char.
787 window_end = window_end
788 .checked_sub(1 + self.guard_reverse_idx)
789 .unwrap_or(0);
790
791 match memchr(self.guard, &haystack[window_end..]) {
792 None => return None,
793 Some(g_idx) => {
794 return Some(
795 window_end
796 + g_idx
797 + self.guard_reverse_idx,
798 );
799 }
800 }
801 }
802 }
803 }
804 }
805
806 return Some(window_end);
807 }
808 }
809
810 /// Compute the ufast skip table.
compile_skip_table(pattern: &[u8]) -> Vec<usize>811 fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
812 let mut tab = vec![pattern.len(); 256];
813
814 // For every char in the pattern, we write a skip
815 // that will line us up with the rightmost occurrence.
816 //
817 // N.B. the sentinel (0) is written by the last
818 // loop iteration.
819 for (i, c) in pattern.iter().enumerate() {
820 tab[*c as usize] = (pattern.len() - 1) - i;
821 }
822
823 tab
824 }
825
826 /// Select the guard character based off of the precomputed
827 /// frequency table.
select_guard(pattern: &[u8]) -> (u8, usize)828 fn select_guard(pattern: &[u8]) -> (u8, usize) {
829 let mut rarest = pattern[0];
830 let mut rarest_rev_idx = pattern.len() - 1;
831 for (i, c) in pattern.iter().enumerate() {
832 if freq_rank(*c) < freq_rank(rarest) {
833 rarest = *c;
834 rarest_rev_idx = (pattern.len() - 1) - i;
835 }
836 }
837
838 (rarest, rarest_rev_idx)
839 }
840
841 /// If there is another occurrence of the skip
842 /// char, shift to it, otherwise just shift to
843 /// the next window.
compile_md2_shift(pattern: &[u8]) -> usize844 fn compile_md2_shift(pattern: &[u8]) -> usize {
845 let shiftc = *pattern.last().unwrap();
846
847 // For a pattern of length 1 we will never apply the
848 // shift rule, so we use a poison value on the principle
849 // that failing fast is a good thing.
850 if pattern.len() == 1 {
851 return 0xDEADBEAF;
852 }
853
854 let mut i = pattern.len() - 2;
855 while i > 0 {
856 if pattern[i] == shiftc {
857 return (pattern.len() - 1) - i;
858 }
859 i -= 1;
860 }
861
862 // The skip char never re-occurs in the pattern, so
863 // we can just shift the whole window length.
864 pattern.len() - 1
865 }
866
approximate_size(&self) -> usize867 fn approximate_size(&self) -> usize {
868 (self.pattern.len() * mem::size_of::<u8>())
869 + (256 * mem::size_of::<usize>()) // skip table
870 }
871 }
872
freq_rank(b: u8) -> usize873 fn freq_rank(b: u8) -> usize {
874 BYTE_FREQUENCIES[b as usize] as usize
875 }
876
877 #[cfg(test)]
878 mod tests {
879 use super::{BoyerMooreSearch, FreqyPacked};
880
881 //
882 // Unit Tests
883 //
884
885 // The "hello, world" of string searching
886 #[test]
bm_find_subs()887 fn bm_find_subs() {
888 let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
889 let haystack = b"I keep seeing patterns in this text";
890 assert_eq!(14, searcher.find(haystack).unwrap());
891 }
892
893 #[test]
bm_find_no_subs()894 fn bm_find_no_subs() {
895 let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
896 let haystack = b"I keep seeing needles in this text";
897 assert_eq!(None, searcher.find(haystack));
898 }
899
900 //
901 // Regression Tests
902 //
903
904 #[test]
bm_skip_reset_bug()905 fn bm_skip_reset_bug() {
906 let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
907 let needle = vec![0, 1, 1, 0];
908
909 let searcher = BoyerMooreSearch::new(needle);
910 let offset = searcher.find(haystack.as_slice()).unwrap();
911 assert_eq!(4, offset);
912 }
913
914 #[test]
bm_backstop_underflow_bug()915 fn bm_backstop_underflow_bug() {
916 let haystack = vec![0, 0];
917 let needle = vec![0, 0];
918
919 let searcher = BoyerMooreSearch::new(needle);
920 let offset = searcher.find(haystack.as_slice()).unwrap();
921 assert_eq!(0, offset);
922 }
923
924 #[test]
bm_naive_off_by_one_bug()925 fn bm_naive_off_by_one_bug() {
926 let haystack = vec![91];
927 let needle = vec![91];
928
929 let naive_offset = naive_find(&needle, &haystack).unwrap();
930 assert_eq!(0, naive_offset);
931 }
932
933 #[test]
bm_memchr_fallback_indexing_bug()934 fn bm_memchr_fallback_indexing_bug() {
935 let mut haystack = vec![
936 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
937 0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
938 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
939 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
940 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
941 ];
942 let needle = vec![1, 1, 1, 1, 32, 32, 87];
943 let needle_start = haystack.len();
944 haystack.extend(needle.clone());
945
946 let searcher = BoyerMooreSearch::new(needle);
947 assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
948 }
949
950 #[test]
bm_backstop_boundary()951 fn bm_backstop_boundary() {
952 let haystack = b"\
953 // aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
954 e_data.clone_created(entity_id, entity_to_add.entity_id);
955 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
956 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
957 "
958 .to_vec();
959 let needle = b"clone_created".to_vec();
960
961 let searcher = BoyerMooreSearch::new(needle);
962 let result = searcher.find(&haystack);
963 assert_eq!(Some(43), result);
964 }
965
966 #[test]
bm_win_gnu_indexing_bug()967 fn bm_win_gnu_indexing_bug() {
968 let haystack_raw = vec![
969 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
970 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
971 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
972 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
973 ];
974 let needle = vec![1, 1, 1, 1, 1, 1, 1];
975 let haystack = haystack_raw.as_slice();
976
977 BoyerMooreSearch::new(needle.clone()).find(haystack);
978 }
979
980 //
981 // QuickCheck Properties
982 //
983
984 use quickcheck::TestResult;
985
naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize>986 fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
987 assert!(needle.len() <= haystack.len());
988
989 for i in 0..(haystack.len() - (needle.len() - 1)) {
990 if haystack[i] == needle[0]
991 && &haystack[i..(i + needle.len())] == needle
992 {
993 return Some(i);
994 }
995 }
996
997 None
998 }
999
1000 quickcheck! {
1001 fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
1002 if pile1.len() == 0 || pile2.len() == 0 {
1003 return TestResult::discard();
1004 }
1005
1006 let (needle, haystack) = if pile1.len() < pile2.len() {
1007 (pile1, pile2.as_slice())
1008 } else {
1009 (pile2, pile1.as_slice())
1010 };
1011
1012 let searcher = BoyerMooreSearch::new(needle.clone());
1013 TestResult::from_bool(
1014 searcher.find(haystack) == naive_find(&needle, haystack))
1015 }
1016
1017 fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
1018 if pile1.len() == 0 || pile2.len() == 0 {
1019 return TestResult::discard();
1020 }
1021
1022 let (needle, haystack) = if pile1.len() < pile2.len() {
1023 (pile1, pile2.as_slice())
1024 } else {
1025 (pile2, pile1.as_slice())
1026 };
1027
1028 let bm_searcher = BoyerMooreSearch::new(needle.clone());
1029 let freqy_memchr = FreqyPacked::new(needle);
1030 TestResult::from_bool(
1031 bm_searcher.find(haystack) == freqy_memchr.find(haystack))
1032 }
1033
1034 fn qc_bm_finds_trailing_needle(
1035 haystack_pre: Vec<u8>,
1036 needle: Vec<u8>
1037 ) -> TestResult {
1038 if needle.len() == 0 {
1039 return TestResult::discard();
1040 }
1041
1042 let mut haystack = haystack_pre.clone();
1043 let searcher = BoyerMooreSearch::new(needle.clone());
1044
1045 if haystack.len() >= needle.len() &&
1046 searcher.find(haystack.as_slice()).is_some() {
1047 return TestResult::discard();
1048 }
1049
1050 haystack.extend(needle.clone());
1051
1052 // What if the the tail of the haystack can start the
1053 // needle?
1054 let start = haystack_pre.len()
1055 .checked_sub(needle.len())
1056 .unwrap_or(0);
1057 for i in 0..(needle.len() - 1) {
1058 if searcher.find(&haystack[(i + start)..]).is_some() {
1059 return TestResult::discard();
1060 }
1061 }
1062
1063 TestResult::from_bool(
1064 searcher.find(haystack.as_slice())
1065 .map(|x| x == haystack_pre.len())
1066 .unwrap_or(false))
1067 }
1068
1069 // qc_equals_* is only testing the negative case as @burntsushi
1070 // pointed out in https://github.com/rust-lang/regex/issues/446.
1071 // This quickcheck prop represents an effort to force testing of
1072 // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
1073 // already check some of the positive cases, but they don't cover
1074 // cases where the needle is in the middle of haystack. This prop
1075 // fills that hole.
1076 fn qc_bm_finds_subslice(
1077 haystack: Vec<u8>,
1078 needle_start: usize,
1079 needle_length: usize
1080 ) -> TestResult {
1081 if haystack.len() == 0 {
1082 return TestResult::discard();
1083 }
1084
1085 let needle_start = needle_start % haystack.len();
1086 let needle_length = needle_length % (haystack.len() - needle_start);
1087
1088 if needle_length == 0 {
1089 return TestResult::discard();
1090 }
1091
1092 let needle = &haystack[needle_start..(needle_start + needle_length)];
1093
1094 let bm_searcher = BoyerMooreSearch::new(needle.to_vec());
1095
1096 let start = naive_find(&needle, &haystack);
1097 match start {
1098 None => TestResult::from_bool(false),
1099 Some(nf_start) =>
1100 TestResult::from_bool(
1101 nf_start <= needle_start
1102 && bm_searcher.find(&haystack) == start
1103 )
1104 }
1105 }
1106
1107 fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
1108 if needle.len() == 0 {
1109 return TestResult::discard();
1110 }
1111
1112 let mut haystack = needle.clone();
1113 let searcher = BoyerMooreSearch::new(needle.clone());
1114 haystack.extend(needle);
1115
1116 TestResult::from_bool(
1117 searcher.find(haystack.as_slice())
1118 .map(|x| x == 0)
1119 .unwrap_or(false))
1120 }
1121 }
1122 }
1123