• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::io;
2 
3 use crate::automaton::Automaton;
4 use crate::buffer::Buffer;
5 use crate::dfa::{self, DFA};
6 use crate::error::Result;
7 use crate::nfa::{self, NFA};
8 use crate::packed;
9 use crate::prefilter::{Prefilter, PrefilterState};
10 use crate::state_id::StateID;
11 use crate::Match;
12 
13 /// An automaton for searching multiple strings in linear time.
14 ///
15 /// The `AhoCorasick` type supports a few basic ways of constructing an
16 /// automaton, including
17 /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
18 /// and
19 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
20 /// However, there are a fair number of configurable options that can be set
21 /// by using
22 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
23 /// instead. Such options include, but are not limited to, how matches are
24 /// determined, simple case insensitivity, whether to use a DFA or not and
25 /// various knobs for controlling the space-vs-time trade offs taken when
26 /// building the automaton.
27 ///
28 /// If you aren't sure where to start, try beginning with
29 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
30 ///
31 /// # Resource usage
32 ///
33 /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
34 /// is the combined length of all patterns being searched. With that said,
35 /// building an automaton can be fairly costly because of high constant
36 /// factors, particularly when enabling the
37 /// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
38 /// option (which is disabled by default). For this reason, it's generally a
39 /// good idea to build an automaton once and reuse it as much as possible.
40 ///
41 /// Aho-Corasick automatons can also use a fair bit of memory. To get a
42 /// concrete idea of how much memory is being used, try using the
43 /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
44 /// method.
45 ///
46 /// # Examples
47 ///
48 /// This example shows how to search for occurrences of multiple patterns
49 /// simultaneously in a case insensitive fashion. Each match includes the
50 /// pattern that matched along with the byte offsets of the match.
51 ///
52 /// ```
53 /// use aho_corasick::AhoCorasickBuilder;
54 ///
55 /// let patterns = &["apple", "maple", "snapple"];
56 /// let haystack = "Nobody likes maple in their apple flavored Snapple.";
57 ///
58 /// let ac = AhoCorasickBuilder::new()
59 ///     .ascii_case_insensitive(true)
60 ///     .build(patterns);
61 /// let mut matches = vec![];
62 /// for mat in ac.find_iter(haystack) {
63 ///     matches.push((mat.pattern(), mat.start(), mat.end()));
64 /// }
65 /// assert_eq!(matches, vec![
66 ///     (1, 13, 18),
67 ///     (0, 28, 33),
68 ///     (2, 43, 50),
69 /// ]);
70 /// ```
71 ///
72 /// This example shows how to replace matches with some other string:
73 ///
74 /// ```
75 /// use aho_corasick::AhoCorasick;
76 ///
77 /// let patterns = &["fox", "brown", "quick"];
78 /// let haystack = "The quick brown fox.";
79 /// let replace_with = &["sloth", "grey", "slow"];
80 ///
81 /// let ac = AhoCorasick::new(patterns);
82 /// let result = ac.replace_all(haystack, replace_with);
83 /// assert_eq!(result, "The slow grey sloth.");
84 /// ```
85 #[derive(Clone, Debug)]
86 pub struct AhoCorasick<S: StateID = usize> {
87     imp: Imp<S>,
88     match_kind: MatchKind,
89 }
90 
91 impl AhoCorasick {
92     /// Create a new Aho-Corasick automaton using the default configuration.
93     ///
94     /// The default configuration optimizes for less space usage, but at the
95     /// expense of longer search times. To change the configuration, use
96     /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
97     /// for fine-grained control, or
98     /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
99     /// for automatic configuration if you aren't sure which settings to pick.
100     ///
101     /// This uses the default
102     /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
103     /// match semantics, which reports a match as soon as it is found. This
104     /// corresponds to the standard match semantics supported by textbook
105     /// descriptions of the Aho-Corasick algorithm.
106     ///
107     /// # Examples
108     ///
109     /// Basic usage:
110     ///
111     /// ```
112     /// use aho_corasick::AhoCorasick;
113     ///
114     /// let ac = AhoCorasick::new(&[
115     ///     "foo", "bar", "baz",
116     /// ]);
117     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
118     /// ```
new<I, P>(patterns: I) -> AhoCorasick where I: IntoIterator<Item = P>, P: AsRef<[u8]>,119     pub fn new<I, P>(patterns: I) -> AhoCorasick
120     where
121         I: IntoIterator<Item = P>,
122         P: AsRef<[u8]>,
123     {
124         AhoCorasickBuilder::new().build(patterns)
125     }
126 
127     /// Build an Aho-Corasick automaton with an automatically determined
128     /// configuration.
129     ///
130     /// Specifically, this requires a slice of patterns instead of an iterator
131     /// since the configuration is determined by looking at the patterns before
132     /// constructing the automaton. The idea here is to balance space and time
133     /// automatically. That is, when searching a small number of patterns, this
134     /// will attempt to use the fastest possible configuration since the total
135     /// space required will be small anyway. As the number of patterns grows,
136     /// this will fall back to slower configurations that use less space.
137     ///
138     /// If you want auto configuration but with match semantics different from
139     /// the default `MatchKind::Standard`, then use
140     /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
141     ///
142     /// # Examples
143     ///
144     /// Basic usage is just like `new`, except you must provide a slice:
145     ///
146     /// ```
147     /// use aho_corasick::AhoCorasick;
148     ///
149     /// let ac = AhoCorasick::new_auto_configured(&[
150     ///     "foo", "bar", "baz",
151     /// ]);
152     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
153     /// ```
new_auto_configured<B>(patterns: &[B]) -> AhoCorasick where B: AsRef<[u8]>,154     pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
155     where
156         B: AsRef<[u8]>,
157     {
158         AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
159     }
160 }
161 
162 impl<S: StateID> AhoCorasick<S> {
163     /// Returns true if and only if this automaton matches the haystack at any
164     /// position.
165     ///
166     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
167     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
168     /// `&[u8]` itself.
169     ///
170     /// # Examples
171     ///
172     /// Basic usage:
173     ///
174     /// ```
175     /// use aho_corasick::AhoCorasick;
176     ///
177     /// let ac = AhoCorasick::new(&[
178     ///     "foo", "bar", "quux", "baz",
179     /// ]);
180     /// assert!(ac.is_match("xxx bar xxx"));
181     /// assert!(!ac.is_match("xxx qux xxx"));
182     /// ```
is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool183     pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
184         self.earliest_find(haystack).is_some()
185     }
186 
187     /// Returns the location of the first detected match in `haystack`.
188     ///
189     /// This method has the same behavior regardless of the
190     /// [`MatchKind`](enum.MatchKind.html)
191     /// of this automaton.
192     ///
193     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
194     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
195     /// `&[u8]` itself.
196     ///
197     /// # Examples
198     ///
199     /// Basic usage:
200     ///
201     /// ```
202     /// use aho_corasick::AhoCorasick;
203     ///
204     /// let ac = AhoCorasick::new(&[
205     ///     "abc", "b",
206     /// ]);
207     /// let mat = ac.earliest_find("abcd").expect("should have match");
208     /// assert_eq!(1, mat.pattern());
209     /// assert_eq!((1, 2), (mat.start(), mat.end()));
210     /// ```
earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>211     pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
212         let mut prestate = PrefilterState::new(self.max_pattern_len());
213         let mut start = self.imp.start_state();
214         self.imp.earliest_find_at(
215             &mut prestate,
216             haystack.as_ref(),
217             0,
218             &mut start,
219         )
220     }
221 
222     /// Returns the location of the first match according to the match
223     /// semantics that this automaton was constructed with.
224     ///
225     /// When using `MatchKind::Standard`, this corresponds precisely to the
226     /// same behavior as
227     /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
228     /// Otherwise, match semantics correspond to either
229     /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
230     /// or
231     /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
232     ///
233     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
234     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
235     /// `&[u8]` itself.
236     ///
237     /// # Examples
238     ///
239     /// Basic usage, with standard semantics:
240     ///
241     /// ```
242     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
243     ///
244     /// let patterns = &["b", "abc", "abcd"];
245     /// let haystack = "abcd";
246     ///
247     /// let ac = AhoCorasickBuilder::new()
248     ///     .match_kind(MatchKind::Standard) // default, not necessary
249     ///     .build(patterns);
250     /// let mat = ac.find(haystack).expect("should have a match");
251     /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
252     /// ```
253     ///
254     /// Now with leftmost-first semantics:
255     ///
256     /// ```
257     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
258     ///
259     /// let patterns = &["b", "abc", "abcd"];
260     /// let haystack = "abcd";
261     ///
262     /// let ac = AhoCorasickBuilder::new()
263     ///     .match_kind(MatchKind::LeftmostFirst)
264     ///     .build(patterns);
265     /// let mat = ac.find(haystack).expect("should have a match");
266     /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
267     /// ```
268     ///
269     /// And finally, leftmost-longest semantics:
270     ///
271     /// ```
272     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
273     ///
274     /// let patterns = &["b", "abc", "abcd"];
275     /// let haystack = "abcd";
276     ///
277     /// let ac = AhoCorasickBuilder::new()
278     ///     .match_kind(MatchKind::LeftmostLongest)
279     ///     .build(patterns);
280     /// let mat = ac.find(haystack).expect("should have a match");
281     /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
282     /// ```
find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>283     pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
284         let mut prestate = PrefilterState::new(self.max_pattern_len());
285         self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
286     }
287 
288     /// Returns an iterator of non-overlapping matches, using the match
289     /// semantics that this automaton was constructed with.
290     ///
291     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
292     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
293     /// `&[u8]` itself.
294     ///
295     /// # Examples
296     ///
297     /// Basic usage, with standard semantics:
298     ///
299     /// ```
300     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
301     ///
302     /// let patterns = &["append", "appendage", "app"];
303     /// let haystack = "append the app to the appendage";
304     ///
305     /// let ac = AhoCorasickBuilder::new()
306     ///     .match_kind(MatchKind::Standard) // default, not necessary
307     ///     .build(patterns);
308     /// let matches: Vec<usize> = ac
309     ///     .find_iter(haystack)
310     ///     .map(|mat| mat.pattern())
311     ///     .collect();
312     /// assert_eq!(vec![2, 2, 2], matches);
313     /// ```
314     ///
315     /// Now with leftmost-first semantics:
316     ///
317     /// ```
318     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
319     ///
320     /// let patterns = &["append", "appendage", "app"];
321     /// let haystack = "append the app to the appendage";
322     ///
323     /// let ac = AhoCorasickBuilder::new()
324     ///     .match_kind(MatchKind::LeftmostFirst)
325     ///     .build(patterns);
326     /// let matches: Vec<usize> = ac
327     ///     .find_iter(haystack)
328     ///     .map(|mat| mat.pattern())
329     ///     .collect();
330     /// assert_eq!(vec![0, 2, 0], matches);
331     /// ```
332     ///
333     /// And finally, leftmost-longest semantics:
334     ///
335     /// ```
336     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
337     ///
338     /// let patterns = &["append", "appendage", "app"];
339     /// let haystack = "append the app to the appendage";
340     ///
341     /// let ac = AhoCorasickBuilder::new()
342     ///     .match_kind(MatchKind::LeftmostLongest)
343     ///     .build(patterns);
344     /// let matches: Vec<usize> = ac
345     ///     .find_iter(haystack)
346     ///     .map(|mat| mat.pattern())
347     ///     .collect();
348     /// assert_eq!(vec![0, 2, 1], matches);
349     /// ```
find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b, S>350     pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
351         &'a self,
352         haystack: &'b B,
353     ) -> FindIter<'a, 'b, S> {
354         FindIter::new(self, haystack.as_ref())
355     }
356 
357     /// Returns an iterator of overlapping matches in the given `haystack`.
358     ///
359     /// Overlapping matches can _only_ be detected using
360     /// `MatchKind::Standard` semantics. If this automaton was constructed with
361     /// leftmost semantics, then this method will panic. To determine whether
362     /// this will panic at runtime, use the
363     /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
364     /// method.
365     ///
366     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
367     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
368     /// `&[u8]` itself.
369     ///
370     /// # Panics
371     ///
372     /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
373     /// That is, this panics when this automaton's match semantics are not
374     /// `MatchKind::Standard`.
375     ///
376     /// # Examples
377     ///
378     /// Basic usage, with standard semantics:
379     ///
380     /// ```
381     /// use aho_corasick::AhoCorasick;
382     ///
383     /// let patterns = &["append", "appendage", "app"];
384     /// let haystack = "append the app to the appendage";
385     ///
386     /// let ac = AhoCorasick::new(patterns);
387     /// let matches: Vec<usize> = ac
388     ///     .find_overlapping_iter(haystack)
389     ///     .map(|mat| mat.pattern())
390     ///     .collect();
391     /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
392     /// ```
find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindOverlappingIter<'a, 'b, S>393     pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
394         &'a self,
395         haystack: &'b B,
396     ) -> FindOverlappingIter<'a, 'b, S> {
397         FindOverlappingIter::new(self, haystack.as_ref())
398     }
399 
400     /// Replace all matches with a corresponding value in the `replace_with`
401     /// slice given. Matches correspond to the same matches as reported by
402     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
403     ///
404     /// Replacements are determined by the index of the matching pattern.
405     /// For example, if the pattern with index `2` is found, then it is
406     /// replaced by `replace_with[2]`.
407     ///
408     /// # Panics
409     ///
410     /// This panics when `replace_with.len()` does not equal the total number
411     /// of patterns that are matched by this automaton.
412     ///
413     /// # Examples
414     ///
415     /// Basic usage:
416     ///
417     /// ```
418     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
419     ///
420     /// let patterns = &["append", "appendage", "app"];
421     /// let haystack = "append the app to the appendage";
422     ///
423     /// let ac = AhoCorasickBuilder::new()
424     ///     .match_kind(MatchKind::LeftmostFirst)
425     ///     .build(patterns);
426     /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
427     /// assert_eq!("x the z to the xage", result);
428     /// ```
replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String where B: AsRef<str>,429     pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
430     where
431         B: AsRef<str>,
432     {
433         assert_eq!(
434             replace_with.len(),
435             self.pattern_count(),
436             "replace_all requires a replacement for every pattern \
437              in the automaton"
438         );
439         let mut dst = String::with_capacity(haystack.len());
440         self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
441             dst.push_str(replace_with[mat.pattern()].as_ref());
442             true
443         });
444         dst
445     }
446 
447     /// Replace all matches using raw bytes with a corresponding value in the
448     /// `replace_with` slice given. Matches correspond to the same matches as
449     /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
450     ///
451     /// Replacements are determined by the index of the matching pattern.
452     /// For example, if the pattern with index `2` is found, then it is
453     /// replaced by `replace_with[2]`.
454     ///
455     /// # Panics
456     ///
457     /// This panics when `replace_with.len()` does not equal the total number
458     /// of patterns that are matched by this automaton.
459     ///
460     /// # Examples
461     ///
462     /// Basic usage:
463     ///
464     /// ```
465     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
466     ///
467     /// let patterns = &["append", "appendage", "app"];
468     /// let haystack = b"append the app to the appendage";
469     ///
470     /// let ac = AhoCorasickBuilder::new()
471     ///     .match_kind(MatchKind::LeftmostFirst)
472     ///     .build(patterns);
473     /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
474     /// assert_eq!(b"x the z to the xage".to_vec(), result);
475     /// ```
replace_all_bytes<B>( &self, haystack: &[u8], replace_with: &[B], ) -> Vec<u8> where B: AsRef<[u8]>,476     pub fn replace_all_bytes<B>(
477         &self,
478         haystack: &[u8],
479         replace_with: &[B],
480     ) -> Vec<u8>
481     where
482         B: AsRef<[u8]>,
483     {
484         assert_eq!(
485             replace_with.len(),
486             self.pattern_count(),
487             "replace_all_bytes requires a replacement for every pattern \
488              in the automaton"
489         );
490         let mut dst = Vec::with_capacity(haystack.len());
491         self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
492             dst.extend(replace_with[mat.pattern()].as_ref());
493             true
494         });
495         dst
496     }
497 
498     /// Replace all matches using a closure called on each match.
499     /// Matches correspond to the same matches as reported by
500     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
501     ///
502     /// The closure accepts three parameters: the match found, the text of
503     /// the match and a string buffer with which to write the replaced text
504     /// (if any). If the closure returns `true`, then it continues to the next
505     /// match. If the closure returns `false`, then searching is stopped.
506     ///
507     /// # Examples
508     ///
509     /// Basic usage:
510     ///
511     /// ```
512     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
513     ///
514     /// let patterns = &["append", "appendage", "app"];
515     /// let haystack = "append the app to the appendage";
516     ///
517     /// let ac = AhoCorasickBuilder::new()
518     ///     .match_kind(MatchKind::LeftmostFirst)
519     ///     .build(patterns);
520     /// let mut result = String::new();
521     /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
522     ///     dst.push_str(&mat.pattern().to_string());
523     ///     true
524     /// });
525     /// assert_eq!("0 the 2 to the 0age", result);
526     /// ```
527     ///
528     /// Stopping the replacement by returning `false` (continued from the
529     /// example above):
530     ///
531     /// ```
532     /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
533     /// # let patterns = &["append", "appendage", "app"];
534     /// # let haystack = "append the app to the appendage";
535     /// # let ac = AhoCorasickBuilder::new()
536     /// #    .match_kind(MatchKind::LeftmostFirst)
537     /// #    .build(patterns);
538     /// let mut result = String::new();
539     /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
540     ///     dst.push_str(&mat.pattern().to_string());
541     ///     mat.pattern() != 2
542     /// });
543     /// assert_eq!("0 the 2 to the appendage", result);
544     /// ```
replace_all_with<F>( &self, haystack: &str, dst: &mut String, mut replace_with: F, ) where F: FnMut(&Match, &str, &mut String) -> bool,545     pub fn replace_all_with<F>(
546         &self,
547         haystack: &str,
548         dst: &mut String,
549         mut replace_with: F,
550     ) where
551         F: FnMut(&Match, &str, &mut String) -> bool,
552     {
553         let mut last_match = 0;
554         for mat in self.find_iter(haystack) {
555             dst.push_str(&haystack[last_match..mat.start()]);
556             last_match = mat.end();
557             if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
558                 break;
559             };
560         }
561         dst.push_str(&haystack[last_match..]);
562     }
563 
564     /// Replace all matches using raw bytes with a closure called on each
565     /// match. Matches correspond to the same matches as reported by
566     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
567     ///
568     /// The closure accepts three parameters: the match found, the text of
569     /// the match and a byte buffer with which to write the replaced text
570     /// (if any). If the closure returns `true`, then it continues to the next
571     /// match. If the closure returns `false`, then searching is stopped.
572     ///
573     /// # Examples
574     ///
575     /// Basic usage:
576     ///
577     /// ```
578     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
579     ///
580     /// let patterns = &["append", "appendage", "app"];
581     /// let haystack = b"append the app to the appendage";
582     ///
583     /// let ac = AhoCorasickBuilder::new()
584     ///     .match_kind(MatchKind::LeftmostFirst)
585     ///     .build(patterns);
586     /// let mut result = vec![];
587     /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
588     ///     dst.extend(mat.pattern().to_string().bytes());
589     ///     true
590     /// });
591     /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
592     /// ```
593     ///
594     /// Stopping the replacement by returning `false` (continued from the
595     /// example above):
596     ///
597     /// ```
598     /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
599     /// # let patterns = &["append", "appendage", "app"];
600     /// # let haystack = b"append the app to the appendage";
601     /// # let ac = AhoCorasickBuilder::new()
602     /// #    .match_kind(MatchKind::LeftmostFirst)
603     /// #    .build(patterns);
604     /// let mut result = vec![];
605     /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
606     ///     dst.extend(mat.pattern().to_string().bytes());
607     ///     mat.pattern() != 2
608     /// });
609     /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
610     /// ```
replace_all_with_bytes<F>( &self, haystack: &[u8], dst: &mut Vec<u8>, mut replace_with: F, ) where F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,611     pub fn replace_all_with_bytes<F>(
612         &self,
613         haystack: &[u8],
614         dst: &mut Vec<u8>,
615         mut replace_with: F,
616     ) where
617         F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
618     {
619         let mut last_match = 0;
620         for mat in self.find_iter(haystack) {
621             dst.extend(&haystack[last_match..mat.start()]);
622             last_match = mat.end();
623             if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
624                 break;
625             };
626         }
627         dst.extend(&haystack[last_match..]);
628     }
629 
630     /// Returns an iterator of non-overlapping matches in the given
631     /// stream. Matches correspond to the same matches as reported by
632     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
633     ///
634     /// The matches yielded by this iterator use absolute position offsets in
635     /// the stream given, where the first byte has index `0`. Matches are
636     /// yieled until the stream is exhausted.
637     ///
638     /// Each item yielded by the iterator is an `io::Result<Match>`, where an
639     /// error is yielded if there was a problem reading from the reader given.
640     ///
641     /// When searching a stream, an internal buffer is used. Therefore, callers
642     /// should avoiding providing a buffered reader, if possible.
643     ///
644     /// Searching a stream requires that the automaton was built with
645     /// `MatchKind::Standard` semantics. If this automaton was constructed
646     /// with leftmost semantics, then this method will panic. To determine
647     /// whether this will panic at runtime, use the
648     /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
649     /// method.
650     ///
651     /// # Memory usage
652     ///
653     /// In general, searching streams will use a constant amount of memory for
654     /// its internal buffer. The one requirement is that the internal buffer
655     /// must be at least the size of the longest possible match. In most use
656     /// cases, the default buffer size will be much larger than any individual
657     /// match.
658     ///
659     /// # Panics
660     ///
661     /// This panics when `AhoCorasick::supports_stream` returns `false`.
662     /// That is, this panics when this automaton's match semantics are not
663     /// `MatchKind::Standard`. This restriction may be lifted in the future.
664     ///
665     /// # Examples
666     ///
667     /// Basic usage:
668     ///
669     /// ```
670     /// use aho_corasick::AhoCorasick;
671     ///
672     /// # fn example() -> Result<(), ::std::io::Error> {
673     /// let patterns = &["append", "appendage", "app"];
674     /// let haystack = "append the app to the appendage";
675     ///
676     /// let ac = AhoCorasick::new(patterns);
677     /// let mut matches = vec![];
678     /// for result in ac.stream_find_iter(haystack.as_bytes()) {
679     ///     let mat = result?;
680     ///     matches.push(mat.pattern());
681     /// }
682     /// assert_eq!(vec![2, 2, 2], matches);
683     /// # Ok(()) }; example().unwrap()
684     /// ```
stream_find_iter<'a, R: io::Read>( &'a self, rdr: R, ) -> StreamFindIter<'a, R, S>685     pub fn stream_find_iter<'a, R: io::Read>(
686         &'a self,
687         rdr: R,
688     ) -> StreamFindIter<'a, R, S> {
689         StreamFindIter::new(self, rdr)
690     }
691 
692     /// Search for and replace all matches of this automaton in
693     /// the given reader, and write the replacements to the given
694     /// writer. Matches correspond to the same matches as reported by
695     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
696     ///
697     /// Replacements are determined by the index of the matching pattern.
698     /// For example, if the pattern with index `2` is found, then it is
699     /// replaced by `replace_with[2]`.
700     ///
701     /// After all matches are replaced, the writer is _not_ flushed.
702     ///
703     /// If there was a problem reading from the given reader or writing to the
704     /// given writer, then the corresponding `io::Error` is returned and all
705     /// replacement is stopped.
706     ///
707     /// When searching a stream, an internal buffer is used. Therefore, callers
708     /// should avoiding providing a buffered reader, if possible. However,
709     /// callers may want to provide a buffered writer.
710     ///
711     /// Searching a stream requires that the automaton was built with
712     /// `MatchKind::Standard` semantics. If this automaton was constructed
713     /// with leftmost semantics, then this method will panic. To determine
714     /// whether this will panic at runtime, use the
715     /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
716     /// method.
717     ///
718     /// # Memory usage
719     ///
720     /// In general, searching streams will use a constant amount of memory for
721     /// its internal buffer. The one requirement is that the internal buffer
722     /// must be at least the size of the longest possible match. In most use
723     /// cases, the default buffer size will be much larger than any individual
724     /// match.
725     ///
726     /// # Panics
727     ///
728     /// This panics when `AhoCorasick::supports_stream` returns `false`.
729     /// That is, this panics when this automaton's match semantics are not
730     /// `MatchKind::Standard`. This restriction may be lifted in the future.
731     ///
732     /// # Examples
733     ///
734     /// Basic usage:
735     ///
736     /// ```
737     /// use aho_corasick::AhoCorasick;
738     ///
739     /// # fn example() -> Result<(), ::std::io::Error> {
740     /// let patterns = &["fox", "brown", "quick"];
741     /// let haystack = "The quick brown fox.";
742     /// let replace_with = &["sloth", "grey", "slow"];
743     ///
744     /// let ac = AhoCorasick::new(patterns);
745     /// let mut result = vec![];
746     /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
747     /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
748     /// # Ok(()) }; example().unwrap()
749     /// ```
stream_replace_all<R, W, B>( &self, rdr: R, wtr: W, replace_with: &[B], ) -> io::Result<()> where R: io::Read, W: io::Write, B: AsRef<[u8]>,750     pub fn stream_replace_all<R, W, B>(
751         &self,
752         rdr: R,
753         wtr: W,
754         replace_with: &[B],
755     ) -> io::Result<()>
756     where
757         R: io::Read,
758         W: io::Write,
759         B: AsRef<[u8]>,
760     {
761         assert_eq!(
762             replace_with.len(),
763             self.pattern_count(),
764             "stream_replace_all requires a replacement for every pattern \
765              in the automaton"
766         );
767         self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
768             wtr.write_all(replace_with[mat.pattern()].as_ref())
769         })
770     }
771 
772     /// Search the given reader and replace all matches of this automaton
773     /// using the given closure. The result is written to the given
774     /// writer. Matches correspond to the same matches as reported by
775     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
776     ///
777     /// The closure accepts three parameters: the match found, the text of
778     /// the match and the writer with which to write the replaced text (if any).
779     ///
780     /// After all matches are replaced, the writer is _not_ flushed.
781     ///
782     /// If there was a problem reading from the given reader or writing to the
783     /// given writer, then the corresponding `io::Error` is returned and all
784     /// replacement is stopped.
785     ///
786     /// When searching a stream, an internal buffer is used. Therefore, callers
787     /// should avoiding providing a buffered reader, if possible. However,
788     /// callers may want to provide a buffered writer.
789     ///
790     /// Searching a stream requires that the automaton was built with
791     /// `MatchKind::Standard` semantics. If this automaton was constructed
792     /// with leftmost semantics, then this method will panic. To determine
793     /// whether this will panic at runtime, use the
794     /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
795     /// method.
796     ///
797     /// # Memory usage
798     ///
799     /// In general, searching streams will use a constant amount of memory for
800     /// its internal buffer. The one requirement is that the internal buffer
801     /// must be at least the size of the longest possible match. In most use
802     /// cases, the default buffer size will be much larger than any individual
803     /// match.
804     ///
805     /// # Panics
806     ///
807     /// This panics when `AhoCorasick::supports_stream` returns `false`.
808     /// That is, this panics when this automaton's match semantics are not
809     /// `MatchKind::Standard`. This restriction may be lifted in the future.
810     ///
811     /// # Examples
812     ///
813     /// Basic usage:
814     ///
815     /// ```
816     /// use std::io::Write;
817     /// use aho_corasick::AhoCorasick;
818     ///
819     /// # fn example() -> Result<(), ::std::io::Error> {
820     /// let patterns = &["fox", "brown", "quick"];
821     /// let haystack = "The quick brown fox.";
822     ///
823     /// let ac = AhoCorasick::new(patterns);
824     /// let mut result = vec![];
825     /// ac.stream_replace_all_with(
826     ///     haystack.as_bytes(),
827     ///     &mut result,
828     ///     |mat, _, wtr| {
829     ///         wtr.write_all(mat.pattern().to_string().as_bytes())
830     ///     },
831     /// )?;
832     /// assert_eq!(b"The 2 1 0.".to_vec(), result);
833     /// # Ok(()) }; example().unwrap()
834     /// ```
stream_replace_all_with<R, W, F>( &self, rdr: R, mut wtr: W, mut replace_with: F, ) -> io::Result<()> where R: io::Read, W: io::Write, F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,835     pub fn stream_replace_all_with<R, W, F>(
836         &self,
837         rdr: R,
838         mut wtr: W,
839         mut replace_with: F,
840     ) -> io::Result<()>
841     where
842         R: io::Read,
843         W: io::Write,
844         F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
845     {
846         let mut it = StreamChunkIter::new(self, rdr);
847         while let Some(result) = it.next() {
848             let chunk = result?;
849             match chunk {
850                 StreamChunk::NonMatch { bytes, .. } => {
851                     wtr.write_all(bytes)?;
852                 }
853                 StreamChunk::Match { bytes, mat } => {
854                     replace_with(&mat, bytes, &mut wtr)?;
855                 }
856             }
857         }
858         Ok(())
859     }
860 
861     /// Returns the match kind used by this automaton.
862     ///
863     /// # Examples
864     ///
865     /// Basic usage:
866     ///
867     /// ```
868     /// use aho_corasick::{AhoCorasick, MatchKind};
869     ///
870     /// let ac = AhoCorasick::new(&[
871     ///     "foo", "bar", "quux", "baz",
872     /// ]);
873     /// assert_eq!(&MatchKind::Standard, ac.match_kind());
874     /// ```
match_kind(&self) -> &MatchKind875     pub fn match_kind(&self) -> &MatchKind {
876         self.imp.match_kind()
877     }
878 
879     /// Returns the length of the longest pattern matched by this automaton.
880     ///
881     /// # Examples
882     ///
883     /// Basic usage:
884     ///
885     /// ```
886     /// use aho_corasick::AhoCorasick;
887     ///
888     /// let ac = AhoCorasick::new(&[
889     ///     "foo", "bar", "quux", "baz",
890     /// ]);
891     /// assert_eq!(4, ac.max_pattern_len());
892     /// ```
max_pattern_len(&self) -> usize893     pub fn max_pattern_len(&self) -> usize {
894         self.imp.max_pattern_len()
895     }
896 
897     /// Return the total number of patterns matched by this automaton.
898     ///
899     /// This includes patterns that may never participate in a match. For
900     /// example, if
901     /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
902     /// match semantics are used, and the patterns `Sam` and `Samwise` were
903     /// used to build the automaton, then `Samwise` can never participate in a
904     /// match because `Sam` will always take priority.
905     ///
906     /// # Examples
907     ///
908     /// Basic usage:
909     ///
910     /// ```
911     /// use aho_corasick::AhoCorasick;
912     ///
913     /// let ac = AhoCorasick::new(&[
914     ///     "foo", "bar", "baz",
915     /// ]);
916     /// assert_eq!(3, ac.pattern_count());
917     /// ```
pattern_count(&self) -> usize918     pub fn pattern_count(&self) -> usize {
919         self.imp.pattern_count()
920     }
921 
922     /// Returns true if and only if this automaton supports reporting
923     /// overlapping matches.
924     ///
925     /// If this returns false and overlapping matches are requested, then it
926     /// will result in a panic.
927     ///
928     /// Since leftmost matching is inherently incompatible with overlapping
929     /// matches, only
930     /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
931     /// supports overlapping matches. This is unlikely to change in the future.
932     ///
933     /// # Examples
934     ///
935     /// Basic usage:
936     ///
937     /// ```
938     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
939     ///
940     /// let ac = AhoCorasickBuilder::new()
941     ///     .match_kind(MatchKind::Standard)
942     ///     .build(&["foo", "bar", "baz"]);
943     /// assert!(ac.supports_overlapping());
944     ///
945     /// let ac = AhoCorasickBuilder::new()
946     ///     .match_kind(MatchKind::LeftmostFirst)
947     ///     .build(&["foo", "bar", "baz"]);
948     /// assert!(!ac.supports_overlapping());
949     /// ```
supports_overlapping(&self) -> bool950     pub fn supports_overlapping(&self) -> bool {
951         self.match_kind.supports_overlapping()
952     }
953 
954     /// Returns true if and only if this automaton supports stream searching.
955     ///
956     /// If this returns false and stream searching (or replacing) is attempted,
957     /// then it will result in a panic.
958     ///
959     /// Currently, only
960     /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
961     /// supports streaming. This may be expanded in the future.
962     ///
963     /// # Examples
964     ///
965     /// Basic usage:
966     ///
967     /// ```
968     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
969     ///
970     /// let ac = AhoCorasickBuilder::new()
971     ///     .match_kind(MatchKind::Standard)
972     ///     .build(&["foo", "bar", "baz"]);
973     /// assert!(ac.supports_stream());
974     ///
975     /// let ac = AhoCorasickBuilder::new()
976     ///     .match_kind(MatchKind::LeftmostFirst)
977     ///     .build(&["foo", "bar", "baz"]);
978     /// assert!(!ac.supports_stream());
979     /// ```
supports_stream(&self) -> bool980     pub fn supports_stream(&self) -> bool {
981         self.match_kind.supports_stream()
982     }
983 
984     /// Returns the approximate total amount of heap used by this automaton, in
985     /// units of bytes.
986     ///
987     /// # Examples
988     ///
989     /// This example shows the difference in heap usage between a few
990     /// configurations:
991     ///
992     /// ```ignore
993     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
994     ///
995     /// let ac = AhoCorasickBuilder::new()
996     ///     .dfa(false) // default
997     ///     .build(&["foo", "bar", "baz"]);
998     /// assert_eq!(10_336, ac.heap_bytes());
999     ///
1000     /// let ac = AhoCorasickBuilder::new()
1001     ///     .dfa(false) // default
1002     ///     .ascii_case_insensitive(true)
1003     ///     .build(&["foo", "bar", "baz"]);
1004     /// assert_eq!(10_384, ac.heap_bytes());
1005     ///
1006     /// let ac = AhoCorasickBuilder::new()
1007     ///     .dfa(true)
1008     ///     .ascii_case_insensitive(true)
1009     ///     .build(&["foo", "bar", "baz"]);
1010     /// assert_eq!(1_248, ac.heap_bytes());
1011     /// ```
heap_bytes(&self) -> usize1012     pub fn heap_bytes(&self) -> usize {
1013         match self.imp {
1014             Imp::NFA(ref nfa) => nfa.heap_bytes(),
1015             Imp::DFA(ref dfa) => dfa.heap_bytes(),
1016         }
1017     }
1018 }
1019 
1020 /// The internal implementation of Aho-Corasick, which is either an NFA or
1021 /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
1022 /// more memory.
1023 #[derive(Clone, Debug)]
1024 enum Imp<S: StateID> {
1025     NFA(NFA<S>),
1026     DFA(DFA<S>),
1027 }
1028 
1029 impl<S: StateID> Imp<S> {
1030     /// Returns the type of match semantics implemented by this automaton.
match_kind(&self) -> &MatchKind1031     fn match_kind(&self) -> &MatchKind {
1032         match *self {
1033             Imp::NFA(ref nfa) => nfa.match_kind(),
1034             Imp::DFA(ref dfa) => dfa.match_kind(),
1035         }
1036     }
1037 
1038     /// Returns the identifier of the start state.
start_state(&self) -> S1039     fn start_state(&self) -> S {
1040         match *self {
1041             Imp::NFA(ref nfa) => nfa.start_state(),
1042             Imp::DFA(ref dfa) => dfa.start_state(),
1043         }
1044     }
1045 
1046     /// The length, in bytes, of the longest pattern in this automaton. This
1047     /// information is useful for maintaining correct buffer sizes when
1048     /// searching on streams.
max_pattern_len(&self) -> usize1049     fn max_pattern_len(&self) -> usize {
1050         match *self {
1051             Imp::NFA(ref nfa) => nfa.max_pattern_len(),
1052             Imp::DFA(ref dfa) => dfa.max_pattern_len(),
1053         }
1054     }
1055 
1056     /// The total number of patterns added to this automaton. This includes
1057     /// patterns that may never match. The maximum matching pattern that can be
1058     /// reported is exactly one less than this number.
pattern_count(&self) -> usize1059     fn pattern_count(&self) -> usize {
1060         match *self {
1061             Imp::NFA(ref nfa) => nfa.pattern_count(),
1062             Imp::DFA(ref dfa) => dfa.pattern_count(),
1063         }
1064     }
1065 
1066     /// Returns the prefilter object, if one exists, for the underlying
1067     /// automaton.
prefilter(&self) -> Option<&dyn Prefilter>1068     fn prefilter(&self) -> Option<&dyn Prefilter> {
1069         match *self {
1070             Imp::NFA(ref nfa) => nfa.prefilter(),
1071             Imp::DFA(ref dfa) => dfa.prefilter(),
1072         }
1073     }
1074 
1075     /// Returns true if and only if we should attempt to use a prefilter.
use_prefilter(&self) -> bool1076     fn use_prefilter(&self) -> bool {
1077         let p = match self.prefilter() {
1078             None => return false,
1079             Some(p) => p,
1080         };
1081         !p.looks_for_non_start_of_match()
1082     }
1083 
1084     #[inline(always)]
overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, match_index: &mut usize, ) -> Option<Match>1085     fn overlapping_find_at(
1086         &self,
1087         prestate: &mut PrefilterState,
1088         haystack: &[u8],
1089         at: usize,
1090         state_id: &mut S,
1091         match_index: &mut usize,
1092     ) -> Option<Match> {
1093         match *self {
1094             Imp::NFA(ref nfa) => nfa.overlapping_find_at(
1095                 prestate,
1096                 haystack,
1097                 at,
1098                 state_id,
1099                 match_index,
1100             ),
1101             Imp::DFA(ref dfa) => dfa.overlapping_find_at(
1102                 prestate,
1103                 haystack,
1104                 at,
1105                 state_id,
1106                 match_index,
1107             ),
1108         }
1109     }
1110 
1111     #[inline(always)]
earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, ) -> Option<Match>1112     fn earliest_find_at(
1113         &self,
1114         prestate: &mut PrefilterState,
1115         haystack: &[u8],
1116         at: usize,
1117         state_id: &mut S,
1118     ) -> Option<Match> {
1119         match *self {
1120             Imp::NFA(ref nfa) => {
1121                 nfa.earliest_find_at(prestate, haystack, at, state_id)
1122             }
1123             Imp::DFA(ref dfa) => {
1124                 dfa.earliest_find_at(prestate, haystack, at, state_id)
1125             }
1126         }
1127     }
1128 
1129     #[inline(always)]
find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>1130     fn find_at_no_state(
1131         &self,
1132         prestate: &mut PrefilterState,
1133         haystack: &[u8],
1134         at: usize,
1135     ) -> Option<Match> {
1136         match *self {
1137             Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
1138             Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
1139         }
1140     }
1141 }
1142 
1143 /// An iterator of non-overlapping matches in a particular haystack.
1144 ///
1145 /// This iterator yields matches according to the
1146 /// [`MatchKind`](enum.MatchKind.html)
1147 /// used by this automaton.
1148 ///
1149 /// This iterator is constructed via the
1150 /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
1151 /// method.
1152 ///
1153 /// The type variable `S` refers to the representation used for state
1154 /// identifiers. (By default, this is `usize`.)
1155 ///
1156 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1157 ///
1158 /// The lifetime `'b` refers to the lifetime of the haystack being searched.
1159 #[derive(Debug)]
1160 pub struct FindIter<'a, 'b, S: StateID> {
1161     fsm: &'a Imp<S>,
1162     prestate: PrefilterState,
1163     haystack: &'b [u8],
1164     pos: usize,
1165 }
1166 
1167 impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S>1168     fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
1169         let prestate = PrefilterState::new(ac.max_pattern_len());
1170         FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
1171     }
1172 }
1173 
1174 impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
1175     type Item = Match;
1176 
next(&mut self) -> Option<Match>1177     fn next(&mut self) -> Option<Match> {
1178         if self.pos > self.haystack.len() {
1179             return None;
1180         }
1181         let result = self.fsm.find_at_no_state(
1182             &mut self.prestate,
1183             self.haystack,
1184             self.pos,
1185         );
1186         let mat = match result {
1187             None => return None,
1188             Some(mat) => mat,
1189         };
1190         if mat.end() == self.pos {
1191             // If the automaton can match the empty string and if we found an
1192             // empty match, then we need to forcefully move the position.
1193             self.pos += 1;
1194         } else {
1195             self.pos = mat.end();
1196         }
1197         Some(mat)
1198     }
1199 }
1200 
1201 /// An iterator of overlapping matches in a particular haystack.
1202 ///
1203 /// This iterator will report all possible matches in a particular haystack,
1204 /// even when the matches overlap.
1205 ///
1206 /// This iterator is constructed via the
1207 /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
1208 /// method.
1209 ///
1210 /// The type variable `S` refers to the representation used for state
1211 /// identifiers. (By default, this is `usize`.)
1212 ///
1213 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1214 ///
1215 /// The lifetime `'b` refers to the lifetime of the haystack being searched.
1216 #[derive(Debug)]
1217 pub struct FindOverlappingIter<'a, 'b, S: StateID> {
1218     fsm: &'a Imp<S>,
1219     prestate: PrefilterState,
1220     haystack: &'b [u8],
1221     pos: usize,
1222     state_id: S,
1223     match_index: usize,
1224 }
1225 
1226 impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
new( ac: &'a AhoCorasick<S>, haystack: &'b [u8], ) -> FindOverlappingIter<'a, 'b, S>1227     fn new(
1228         ac: &'a AhoCorasick<S>,
1229         haystack: &'b [u8],
1230     ) -> FindOverlappingIter<'a, 'b, S> {
1231         assert!(
1232             ac.supports_overlapping(),
1233             "automaton does not support overlapping searches"
1234         );
1235         let prestate = PrefilterState::new(ac.max_pattern_len());
1236         FindOverlappingIter {
1237             fsm: &ac.imp,
1238             prestate,
1239             haystack,
1240             pos: 0,
1241             state_id: ac.imp.start_state(),
1242             match_index: 0,
1243         }
1244     }
1245 }
1246 
1247 impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
1248     type Item = Match;
1249 
next(&mut self) -> Option<Match>1250     fn next(&mut self) -> Option<Match> {
1251         let result = self.fsm.overlapping_find_at(
1252             &mut self.prestate,
1253             self.haystack,
1254             self.pos,
1255             &mut self.state_id,
1256             &mut self.match_index,
1257         );
1258         match result {
1259             None => return None,
1260             Some(m) => {
1261                 self.pos = m.end();
1262                 Some(m)
1263             }
1264         }
1265     }
1266 }
1267 
1268 /// An iterator that reports Aho-Corasick matches in a stream.
1269 ///
1270 /// This iterator yields elements of type `io::Result<Match>`, where an error
1271 /// is reported if there was a problem reading from the underlying stream.
1272 /// The iterator terminates only when the underlying stream reaches `EOF`.
1273 ///
1274 /// This iterator is constructed via the
1275 /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
1276 /// method.
1277 ///
1278 /// The type variable `R` refers to the `io::Read` stream that is being read
1279 /// from.
1280 ///
1281 /// The type variable `S` refers to the representation used for state
1282 /// identifiers. (By default, this is `usize`.)
1283 ///
1284 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1285 #[derive(Debug)]
1286 pub struct StreamFindIter<'a, R, S: StateID> {
1287     it: StreamChunkIter<'a, R, S>,
1288 }
1289 
1290 impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S>1291     fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
1292         StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
1293     }
1294 }
1295 
1296 impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
1297     type Item = io::Result<Match>;
1298 
next(&mut self) -> Option<io::Result<Match>>1299     fn next(&mut self) -> Option<io::Result<Match>> {
1300         loop {
1301             match self.it.next() {
1302                 None => return None,
1303                 Some(Err(err)) => return Some(Err(err)),
1304                 Some(Ok(StreamChunk::NonMatch { .. })) => {}
1305                 Some(Ok(StreamChunk::Match { mat, .. })) => {
1306                     return Some(Ok(mat));
1307                 }
1308             }
1309         }
1310     }
1311 }
1312 
1313 /// An iterator over chunks in an underlying reader. Each chunk either
1314 /// corresponds to non-matching bytes or matching bytes, but all bytes from
1315 /// the underlying reader are reported in sequence. There may be an arbitrary
1316 /// number of non-matching chunks before seeing a matching chunk.
1317 ///
1318 /// N.B. This does not actually implement Iterator because we need to borrow
1319 /// from the underlying reader. But conceptually, it's still an iterator.
1320 #[derive(Debug)]
1321 struct StreamChunkIter<'a, R, S: StateID> {
1322     /// The AC automaton.
1323     fsm: &'a Imp<S>,
1324     /// State associated with this automaton's prefilter. It is a heuristic
1325     /// for stopping the prefilter if it's deemed ineffective.
1326     prestate: PrefilterState,
1327     /// The source of bytes we read from.
1328     rdr: R,
1329     /// A fixed size buffer. This is what we actually search. There are some
1330     /// invariants around the buffer's size, namely, it must be big enough to
1331     /// contain the longest possible match.
1332     buf: Buffer,
1333     /// The ID of the FSM state we're currently in.
1334     state_id: S,
1335     /// The current position at which to start the next search in `buf`.
1336     search_pos: usize,
1337     /// The absolute position of `search_pos`, where `0` corresponds to the
1338     /// position of the first byte read from `rdr`.
1339     absolute_pos: usize,
1340     /// The ending position of the last StreamChunk that was returned to the
1341     /// caller. This position is used to determine whether we need to emit
1342     /// non-matching bytes before emitting a match.
1343     report_pos: usize,
1344     /// A match that should be reported on the next call.
1345     pending_match: Option<Match>,
1346     /// Enabled only when the automaton can match the empty string. When
1347     /// enabled, we need to execute one final search after consuming the
1348     /// reader to find the trailing empty match.
1349     has_empty_match_at_end: bool,
1350 }
1351 
1352 /// A single chunk yielded by the stream chunk iterator.
1353 ///
1354 /// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
1355 #[derive(Debug)]
1356 enum StreamChunk<'r> {
1357     /// A chunk that does not contain any matches.
1358     NonMatch { bytes: &'r [u8] },
1359     /// A chunk that precisely contains a match.
1360     Match { bytes: &'r [u8], mat: Match },
1361 }
1362 
1363 impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S>1364     fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
1365         assert!(
1366             ac.supports_stream(),
1367             "stream searching is only supported for Standard match semantics"
1368         );
1369 
1370         let prestate = if ac.imp.use_prefilter() {
1371             PrefilterState::new(ac.max_pattern_len())
1372         } else {
1373             PrefilterState::disabled()
1374         };
1375         let buf = Buffer::new(ac.imp.max_pattern_len());
1376         let state_id = ac.imp.start_state();
1377         StreamChunkIter {
1378             fsm: &ac.imp,
1379             prestate,
1380             rdr,
1381             buf,
1382             state_id,
1383             absolute_pos: 0,
1384             report_pos: 0,
1385             search_pos: 0,
1386             pending_match: None,
1387             has_empty_match_at_end: ac.is_match(""),
1388         }
1389     }
1390 
next(&mut self) -> Option<io::Result<StreamChunk>>1391     fn next(&mut self) -> Option<io::Result<StreamChunk>> {
1392         loop {
1393             if let Some(mut mat) = self.pending_match.take() {
1394                 let bytes = &self.buf.buffer()[mat.start()..mat.end()];
1395                 self.report_pos = mat.end();
1396                 mat = mat.increment(self.absolute_pos);
1397                 return Some(Ok(StreamChunk::Match { bytes, mat }));
1398             }
1399             if self.search_pos >= self.buf.len() {
1400                 if let Some(end) = self.unreported() {
1401                     let bytes = &self.buf.buffer()[self.report_pos..end];
1402                     self.report_pos = end;
1403                     return Some(Ok(StreamChunk::NonMatch { bytes }));
1404                 }
1405                 if self.buf.len() >= self.buf.min_buffer_len() {
1406                     // This is the point at which we roll our buffer, which we
1407                     // only do if our buffer has at least the minimum amount of
1408                     // bytes in it. Before rolling, we update our various
1409                     // positions to be consistent with the buffer after it has
1410                     // been rolled.
1411 
1412                     self.report_pos -=
1413                         self.buf.len() - self.buf.min_buffer_len();
1414                     self.absolute_pos +=
1415                         self.search_pos - self.buf.min_buffer_len();
1416                     self.search_pos = self.buf.min_buffer_len();
1417                     self.buf.roll();
1418                 }
1419                 match self.buf.fill(&mut self.rdr) {
1420                     Err(err) => return Some(Err(err)),
1421                     Ok(false) => {
1422                         // We've hit EOF, but if there are still some
1423                         // unreported bytes remaining, return them now.
1424                         if self.report_pos < self.buf.len() {
1425                             let bytes = &self.buf.buffer()[self.report_pos..];
1426                             self.report_pos = self.buf.len();
1427 
1428                             let chunk = StreamChunk::NonMatch { bytes };
1429                             return Some(Ok(chunk));
1430                         } else {
1431                             // We've reported everything, but there might still
1432                             // be a match at the very last position.
1433                             if !self.has_empty_match_at_end {
1434                                 return None;
1435                             }
1436                             // fallthrough for another search to get trailing
1437                             // empty matches
1438                             self.has_empty_match_at_end = false;
1439                         }
1440                     }
1441                     Ok(true) => {}
1442                 }
1443             }
1444             let result = self.fsm.earliest_find_at(
1445                 &mut self.prestate,
1446                 self.buf.buffer(),
1447                 self.search_pos,
1448                 &mut self.state_id,
1449             );
1450             match result {
1451                 None => {
1452                     self.search_pos = self.buf.len();
1453                 }
1454                 Some(mat) => {
1455                     self.state_id = self.fsm.start_state();
1456                     if mat.end() == self.search_pos {
1457                         // If the automaton can match the empty string and if
1458                         // we found an empty match, then we need to forcefully
1459                         // move the position.
1460                         self.search_pos += 1;
1461                     } else {
1462                         self.search_pos = mat.end();
1463                     }
1464                     self.pending_match = Some(mat.clone());
1465                     if self.report_pos < mat.start() {
1466                         let bytes =
1467                             &self.buf.buffer()[self.report_pos..mat.start()];
1468                         self.report_pos = mat.start();
1469 
1470                         let chunk = StreamChunk::NonMatch { bytes };
1471                         return Some(Ok(chunk));
1472                     }
1473                 }
1474             }
1475         }
1476     }
1477 
unreported(&self) -> Option<usize>1478     fn unreported(&self) -> Option<usize> {
1479         let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
1480         if self.report_pos < end {
1481             Some(end)
1482         } else {
1483             None
1484         }
1485     }
1486 }
1487 
1488 /// A builder for configuring an Aho-Corasick automaton.
1489 #[derive(Clone, Debug)]
1490 pub struct AhoCorasickBuilder {
1491     nfa_builder: nfa::Builder,
1492     dfa_builder: dfa::Builder,
1493     dfa: bool,
1494 }
1495 
1496 impl Default for AhoCorasickBuilder {
default() -> AhoCorasickBuilder1497     fn default() -> AhoCorasickBuilder {
1498         AhoCorasickBuilder::new()
1499     }
1500 }
1501 
1502 impl AhoCorasickBuilder {
1503     /// Create a new builder for configuring an Aho-Corasick automaton.
1504     ///
1505     /// If you don't need fine grained configuration or aren't sure which knobs
1506     /// to set, try using
1507     /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
1508     /// instead.
new() -> AhoCorasickBuilder1509     pub fn new() -> AhoCorasickBuilder {
1510         AhoCorasickBuilder {
1511             nfa_builder: nfa::Builder::new(),
1512             dfa_builder: dfa::Builder::new(),
1513             dfa: false,
1514         }
1515     }
1516 
1517     /// Build an Aho-Corasick automaton using the configuration set on this
1518     /// builder.
1519     ///
1520     /// A builder may be reused to create more automatons.
1521     ///
1522     /// This method will use the default for representing internal state
1523     /// identifiers, which is `usize`. This guarantees that building the
1524     /// automaton will succeed and is generally a good default, but can make
1525     /// the size of the automaton 2-8 times bigger than it needs to be,
1526     /// depending on your target platform.
1527     ///
1528     /// # Examples
1529     ///
1530     /// Basic usage:
1531     ///
1532     /// ```
1533     /// use aho_corasick::AhoCorasickBuilder;
1534     ///
1535     /// let patterns = &["foo", "bar", "baz"];
1536     /// let ac = AhoCorasickBuilder::new()
1537     ///     .build(patterns);
1538     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1539     /// ```
build<I, P>(&self, patterns: I) -> AhoCorasick where I: IntoIterator<Item = P>, P: AsRef<[u8]>,1540     pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
1541     where
1542         I: IntoIterator<Item = P>,
1543         P: AsRef<[u8]>,
1544     {
1545         // The builder only returns an error if the chosen state ID
1546         // representation is too small to fit all of the given patterns. In
1547         // this case, since we fix the representation to usize, it will always
1548         // work because it's impossible to overflow usize since the underlying
1549         // storage would OOM long before that happens.
1550         self.build_with_size::<usize, I, P>(patterns)
1551             .expect("usize state ID type should always work")
1552     }
1553 
1554     /// Build an Aho-Corasick automaton using the configuration set on this
1555     /// builder with a specific state identifier representation. This only has
1556     /// an effect when the `dfa` option is enabled.
1557     ///
1558     /// Generally, the choices for a state identifier representation are
1559     /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
1560     /// The advantage of choosing a smaller state identifier representation
1561     /// is that the automaton produced will be smaller. This might be
1562     /// beneficial for just generally using less space, or might even allow it
1563     /// to fit more of the automaton in your CPU's cache, leading to overall
1564     /// better search performance.
1565     ///
1566     /// Unlike the standard `build` method, this can report an error if the
1567     /// state identifier representation cannot support the size of the
1568     /// automaton.
1569     ///
1570     /// Note that the state identifier representation is determined by the
1571     /// `S` type variable. This requires a type hint of some sort, either
1572     /// by specifying the return type or using the turbofish, e.g.,
1573     /// `build_with_size::<u16, _, _>(...)`.
1574     ///
1575     /// # Examples
1576     ///
1577     /// Basic usage:
1578     ///
1579     /// ```
1580     /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1581     ///
1582     /// # fn example() -> Result<(), ::aho_corasick::Error> {
1583     /// let patterns = &["foo", "bar", "baz"];
1584     /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
1585     ///     .build_with_size(patterns)?;
1586     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1587     /// # Ok(()) }; example().unwrap()
1588     /// ```
1589     ///
1590     /// Or alternatively, with turbofish:
1591     ///
1592     /// ```
1593     /// use aho_corasick::AhoCorasickBuilder;
1594     ///
1595     /// # fn example() -> Result<(), ::aho_corasick::Error> {
1596     /// let patterns = &["foo", "bar", "baz"];
1597     /// let ac = AhoCorasickBuilder::new()
1598     ///     .build_with_size::<u8, _, _>(patterns)?;
1599     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1600     /// # Ok(()) }; example().unwrap()
1601     /// ```
build_with_size<S, I, P>( &self, patterns: I, ) -> Result<AhoCorasick<S>> where S: StateID, I: IntoIterator<Item = P>, P: AsRef<[u8]>,1602     pub fn build_with_size<S, I, P>(
1603         &self,
1604         patterns: I,
1605     ) -> Result<AhoCorasick<S>>
1606     where
1607         S: StateID,
1608         I: IntoIterator<Item = P>,
1609         P: AsRef<[u8]>,
1610     {
1611         let nfa = self.nfa_builder.build(patterns)?;
1612         let match_kind = nfa.match_kind().clone();
1613         let imp = if self.dfa {
1614             let dfa = self.dfa_builder.build(&nfa)?;
1615             Imp::DFA(dfa)
1616         } else {
1617             Imp::NFA(nfa)
1618         };
1619         Ok(AhoCorasick { imp, match_kind })
1620     }
1621 
1622     /// Automatically configure the settings on this builder according to the
1623     /// patterns that will be used to construct the automaton.
1624     ///
1625     /// The idea here is to balance space and time automatically. That is, when
1626     /// searching a small number of patterns, this will attempt to use the
1627     /// fastest possible configuration since the total space required will be
1628     /// small anyway. As the number of patterns grows, this will fall back to
1629     /// slower configurations that use less space.
1630     ///
1631     /// This is guaranteed to never set `match_kind`, but any other option may
1632     /// be overridden.
1633     ///
1634     /// # Examples
1635     ///
1636     /// Basic usage:
1637     ///
1638     /// ```
1639     /// use aho_corasick::AhoCorasickBuilder;
1640     ///
1641     /// let patterns = &["foo", "bar", "baz"];
1642     /// let ac = AhoCorasickBuilder::new()
1643     ///     .auto_configure(patterns)
1644     ///     .build(patterns);
1645     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1646     /// ```
auto_configure<B: AsRef<[u8]>>( &mut self, patterns: &[B], ) -> &mut AhoCorasickBuilder1647     pub fn auto_configure<B: AsRef<[u8]>>(
1648         &mut self,
1649         patterns: &[B],
1650     ) -> &mut AhoCorasickBuilder {
1651         // N.B. Currently we only use the length of `patterns` to make a
1652         // decision here, and could therefore ask for an `ExactSizeIterator`
1653         // instead. But it's conceivable that we might adapt this to look at
1654         // the total number of bytes, which would requires a second pass.
1655         //
1656         // The logic here is fairly rudimentary at the moment, but probably
1657         // OK. The idea here is to use the fastest thing possible for a small
1658         // number of patterns. That is, a DFA with no byte classes, since byte
1659         // classes require an extra indirection for every byte searched. With a
1660         // moderate number of patterns, we still want a DFA, but save on both
1661         // space and compilation time by enabling byte classes. Finally, fall
1662         // back to the slower but smaller NFA.
1663         if patterns.len() <= 100 {
1664             // N.B. Using byte classes can actually be faster by improving
1665             // locality, but this only really applies for multi-megabyte
1666             // automata (i.e., automata that don't fit in your CPU's cache).
1667             self.dfa(true);
1668         } else if patterns.len() <= 5000 {
1669             self.dfa(true);
1670         }
1671         self
1672     }
1673 
1674     /// Set the desired match semantics.
1675     ///
1676     /// The default is `MatchKind::Standard`, which corresponds to the match
1677     /// semantics supported by the standard textbook description of the
1678     /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
1679     /// are found. Moreover, this is the only way to get overlapping matches
1680     /// or do stream searching.
1681     ///
1682     /// The other kinds of match semantics that are supported are
1683     /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
1684     /// corresponds to the match you would get if you were to try to match
1685     /// each pattern at each position in the haystack in the same order that
1686     /// you give to the automaton. That is, it returns the leftmost match
1687     /// corresponding the earliest pattern given to the automaton. The latter
1688     /// corresponds to finding the longest possible match among all leftmost
1689     /// matches.
1690     ///
1691     /// For more details on match semantics, see the
1692     /// [documentation for `MatchKind`](enum.MatchKind.html).
1693     ///
1694     /// # Examples
1695     ///
1696     /// In these examples, we demonstrate the differences between match
1697     /// semantics for a particular set of patterns in a specific order:
1698     /// `b`, `abc`, `abcd`.
1699     ///
1700     /// Standard semantics:
1701     ///
1702     /// ```
1703     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1704     ///
1705     /// let patterns = &["b", "abc", "abcd"];
1706     /// let haystack = "abcd";
1707     ///
1708     /// let ac = AhoCorasickBuilder::new()
1709     ///     .match_kind(MatchKind::Standard) // default, not necessary
1710     ///     .build(patterns);
1711     /// let mat = ac.find(haystack).expect("should have a match");
1712     /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
1713     /// ```
1714     ///
1715     /// Leftmost-first semantics:
1716     ///
1717     /// ```
1718     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1719     ///
1720     /// let patterns = &["b", "abc", "abcd"];
1721     /// let haystack = "abcd";
1722     ///
1723     /// let ac = AhoCorasickBuilder::new()
1724     ///     .match_kind(MatchKind::LeftmostFirst)
1725     ///     .build(patterns);
1726     /// let mat = ac.find(haystack).expect("should have a match");
1727     /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
1728     /// ```
1729     ///
1730     /// Leftmost-longest semantics:
1731     ///
1732     /// ```
1733     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1734     ///
1735     /// let patterns = &["b", "abc", "abcd"];
1736     /// let haystack = "abcd";
1737     ///
1738     /// let ac = AhoCorasickBuilder::new()
1739     ///     .match_kind(MatchKind::LeftmostLongest)
1740     ///     .build(patterns);
1741     /// let mat = ac.find(haystack).expect("should have a match");
1742     /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
1743     /// ```
match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder1744     pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
1745         self.nfa_builder.match_kind(kind);
1746         self
1747     }
1748 
1749     /// Enable anchored mode, which requires all matches to start at the
1750     /// first position in a haystack.
1751     ///
1752     /// This option is disabled by default.
1753     ///
1754     /// # Examples
1755     ///
1756     /// Basic usage:
1757     ///
1758     /// ```
1759     /// use aho_corasick::AhoCorasickBuilder;
1760     ///
1761     /// let patterns = &["foo", "bar"];
1762     /// let haystack = "foobar";
1763     ///
1764     /// let ac = AhoCorasickBuilder::new()
1765     ///     .anchored(true)
1766     ///     .build(patterns);
1767     /// assert_eq!(1, ac.find_iter(haystack).count());
1768     /// ```
1769     ///
1770     /// When searching for overlapping matches, all matches that start at
1771     /// the beginning of a haystack will be reported:
1772     ///
1773     /// ```
1774     /// use aho_corasick::AhoCorasickBuilder;
1775     ///
1776     /// let patterns = &["foo", "foofoo"];
1777     /// let haystack = "foofoo";
1778     ///
1779     /// let ac = AhoCorasickBuilder::new()
1780     ///     .anchored(true)
1781     ///     .build(patterns);
1782     /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
1783     /// // A non-anchored search would return 3 matches.
1784     /// ```
anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder1785     pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1786         self.nfa_builder.anchored(yes);
1787         self
1788     }
1789 
1790     /// Enable ASCII-aware case insensitive matching.
1791     ///
1792     /// When this option is enabled, searching will be performed without
1793     /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
1794     ///
1795     /// Enabling this option does not change the search algorithm, but it may
1796     /// increase the size of the automaton.
1797     ///
1798     /// **NOTE:** It is unlikely that support for Unicode case folding will
1799     /// be added in the future. The ASCII case works via a simple hack to the
1800     /// underlying automaton, but full Unicode handling requires a fair bit of
1801     /// sophistication. If you do need Unicode handling, you might consider
1802     /// using the [`regex` crate](https://docs.rs/regex) or the lower level
1803     /// [`regex-automata` crate](https://docs.rs/regex-automata).
1804     ///
1805     /// # Examples
1806     ///
1807     /// Basic usage:
1808     ///
1809     /// ```
1810     /// use aho_corasick::AhoCorasickBuilder;
1811     ///
1812     /// let patterns = &["FOO", "bAr", "BaZ"];
1813     /// let haystack = "foo bar baz";
1814     ///
1815     /// let ac = AhoCorasickBuilder::new()
1816     ///     .ascii_case_insensitive(true)
1817     ///     .build(patterns);
1818     /// assert_eq!(3, ac.find_iter(haystack).count());
1819     /// ```
ascii_case_insensitive( &mut self, yes: bool, ) -> &mut AhoCorasickBuilder1820     pub fn ascii_case_insensitive(
1821         &mut self,
1822         yes: bool,
1823     ) -> &mut AhoCorasickBuilder {
1824         self.nfa_builder.ascii_case_insensitive(yes);
1825         self
1826     }
1827 
1828     /// Set the limit on how many NFA states use a dense representation for
1829     /// their transitions.
1830     ///
1831     /// A dense representation uses more space, but supports faster access to
1832     /// transitions at search time. Thus, this setting permits the control of a
1833     /// space vs time trade off when using the NFA variant of Aho-Corasick.
1834     ///
1835     /// This limit is expressed in terms of the depth of a state, i.e., the
1836     /// number of transitions from the starting state of the NFA. The idea is
1837     /// that most of the time searching will be spent near the starting state
1838     /// of the automaton, so states near the start state should use a dense
1839     /// representation. States further away from the start state would then use
1840     /// a sparse representation, which uses less space but is slower to access
1841     /// transitions at search time.
1842     ///
1843     /// By default, this is set to a low but non-zero number.
1844     ///
1845     /// This setting has no effect if the `dfa` option is enabled.
dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder1846     pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
1847         self.nfa_builder.dense_depth(depth);
1848         self
1849     }
1850 
1851     /// Compile the standard Aho-Corasick automaton into a deterministic finite
1852     /// automaton (DFA).
1853     ///
1854     /// When this is disabled (which is the default), then a non-deterministic
1855     /// finite automaton (NFA) is used instead.
1856     ///
1857     /// The main benefit to a DFA is that it can execute searches more quickly
1858     /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
1859     /// DFA uses more space and can take much longer to build.
1860     ///
1861     /// Enabling this option does not change the time complexity for
1862     /// constructing the Aho-Corasick automaton (which is `O(p)` where
1863     /// `p` is the total number of patterns being compiled). Enabling this
1864     /// option does however reduce the time complexity of non-overlapping
1865     /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
1866     /// haystack.
1867     ///
1868     /// In general, it's a good idea to enable this if you're searching a
1869     /// small number of fairly short patterns (~1000), or if you want the
1870     /// fastest possible search without regard to compilation time or space
1871     /// usage.
dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder1872     pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1873         self.dfa = yes;
1874         self
1875     }
1876 
1877     /// Enable heuristic prefilter optimizations.
1878     ///
1879     /// When enabled, searching will attempt to quickly skip to match
1880     /// candidates using specialized literal search routines. A prefilter
1881     /// cannot always be used, and is generally treated as a heuristic. It
1882     /// can be useful to disable this if the prefilter is observed to be
1883     /// sub-optimal for a particular workload.
1884     ///
1885     /// This is enabled by default.
prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder1886     pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1887         self.nfa_builder.prefilter(yes);
1888         self
1889     }
1890 
1891     /// Shrink the size of the transition alphabet by mapping bytes to their
1892     /// equivalence classes. This only has an effect when the `dfa` option is
1893     /// enabled.
1894     ///
1895     /// When enabled, each a DFA will use a map from all possible bytes
1896     /// to their corresponding equivalence class. Each equivalence class
1897     /// represents a set of bytes that does not discriminate between a match
1898     /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
1899     /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
1900     /// and `z`, and a final set that contains every other byte.
1901     ///
1902     /// The advantage of this map is that the size of the transition table can
1903     /// be reduced drastically from `#states * 256 * sizeof(id)` to
1904     /// `#states * k * sizeof(id)` where `k` is the number of equivalence
1905     /// classes. As a result, total space usage can decrease substantially.
1906     /// Moreover, since a smaller alphabet is used, compilation becomes faster
1907     /// as well.
1908     ///
1909     /// The disadvantage of this map is that every byte searched must be
1910     /// passed through this map before it can be used to determine the next
1911     /// transition. This has a small match time performance cost. However, if
1912     /// the DFA is otherwise very large without byte classes, then using byte
1913     /// classes can greatly improve memory locality and thus lead to better
1914     /// overall performance.
1915     ///
1916     /// This option is enabled by default.
1917     #[deprecated(
1918         since = "0.7.16",
1919         note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1920     )]
byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder1921     pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1922         self.dfa_builder.byte_classes(yes);
1923         self
1924     }
1925 
1926     /// Premultiply state identifiers in the transition table. This only has
1927     /// an effect when the `dfa` option is enabled.
1928     ///
1929     /// When enabled, state identifiers are premultiplied to point to their
1930     /// corresponding row in the transition table. That is, given the `i`th
1931     /// state, its corresponding premultiplied identifier is `i * k` where `k`
1932     /// is the alphabet size of the automaton. (The alphabet size is at most
1933     /// 256, but is in practice smaller if byte classes is enabled.)
1934     ///
1935     /// When state identifiers are not premultiplied, then the identifier of
1936     /// the `i`th state is `i`.
1937     ///
1938     /// The advantage of premultiplying state identifiers is that is saves a
1939     /// multiplication instruction per byte when searching with a DFA. This has
1940     /// been observed to lead to a 20% performance benefit in micro-benchmarks.
1941     ///
1942     /// The primary disadvantage of premultiplying state identifiers is
1943     /// that they require a larger integer size to represent. For example,
1944     /// if the DFA has 200 states, then its premultiplied form requires 16
1945     /// bits to represent every possible state identifier, where as its
1946     /// non-premultiplied form only requires 8 bits.
1947     ///
1948     /// This option is enabled by default.
1949     #[deprecated(
1950         since = "0.7.16",
1951         note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1952     )]
premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder1953     pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1954         self.dfa_builder.premultiply(yes);
1955         self
1956     }
1957 }
1958 
1959 /// A knob for controlling the match semantics of an Aho-Corasick automaton.
1960 ///
1961 /// There are two generally different ways that Aho-Corasick automatons can
1962 /// report matches. The first way is the "standard" approach that results from
1963 /// implementing most textbook explanations of Aho-Corasick. The second way is
1964 /// to report only the leftmost non-overlapping matches. The leftmost approach
1965 /// is in turn split into two different ways of resolving ambiguous matches:
1966 /// leftmost-first and leftmost-longest.
1967 ///
1968 /// The `Standard` match kind is the default and is the only one that supports
1969 /// overlapping matches and stream searching. (Trying to find overlapping
1970 /// or streaming matches using leftmost match semantics will result in a
1971 /// panic.) The `Standard` match kind will report matches as they are seen.
1972 /// When searching for overlapping matches, then all possible matches are
1973 /// reported. When searching for non-overlapping matches, the first match seen
1974 /// is reported. For example, for non-overlapping matches, given the patterns
1975 /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
1976 /// reported since it is detected first. The `abcd` match is never reported
1977 /// since it overlaps with the `b` match.
1978 ///
1979 /// In contrast, the leftmost match kind always prefers the leftmost match
1980 /// among all possible matches. Given the same example as above with `abcd` and
1981 /// `b` as patterns and `abcdef` as the subject string, the leftmost match is
1982 /// `abcd` since it begins before the `b` match, even though the `b` match is
1983 /// detected before the `abcd` match. In this case, the `b` match is not
1984 /// reported at all since it overlaps with the `abcd` match.
1985 ///
1986 /// The difference between leftmost-first and leftmost-longest is in how they
1987 /// resolve ambiguous matches when there are multiple leftmost matches to
1988 /// choose from. Leftmost-first always chooses the pattern that was provided
1989 /// earliest, where as leftmost-longest always chooses the longest matching
1990 /// pattern. For example, given the patterns `a` and `ab` and the subject
1991 /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
1992 /// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
1993 /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
1994 /// would be `ab`. Stated differently, the leftmost-first match depends on the
1995 /// order in which the patterns were given to the Aho-Corasick automaton.
1996 /// Because of that, when leftmost-first matching is used, if a pattern `A`
1997 /// that appears before a pattern `B` is a prefix of `B`, then it is impossible
1998 /// to ever observe a match of `B`.
1999 ///
2000 /// If you're not sure which match kind to pick, then stick with the standard
2001 /// kind, which is the default. In particular, if you need overlapping or
2002 /// streaming matches, then you _must_ use the standard kind. The leftmost
2003 /// kinds are useful in specific circumstances. For example, leftmost-first can
2004 /// be very useful as a way to implement match priority based on the order of
2005 /// patterns given and leftmost-longest can be useful for dictionary searching
2006 /// such that only the longest matching words are reported.
2007 ///
2008 /// # Relationship with regular expression alternations
2009 ///
2010 /// Understanding match semantics can be a little tricky, and one easy way
2011 /// to conceptualize non-overlapping matches from an Aho-Corasick automaton
2012 /// is to think about them as a simple alternation of literals in a regular
2013 /// expression. For example, let's say we wanted to match the strings
2014 /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
2015 /// turns out that regular expression engines have two different ways of
2016 /// matching this alternation. The first way, leftmost-longest, is commonly
2017 /// found in POSIX compatible implementations of regular expressions (such as
2018 /// `grep`). The second way, leftmost-first, is commonly found in backtracking
2019 /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
2020 /// regex engine do not use backtracking, but still implement leftmost-first
2021 /// semantics in an effort to match the behavior of dominant backtracking
2022 /// regex engines such as those found in Perl, Ruby, Python, Javascript and
2023 /// PHP.)
2024 ///
2025 /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
2026 /// will match `Samwise` because it is the longest possible match, but a
2027 /// Perl-like regex will match `Sam` since it appears earlier in the
2028 /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
2029 /// will never match `Samwise` since `Sam` will always have higher priority.
2030 /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
2031 /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
2032 /// still longest match, but it also appears earlier than `Sam`.
2033 ///
2034 /// The "standard" match semantics of Aho-Corasick generally don't correspond
2035 /// to the match semantics of any large group of regex implementations, so
2036 /// there's no direct analogy that can be made here. Standard match semantics
2037 /// are generally useful for overlapping matches, or if you just want to see
2038 /// matches as they are detected.
2039 ///
2040 /// The main conclusion to draw from this section is that the match semantics
2041 /// can be tweaked to precisely match either Perl-like regex alternations or
2042 /// POSIX regex alternations.
2043 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
2044 pub enum MatchKind {
2045     /// Use standard match semantics, which support overlapping matches. When
2046     /// used with non-overlapping matches, matches are reported as they are
2047     /// seen.
2048     Standard,
2049     /// Use leftmost-first match semantics, which reports leftmost matches.
2050     /// When there are multiple possible leftmost matches, the match
2051     /// corresponding to the pattern that appeared earlier when constructing
2052     /// the automaton is reported.
2053     ///
2054     /// This does **not** support overlapping matches or stream searching. If
2055     /// this match kind is used, attempting to find overlapping matches or
2056     /// stream matches will panic.
2057     LeftmostFirst,
2058     /// Use leftmost-longest match semantics, which reports leftmost matches.
2059     /// When there are multiple possible leftmost matches, the longest match
2060     /// is chosen.
2061     ///
2062     /// This does **not** support overlapping matches or stream searching. If
2063     /// this match kind is used, attempting to find overlapping matches or
2064     /// stream matches will panic.
2065     LeftmostLongest,
2066     /// Hints that destructuring should not be exhaustive.
2067     ///
2068     /// This enum may grow additional variants, so this makes sure clients
2069     /// don't count on exhaustive matching. (Otherwise, adding a new variant
2070     /// could break existing code.)
2071     #[doc(hidden)]
2072     __Nonexhaustive,
2073 }
2074 
2075 /// The default match kind is `MatchKind::Standard`.
2076 impl Default for MatchKind {
default() -> MatchKind2077     fn default() -> MatchKind {
2078         MatchKind::Standard
2079     }
2080 }
2081 
2082 impl MatchKind {
supports_overlapping(&self) -> bool2083     fn supports_overlapping(&self) -> bool {
2084         self.is_standard()
2085     }
2086 
supports_stream(&self) -> bool2087     fn supports_stream(&self) -> bool {
2088         // TODO: It may be possible to support this. It's hard.
2089         //
2090         // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
2091         self.is_standard()
2092     }
2093 
is_standard(&self) -> bool2094     pub(crate) fn is_standard(&self) -> bool {
2095         *self == MatchKind::Standard
2096     }
2097 
is_leftmost(&self) -> bool2098     pub(crate) fn is_leftmost(&self) -> bool {
2099         *self == MatchKind::LeftmostFirst
2100             || *self == MatchKind::LeftmostLongest
2101     }
2102 
is_leftmost_first(&self) -> bool2103     pub(crate) fn is_leftmost_first(&self) -> bool {
2104         *self == MatchKind::LeftmostFirst
2105     }
2106 
2107     /// Convert this match kind into a packed match kind. If this match kind
2108     /// corresponds to standard semantics, then this returns None, since
2109     /// packed searching does not support standard semantics.
as_packed(&self) -> Option<packed::MatchKind>2110     pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
2111         match *self {
2112             MatchKind::Standard => None,
2113             MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
2114             MatchKind::LeftmostLongest => {
2115                 Some(packed::MatchKind::LeftmostLongest)
2116             }
2117             MatchKind::__Nonexhaustive => unreachable!(),
2118         }
2119     }
2120 }
2121 
2122 #[cfg(test)]
2123 mod tests {
2124     use super::*;
2125 
2126     #[test]
oibits()2127     fn oibits() {
2128         use std::panic::{RefUnwindSafe, UnwindSafe};
2129 
2130         fn assert_send<T: Send>() {}
2131         fn assert_sync<T: Sync>() {}
2132         fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
2133 
2134         assert_send::<AhoCorasick>();
2135         assert_sync::<AhoCorasick>();
2136         assert_unwind_safe::<AhoCorasick>();
2137         assert_send::<AhoCorasickBuilder>();
2138         assert_sync::<AhoCorasickBuilder>();
2139         assert_unwind_safe::<AhoCorasickBuilder>();
2140     }
2141 }
2142