• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::u16;
2 
3 use crate::packed::pattern::Patterns;
4 use crate::packed::rabinkarp::RabinKarp;
5 use crate::packed::teddy::{self, Teddy};
6 use crate::Match;
7 
8 /// This is a limit placed on the total number of patterns we're willing to try
9 /// and match at once. As more sophisticated algorithms are added, this number
10 /// may be increased.
11 const PATTERN_LIMIT: usize = 128;
12 
13 /// A knob for controlling the match semantics of a packed multiple string
14 /// searcher.
15 ///
16 /// This differs from the
17 /// [`MatchKind`](../enum.MatchKind.html)
18 /// type in the top-level crate module in that it doesn't support
19 /// "standard" match semantics, and instead only supports leftmost-first or
20 /// leftmost-longest. Namely, "standard" semantics cannot be easily supported
21 /// by packed searchers.
22 ///
23 /// For more information on the distinction between leftmost-first and
24 /// leftmost-longest, see the docs on the top-level `MatchKind` type.
25 ///
26 /// Unlike the top-level `MatchKind` type, the default match semantics for this
27 /// type are leftmost-first.
28 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
29 pub enum MatchKind {
30     /// Use leftmost-first match semantics, which reports leftmost matches.
31     /// When there are multiple possible leftmost matches, the match
32     /// corresponding to the pattern that appeared earlier when constructing
33     /// the automaton is reported.
34     ///
35     /// This is the default.
36     LeftmostFirst,
37     /// Use leftmost-longest match semantics, which reports leftmost matches.
38     /// When there are multiple possible leftmost matches, the longest match
39     /// is chosen.
40     LeftmostLongest,
41     /// Hints that destructuring should not be exhaustive.
42     ///
43     /// This enum may grow additional variants, so this makes sure clients
44     /// don't count on exhaustive matching. (Otherwise, adding a new variant
45     /// could break existing code.)
46     #[doc(hidden)]
47     __Nonexhaustive,
48 }
49 
50 impl Default for MatchKind {
default() -> MatchKind51     fn default() -> MatchKind {
52         MatchKind::LeftmostFirst
53     }
54 }
55 
56 /// The configuration for a packed multiple pattern searcher.
57 ///
58 /// The configuration is currently limited only to being able to select the
59 /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
60 /// future, more knobs may be made available.
61 ///
62 /// A configuration produces a [`packed::Builder`](struct.Builder.html), which
63 /// in turn can be used to construct a
64 /// [`packed::Searcher`](struct.Searcher.html) for searching.
65 ///
66 /// # Example
67 ///
68 /// This example shows how to use leftmost-longest semantics instead of the
69 /// default (leftmost-first).
70 ///
71 /// ```
72 /// use aho_corasick::packed::{Config, MatchKind};
73 ///
74 /// # fn example() -> Option<()> {
75 /// let searcher = Config::new()
76 ///     .match_kind(MatchKind::LeftmostLongest)
77 ///     .builder()
78 ///     .add("foo")
79 ///     .add("foobar")
80 ///     .build()?;
81 /// let matches: Vec<usize> = searcher
82 ///     .find_iter("foobar")
83 ///     .map(|mat| mat.pattern())
84 ///     .collect();
85 /// assert_eq!(vec![1], matches);
86 /// # Some(()) }
87 /// # if cfg!(target_arch = "x86_64") {
88 /// #     example().unwrap()
89 /// # } else {
90 /// #     assert!(example().is_none());
91 /// # }
92 /// ```
93 #[derive(Clone, Debug)]
94 pub struct Config {
95     kind: MatchKind,
96     force: Option<ForceAlgorithm>,
97     force_teddy_fat: Option<bool>,
98     force_avx: Option<bool>,
99 }
100 
101 /// An internal option for forcing the use of a particular packed algorithm.
102 ///
103 /// When an algorithm is forced, if a searcher could not be constructed for it,
104 /// then no searcher will be returned even if an alternative algorithm would
105 /// work.
106 #[derive(Clone, Debug)]
107 enum ForceAlgorithm {
108     Teddy,
109     RabinKarp,
110 }
111 
112 impl Default for Config {
default() -> Config113     fn default() -> Config {
114         Config::new()
115     }
116 }
117 
118 impl Config {
119     /// Create a new default configuration. A default configuration uses
120     /// leftmost-first match semantics.
new() -> Config121     pub fn new() -> Config {
122         Config {
123             kind: MatchKind::LeftmostFirst,
124             force: None,
125             force_teddy_fat: None,
126             force_avx: None,
127         }
128     }
129 
130     /// Create a packed builder from this configuration. The builder can be
131     /// used to accumulate patterns and create a
132     /// [`Searcher`](struct.Searcher.html)
133     /// from them.
builder(&self) -> Builder134     pub fn builder(&self) -> Builder {
135         Builder::from_config(self.clone())
136     }
137 
138     /// Set the match semantics for this configuration.
match_kind(&mut self, kind: MatchKind) -> &mut Config139     pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
140         self.kind = kind;
141         self
142     }
143 
144     /// An undocumented method for forcing the use of the Teddy algorithm.
145     ///
146     /// This is only exposed for more precise testing and benchmarks. Callers
147     /// should not use it as it is not part of the API stability guarantees of
148     /// this crate.
149     #[doc(hidden)]
force_teddy(&mut self, yes: bool) -> &mut Config150     pub fn force_teddy(&mut self, yes: bool) -> &mut Config {
151         if yes {
152             self.force = Some(ForceAlgorithm::Teddy);
153         } else {
154             self.force = None;
155         }
156         self
157     }
158 
159     /// An undocumented method for forcing the use of the Fat Teddy algorithm.
160     ///
161     /// This is only exposed for more precise testing and benchmarks. Callers
162     /// should not use it as it is not part of the API stability guarantees of
163     /// this crate.
164     #[doc(hidden)]
force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config165     pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
166         self.force_teddy_fat = yes;
167         self
168     }
169 
170     /// An undocumented method for forcing the use of SSE (`Some(false)`) or
171     /// AVX (`Some(true)`) algorithms.
172     ///
173     /// This is only exposed for more precise testing and benchmarks. Callers
174     /// should not use it as it is not part of the API stability guarantees of
175     /// this crate.
176     #[doc(hidden)]
force_avx(&mut self, yes: Option<bool>) -> &mut Config177     pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config {
178         self.force_avx = yes;
179         self
180     }
181 
182     /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
183     ///
184     /// This is only exposed for more precise testing and benchmarks. Callers
185     /// should not use it as it is not part of the API stability guarantees of
186     /// this crate.
187     #[doc(hidden)]
force_rabin_karp(&mut self, yes: bool) -> &mut Config188     pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config {
189         if yes {
190             self.force = Some(ForceAlgorithm::RabinKarp);
191         } else {
192             self.force = None;
193         }
194         self
195     }
196 }
197 
198 /// A builder for constructing a packed searcher from a collection of patterns.
199 ///
200 /// # Example
201 ///
202 /// This example shows how to use a builder to construct a searcher. By
203 /// default, leftmost-first match semantics are used.
204 ///
205 /// ```
206 /// use aho_corasick::packed::{Builder, MatchKind};
207 ///
208 /// # fn example() -> Option<()> {
209 /// let searcher = Builder::new()
210 ///     .add("foobar")
211 ///     .add("foo")
212 ///     .build()?;
213 /// let matches: Vec<usize> = searcher
214 ///     .find_iter("foobar")
215 ///     .map(|mat| mat.pattern())
216 ///     .collect();
217 /// assert_eq!(vec![0], matches);
218 /// # Some(()) }
219 /// # if cfg!(target_arch = "x86_64") {
220 /// #     example().unwrap()
221 /// # } else {
222 /// #     assert!(example().is_none());
223 /// # }
224 /// ```
225 #[derive(Clone, Debug)]
226 pub struct Builder {
227     /// The configuration of this builder and subsequent matcher.
228     config: Config,
229     /// Set to true if the builder detects that a matcher cannot be built.
230     inert: bool,
231     /// The patterns provided by the caller.
232     patterns: Patterns,
233 }
234 
235 impl Builder {
236     /// Create a new builder for constructing a multi-pattern searcher. This
237     /// constructor uses the default configuration.
new() -> Builder238     pub fn new() -> Builder {
239         Builder::from_config(Config::new())
240     }
241 
from_config(config: Config) -> Builder242     fn from_config(config: Config) -> Builder {
243         Builder { config, inert: false, patterns: Patterns::new() }
244     }
245 
246     /// Build a searcher from the patterns added to this builder so far.
build(&self) -> Option<Searcher>247     pub fn build(&self) -> Option<Searcher> {
248         if self.inert || self.patterns.is_empty() {
249             return None;
250         }
251         let mut patterns = self.patterns.clone();
252         patterns.set_match_kind(self.config.kind);
253         let rabinkarp = RabinKarp::new(&patterns);
254         // Effectively, we only want to return a searcher if we can use Teddy,
255         // since Teddy is our only fast packed searcher at the moment.
256         // Rabin-Karp is only used when searching haystacks smaller than what
257         // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
258         // is to force it using undocumented APIs (for tests/benchmarks).
259         let (search_kind, minimum_len) = match self.config.force {
260             None | Some(ForceAlgorithm::Teddy) => {
261                 let teddy = match self.build_teddy(&patterns) {
262                     None => return None,
263                     Some(teddy) => teddy,
264                 };
265                 let minimum_len = teddy.minimum_len();
266                 (SearchKind::Teddy(teddy), minimum_len)
267             }
268             Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0),
269         };
270         Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
271     }
272 
build_teddy(&self, patterns: &Patterns) -> Option<Teddy>273     fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> {
274         teddy::Builder::new()
275             .avx(self.config.force_avx)
276             .fat(self.config.force_teddy_fat)
277             .build(&patterns)
278     }
279 
280     /// Add the given pattern to this set to match.
281     ///
282     /// The order in which patterns are added is significant. Namely, when
283     /// using leftmost-first match semantics, then when multiple patterns can
284     /// match at a particular location, the pattern that was added first is
285     /// used as the match.
286     ///
287     /// If the number of patterns added exceeds the amount supported by packed
288     /// searchers, then the builder will stop accumulating patterns and render
289     /// itself inert. At this point, constructing a searcher will always return
290     /// `None`.
add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder291     pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
292         if self.inert {
293             return self;
294         } else if self.patterns.len() >= PATTERN_LIMIT {
295             self.inert = true;
296             self.patterns.reset();
297             return self;
298         }
299         // Just in case PATTERN_LIMIT increases beyond u16::MAX.
300         assert!(self.patterns.len() <= u16::MAX as usize);
301 
302         let pattern = pattern.as_ref();
303         if pattern.is_empty() {
304             self.inert = true;
305             self.patterns.reset();
306             return self;
307         }
308         self.patterns.add(pattern);
309         self
310     }
311 
312     /// Add the given iterator of patterns to this set to match.
313     ///
314     /// The iterator must yield elements that can be converted into a `&[u8]`.
315     ///
316     /// The order in which patterns are added is significant. Namely, when
317     /// using leftmost-first match semantics, then when multiple patterns can
318     /// match at a particular location, the pattern that was added first is
319     /// used as the match.
320     ///
321     /// If the number of patterns added exceeds the amount supported by packed
322     /// searchers, then the builder will stop accumulating patterns and render
323     /// itself inert. At this point, constructing a searcher will always return
324     /// `None`.
extend<I, P>(&mut self, patterns: I) -> &mut Builder where I: IntoIterator<Item = P>, P: AsRef<[u8]>,325     pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
326     where
327         I: IntoIterator<Item = P>,
328         P: AsRef<[u8]>,
329     {
330         for p in patterns {
331             self.add(p);
332         }
333         self
334     }
335 }
336 
337 impl Default for Builder {
default() -> Builder338     fn default() -> Builder {
339         Builder::new()
340     }
341 }
342 
343 /// A packed searcher for quickly finding occurrences of multiple patterns.
344 ///
345 /// If callers need more flexible construction, or if one wants to change the
346 /// match semantics (either leftmost-first or leftmost-longest), then one can
347 /// use the [`Config`](struct.Config.html) and/or
348 /// [`Builder`](struct.Builder.html) types for more fine grained control.
349 ///
350 /// # Example
351 ///
352 /// This example shows how to create a searcher from an iterator of patterns.
353 /// By default, leftmost-first match semantics are used.
354 ///
355 /// ```
356 /// use aho_corasick::packed::{MatchKind, Searcher};
357 ///
358 /// # fn example() -> Option<()> {
359 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
360 /// let matches: Vec<usize> = searcher
361 ///     .find_iter("foobar")
362 ///     .map(|mat| mat.pattern())
363 ///     .collect();
364 /// assert_eq!(vec![0], matches);
365 /// # Some(()) }
366 /// # if cfg!(target_arch = "x86_64") {
367 /// #     example().unwrap()
368 /// # } else {
369 /// #     assert!(example().is_none());
370 /// # }
371 /// ```
372 #[derive(Clone, Debug)]
373 pub struct Searcher {
374     patterns: Patterns,
375     rabinkarp: RabinKarp,
376     search_kind: SearchKind,
377     minimum_len: usize,
378 }
379 
380 #[derive(Clone, Debug)]
381 enum SearchKind {
382     Teddy(Teddy),
383     RabinKarp,
384 }
385 
386 impl Searcher {
387     /// A convenience function for constructing a searcher from an iterator
388     /// of things that can be converted to a `&[u8]`.
389     ///
390     /// If a searcher could not be constructed (either because of an
391     /// unsupported CPU or because there are too many patterns), then `None`
392     /// is returned.
393     ///
394     /// # Example
395     ///
396     /// Basic usage:
397     ///
398     /// ```
399     /// use aho_corasick::packed::{MatchKind, Searcher};
400     ///
401     /// # fn example() -> Option<()> {
402     /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
403     /// let matches: Vec<usize> = searcher
404     ///     .find_iter("foobar")
405     ///     .map(|mat| mat.pattern())
406     ///     .collect();
407     /// assert_eq!(vec![0], matches);
408     /// # Some(()) }
409     /// # if cfg!(target_arch = "x86_64") {
410     /// #     example().unwrap()
411     /// # } else {
412     /// #     assert!(example().is_none());
413     /// # }
414     /// ```
new<I, P>(patterns: I) -> Option<Searcher> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,415     pub fn new<I, P>(patterns: I) -> Option<Searcher>
416     where
417         I: IntoIterator<Item = P>,
418         P: AsRef<[u8]>,
419     {
420         Builder::new().extend(patterns).build()
421     }
422 
423     /// Return the first occurrence of any of the patterns in this searcher,
424     /// according to its match semantics, in the given haystack. The `Match`
425     /// returned will include the identifier of the pattern that matched, which
426     /// corresponds to the index of the pattern (starting from `0`) in which it
427     /// was added.
428     ///
429     /// # Example
430     ///
431     /// Basic usage:
432     ///
433     /// ```
434     /// use aho_corasick::packed::{MatchKind, Searcher};
435     ///
436     /// # fn example() -> Option<()> {
437     /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
438     /// let mat = searcher.find("foobar")?;
439     /// assert_eq!(0, mat.pattern());
440     /// assert_eq!(0, mat.start());
441     /// assert_eq!(6, mat.end());
442     /// # Some(()) }
443     /// # if cfg!(target_arch = "x86_64") {
444     /// #     example().unwrap()
445     /// # } else {
446     /// #     assert!(example().is_none());
447     /// # }
448     /// ```
find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>449     pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
450         self.find_at(haystack, 0)
451     }
452 
453     /// Return the first occurrence of any of the patterns in this searcher,
454     /// according to its match semantics, in the given haystack starting from
455     /// the given position.
456     ///
457     /// The `Match` returned will include the identifier of the pattern that
458     /// matched, which corresponds to the index of the pattern (starting from
459     /// `0`) in which it was added. The offsets in the `Match` will be relative
460     /// to the start of `haystack` (and not `at`).
461     ///
462     /// # Example
463     ///
464     /// Basic usage:
465     ///
466     /// ```
467     /// use aho_corasick::packed::{MatchKind, Searcher};
468     ///
469     /// # fn example() -> Option<()> {
470     /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
471     /// let mat = searcher.find_at("foofoobar", 3)?;
472     /// assert_eq!(0, mat.pattern());
473     /// assert_eq!(3, mat.start());
474     /// assert_eq!(9, mat.end());
475     /// # Some(()) }
476     /// # if cfg!(target_arch = "x86_64") {
477     /// #     example().unwrap()
478     /// # } else {
479     /// #     assert!(example().is_none());
480     /// # }
481     /// ```
find_at<B: AsRef<[u8]>>( &self, haystack: B, at: usize, ) -> Option<Match>482     pub fn find_at<B: AsRef<[u8]>>(
483         &self,
484         haystack: B,
485         at: usize,
486     ) -> Option<Match> {
487         let haystack = haystack.as_ref();
488         match self.search_kind {
489             SearchKind::Teddy(ref teddy) => {
490                 if haystack[at..].len() < teddy.minimum_len() {
491                     return self.slow_at(haystack, at);
492                 }
493                 teddy.find_at(&self.patterns, haystack, at)
494             }
495             SearchKind::RabinKarp => {
496                 self.rabinkarp.find_at(&self.patterns, haystack, at)
497             }
498         }
499     }
500 
501     /// Return an iterator of non-overlapping occurrences of the patterns in
502     /// this searcher, according to its match semantics, in the given haystack.
503     ///
504     /// # Example
505     ///
506     /// Basic usage:
507     ///
508     /// ```
509     /// use aho_corasick::packed::{MatchKind, Searcher};
510     ///
511     /// # fn example() -> Option<()> {
512     /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
513     /// let matches: Vec<usize> = searcher
514     ///     .find_iter("foobar fooba foofoo")
515     ///     .map(|mat| mat.pattern())
516     ///     .collect();
517     /// assert_eq!(vec![0, 1, 1, 1], matches);
518     /// # Some(()) }
519     /// # if cfg!(target_arch = "x86_64") {
520     /// #     example().unwrap()
521     /// # } else {
522     /// #     assert!(example().is_none());
523     /// # }
524     /// ```
find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b>525     pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
526         &'a self,
527         haystack: &'b B,
528     ) -> FindIter<'a, 'b> {
529         FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 }
530     }
531 
532     /// Returns the match kind used by this packed searcher.
533     ///
534     /// # Examples
535     ///
536     /// Basic usage:
537     ///
538     /// ```
539     /// use aho_corasick::packed::{MatchKind, Searcher};
540     ///
541     /// # fn example() -> Option<()> {
542     /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
543     /// // leftmost-first is the default.
544     /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
545     /// # Some(()) }
546     /// # if cfg!(target_arch = "x86_64") {
547     /// #     example().unwrap()
548     /// # } else {
549     /// #     assert!(example().is_none());
550     /// # }
551     /// ```
match_kind(&self) -> &MatchKind552     pub fn match_kind(&self) -> &MatchKind {
553         self.patterns.match_kind()
554     }
555 
556     /// Returns the minimum length of a haystack that is required in order for
557     /// packed searching to be effective.
558     ///
559     /// In some cases, the underlying packed searcher may not be able to search
560     /// very short haystacks. When that occurs, the implementation will defer
561     /// to a slower non-packed searcher (which is still generally faster than
562     /// Aho-Corasick for a small number of patterns). However, callers may
563     /// want to avoid ever using the slower variant, which one can do by
564     /// never passing a haystack shorter than the minimum length returned by
565     /// this method.
minimum_len(&self) -> usize566     pub fn minimum_len(&self) -> usize {
567         self.minimum_len
568     }
569 
570     /// Returns the approximate total amount of heap used by this searcher, in
571     /// units of bytes.
heap_bytes(&self) -> usize572     pub fn heap_bytes(&self) -> usize {
573         self.patterns.heap_bytes()
574             + self.rabinkarp.heap_bytes()
575             + self.search_kind.heap_bytes()
576     }
577 
578     /// Use a slow (non-packed) searcher.
579     ///
580     /// This is useful when a packed searcher could be constructed, but could
581     /// not be used to search a specific haystack. For example, if Teddy was
582     /// built but the haystack is smaller than ~34 bytes, then Teddy might not
583     /// be able to run.
slow_at(&self, haystack: &[u8], at: usize) -> Option<Match>584     fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> {
585         self.rabinkarp.find_at(&self.patterns, haystack, at)
586     }
587 }
588 
589 impl SearchKind {
heap_bytes(&self) -> usize590     fn heap_bytes(&self) -> usize {
591         match *self {
592             SearchKind::Teddy(ref ted) => ted.heap_bytes(),
593             SearchKind::RabinKarp => 0,
594         }
595     }
596 }
597 
598 /// An iterator over non-overlapping matches from a packed searcher.
599 ///
600 /// The lifetime `'s` refers to the lifetime of the underlying
601 /// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the
602 /// lifetime of the haystack being searched.
603 #[derive(Debug)]
604 pub struct FindIter<'s, 'h> {
605     searcher: &'s Searcher,
606     haystack: &'h [u8],
607     at: usize,
608 }
609 
610 impl<'s, 'h> Iterator for FindIter<'s, 'h> {
611     type Item = Match;
612 
next(&mut self) -> Option<Match>613     fn next(&mut self) -> Option<Match> {
614         if self.at > self.haystack.len() {
615             return None;
616         }
617         match self.searcher.find_at(&self.haystack, self.at) {
618             None => None,
619             Some(c) => {
620                 self.at = c.end;
621                 Some(c)
622             }
623         }
624     }
625 }
626