1 use std::u16; 2 3 use crate::packed::pattern::Patterns; 4 use crate::packed::rabinkarp::RabinKarp; 5 use crate::packed::teddy::{self, Teddy}; 6 use crate::Match; 7 8 /// This is a limit placed on the total number of patterns we're willing to try 9 /// and match at once. As more sophisticated algorithms are added, this number 10 /// may be increased. 11 const PATTERN_LIMIT: usize = 128; 12 13 /// A knob for controlling the match semantics of a packed multiple string 14 /// searcher. 15 /// 16 /// This differs from the 17 /// [`MatchKind`](../enum.MatchKind.html) 18 /// type in the top-level crate module in that it doesn't support 19 /// "standard" match semantics, and instead only supports leftmost-first or 20 /// leftmost-longest. Namely, "standard" semantics cannot be easily supported 21 /// by packed searchers. 22 /// 23 /// For more information on the distinction between leftmost-first and 24 /// leftmost-longest, see the docs on the top-level `MatchKind` type. 25 /// 26 /// Unlike the top-level `MatchKind` type, the default match semantics for this 27 /// type are leftmost-first. 28 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 29 pub enum MatchKind { 30 /// Use leftmost-first match semantics, which reports leftmost matches. 31 /// When there are multiple possible leftmost matches, the match 32 /// corresponding to the pattern that appeared earlier when constructing 33 /// the automaton is reported. 34 /// 35 /// This is the default. 36 LeftmostFirst, 37 /// Use leftmost-longest match semantics, which reports leftmost matches. 38 /// When there are multiple possible leftmost matches, the longest match 39 /// is chosen. 40 LeftmostLongest, 41 /// Hints that destructuring should not be exhaustive. 42 /// 43 /// This enum may grow additional variants, so this makes sure clients 44 /// don't count on exhaustive matching. (Otherwise, adding a new variant 45 /// could break existing code.) 46 #[doc(hidden)] 47 __Nonexhaustive, 48 } 49 50 impl Default for MatchKind { default() -> MatchKind51 fn default() -> MatchKind { 52 MatchKind::LeftmostFirst 53 } 54 } 55 56 /// The configuration for a packed multiple pattern searcher. 57 /// 58 /// The configuration is currently limited only to being able to select the 59 /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the 60 /// future, more knobs may be made available. 61 /// 62 /// A configuration produces a [`packed::Builder`](struct.Builder.html), which 63 /// in turn can be used to construct a 64 /// [`packed::Searcher`](struct.Searcher.html) for searching. 65 /// 66 /// # Example 67 /// 68 /// This example shows how to use leftmost-longest semantics instead of the 69 /// default (leftmost-first). 70 /// 71 /// ``` 72 /// use aho_corasick::packed::{Config, MatchKind}; 73 /// 74 /// # fn example() -> Option<()> { 75 /// let searcher = Config::new() 76 /// .match_kind(MatchKind::LeftmostLongest) 77 /// .builder() 78 /// .add("foo") 79 /// .add("foobar") 80 /// .build()?; 81 /// let matches: Vec<usize> = searcher 82 /// .find_iter("foobar") 83 /// .map(|mat| mat.pattern()) 84 /// .collect(); 85 /// assert_eq!(vec![1], matches); 86 /// # Some(()) } 87 /// # if cfg!(target_arch = "x86_64") { 88 /// # example().unwrap() 89 /// # } else { 90 /// # assert!(example().is_none()); 91 /// # } 92 /// ``` 93 #[derive(Clone, Debug)] 94 pub struct Config { 95 kind: MatchKind, 96 force: Option<ForceAlgorithm>, 97 force_teddy_fat: Option<bool>, 98 force_avx: Option<bool>, 99 } 100 101 /// An internal option for forcing the use of a particular packed algorithm. 102 /// 103 /// When an algorithm is forced, if a searcher could not be constructed for it, 104 /// then no searcher will be returned even if an alternative algorithm would 105 /// work. 106 #[derive(Clone, Debug)] 107 enum ForceAlgorithm { 108 Teddy, 109 RabinKarp, 110 } 111 112 impl Default for Config { default() -> Config113 fn default() -> Config { 114 Config::new() 115 } 116 } 117 118 impl Config { 119 /// Create a new default configuration. A default configuration uses 120 /// leftmost-first match semantics. new() -> Config121 pub fn new() -> Config { 122 Config { 123 kind: MatchKind::LeftmostFirst, 124 force: None, 125 force_teddy_fat: None, 126 force_avx: None, 127 } 128 } 129 130 /// Create a packed builder from this configuration. The builder can be 131 /// used to accumulate patterns and create a 132 /// [`Searcher`](struct.Searcher.html) 133 /// from them. builder(&self) -> Builder134 pub fn builder(&self) -> Builder { 135 Builder::from_config(self.clone()) 136 } 137 138 /// Set the match semantics for this configuration. match_kind(&mut self, kind: MatchKind) -> &mut Config139 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { 140 self.kind = kind; 141 self 142 } 143 144 /// An undocumented method for forcing the use of the Teddy algorithm. 145 /// 146 /// This is only exposed for more precise testing and benchmarks. Callers 147 /// should not use it as it is not part of the API stability guarantees of 148 /// this crate. 149 #[doc(hidden)] force_teddy(&mut self, yes: bool) -> &mut Config150 pub fn force_teddy(&mut self, yes: bool) -> &mut Config { 151 if yes { 152 self.force = Some(ForceAlgorithm::Teddy); 153 } else { 154 self.force = None; 155 } 156 self 157 } 158 159 /// An undocumented method for forcing the use of the Fat Teddy algorithm. 160 /// 161 /// This is only exposed for more precise testing and benchmarks. Callers 162 /// should not use it as it is not part of the API stability guarantees of 163 /// this crate. 164 #[doc(hidden)] force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config165 pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config { 166 self.force_teddy_fat = yes; 167 self 168 } 169 170 /// An undocumented method for forcing the use of SSE (`Some(false)`) or 171 /// AVX (`Some(true)`) algorithms. 172 /// 173 /// This is only exposed for more precise testing and benchmarks. Callers 174 /// should not use it as it is not part of the API stability guarantees of 175 /// this crate. 176 #[doc(hidden)] force_avx(&mut self, yes: Option<bool>) -> &mut Config177 pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config { 178 self.force_avx = yes; 179 self 180 } 181 182 /// An undocumented method for forcing the use of the Rabin-Karp algorithm. 183 /// 184 /// This is only exposed for more precise testing and benchmarks. Callers 185 /// should not use it as it is not part of the API stability guarantees of 186 /// this crate. 187 #[doc(hidden)] force_rabin_karp(&mut self, yes: bool) -> &mut Config188 pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config { 189 if yes { 190 self.force = Some(ForceAlgorithm::RabinKarp); 191 } else { 192 self.force = None; 193 } 194 self 195 } 196 } 197 198 /// A builder for constructing a packed searcher from a collection of patterns. 199 /// 200 /// # Example 201 /// 202 /// This example shows how to use a builder to construct a searcher. By 203 /// default, leftmost-first match semantics are used. 204 /// 205 /// ``` 206 /// use aho_corasick::packed::{Builder, MatchKind}; 207 /// 208 /// # fn example() -> Option<()> { 209 /// let searcher = Builder::new() 210 /// .add("foobar") 211 /// .add("foo") 212 /// .build()?; 213 /// let matches: Vec<usize> = searcher 214 /// .find_iter("foobar") 215 /// .map(|mat| mat.pattern()) 216 /// .collect(); 217 /// assert_eq!(vec![0], matches); 218 /// # Some(()) } 219 /// # if cfg!(target_arch = "x86_64") { 220 /// # example().unwrap() 221 /// # } else { 222 /// # assert!(example().is_none()); 223 /// # } 224 /// ``` 225 #[derive(Clone, Debug)] 226 pub struct Builder { 227 /// The configuration of this builder and subsequent matcher. 228 config: Config, 229 /// Set to true if the builder detects that a matcher cannot be built. 230 inert: bool, 231 /// The patterns provided by the caller. 232 patterns: Patterns, 233 } 234 235 impl Builder { 236 /// Create a new builder for constructing a multi-pattern searcher. This 237 /// constructor uses the default configuration. new() -> Builder238 pub fn new() -> Builder { 239 Builder::from_config(Config::new()) 240 } 241 from_config(config: Config) -> Builder242 fn from_config(config: Config) -> Builder { 243 Builder { config, inert: false, patterns: Patterns::new() } 244 } 245 246 /// Build a searcher from the patterns added to this builder so far. build(&self) -> Option<Searcher>247 pub fn build(&self) -> Option<Searcher> { 248 if self.inert || self.patterns.is_empty() { 249 return None; 250 } 251 let mut patterns = self.patterns.clone(); 252 patterns.set_match_kind(self.config.kind); 253 let rabinkarp = RabinKarp::new(&patterns); 254 // Effectively, we only want to return a searcher if we can use Teddy, 255 // since Teddy is our only fast packed searcher at the moment. 256 // Rabin-Karp is only used when searching haystacks smaller than what 257 // Teddy can support. Thus, the only way to get a Rabin-Karp searcher 258 // is to force it using undocumented APIs (for tests/benchmarks). 259 let (search_kind, minimum_len) = match self.config.force { 260 None | Some(ForceAlgorithm::Teddy) => { 261 let teddy = match self.build_teddy(&patterns) { 262 None => return None, 263 Some(teddy) => teddy, 264 }; 265 let minimum_len = teddy.minimum_len(); 266 (SearchKind::Teddy(teddy), minimum_len) 267 } 268 Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0), 269 }; 270 Some(Searcher { patterns, rabinkarp, search_kind, minimum_len }) 271 } 272 build_teddy(&self, patterns: &Patterns) -> Option<Teddy>273 fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> { 274 teddy::Builder::new() 275 .avx(self.config.force_avx) 276 .fat(self.config.force_teddy_fat) 277 .build(&patterns) 278 } 279 280 /// Add the given pattern to this set to match. 281 /// 282 /// The order in which patterns are added is significant. Namely, when 283 /// using leftmost-first match semantics, then when multiple patterns can 284 /// match at a particular location, the pattern that was added first is 285 /// used as the match. 286 /// 287 /// If the number of patterns added exceeds the amount supported by packed 288 /// searchers, then the builder will stop accumulating patterns and render 289 /// itself inert. At this point, constructing a searcher will always return 290 /// `None`. add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder291 pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder { 292 if self.inert { 293 return self; 294 } else if self.patterns.len() >= PATTERN_LIMIT { 295 self.inert = true; 296 self.patterns.reset(); 297 return self; 298 } 299 // Just in case PATTERN_LIMIT increases beyond u16::MAX. 300 assert!(self.patterns.len() <= u16::MAX as usize); 301 302 let pattern = pattern.as_ref(); 303 if pattern.is_empty() { 304 self.inert = true; 305 self.patterns.reset(); 306 return self; 307 } 308 self.patterns.add(pattern); 309 self 310 } 311 312 /// Add the given iterator of patterns to this set to match. 313 /// 314 /// The iterator must yield elements that can be converted into a `&[u8]`. 315 /// 316 /// The order in which patterns are added is significant. Namely, when 317 /// using leftmost-first match semantics, then when multiple patterns can 318 /// match at a particular location, the pattern that was added first is 319 /// used as the match. 320 /// 321 /// If the number of patterns added exceeds the amount supported by packed 322 /// searchers, then the builder will stop accumulating patterns and render 323 /// itself inert. At this point, constructing a searcher will always return 324 /// `None`. extend<I, P>(&mut self, patterns: I) -> &mut Builder where I: IntoIterator<Item = P>, P: AsRef<[u8]>,325 pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder 326 where 327 I: IntoIterator<Item = P>, 328 P: AsRef<[u8]>, 329 { 330 for p in patterns { 331 self.add(p); 332 } 333 self 334 } 335 } 336 337 impl Default for Builder { default() -> Builder338 fn default() -> Builder { 339 Builder::new() 340 } 341 } 342 343 /// A packed searcher for quickly finding occurrences of multiple patterns. 344 /// 345 /// If callers need more flexible construction, or if one wants to change the 346 /// match semantics (either leftmost-first or leftmost-longest), then one can 347 /// use the [`Config`](struct.Config.html) and/or 348 /// [`Builder`](struct.Builder.html) types for more fine grained control. 349 /// 350 /// # Example 351 /// 352 /// This example shows how to create a searcher from an iterator of patterns. 353 /// By default, leftmost-first match semantics are used. 354 /// 355 /// ``` 356 /// use aho_corasick::packed::{MatchKind, Searcher}; 357 /// 358 /// # fn example() -> Option<()> { 359 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 360 /// let matches: Vec<usize> = searcher 361 /// .find_iter("foobar") 362 /// .map(|mat| mat.pattern()) 363 /// .collect(); 364 /// assert_eq!(vec![0], matches); 365 /// # Some(()) } 366 /// # if cfg!(target_arch = "x86_64") { 367 /// # example().unwrap() 368 /// # } else { 369 /// # assert!(example().is_none()); 370 /// # } 371 /// ``` 372 #[derive(Clone, Debug)] 373 pub struct Searcher { 374 patterns: Patterns, 375 rabinkarp: RabinKarp, 376 search_kind: SearchKind, 377 minimum_len: usize, 378 } 379 380 #[derive(Clone, Debug)] 381 enum SearchKind { 382 Teddy(Teddy), 383 RabinKarp, 384 } 385 386 impl Searcher { 387 /// A convenience function for constructing a searcher from an iterator 388 /// of things that can be converted to a `&[u8]`. 389 /// 390 /// If a searcher could not be constructed (either because of an 391 /// unsupported CPU or because there are too many patterns), then `None` 392 /// is returned. 393 /// 394 /// # Example 395 /// 396 /// Basic usage: 397 /// 398 /// ``` 399 /// use aho_corasick::packed::{MatchKind, Searcher}; 400 /// 401 /// # fn example() -> Option<()> { 402 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 403 /// let matches: Vec<usize> = searcher 404 /// .find_iter("foobar") 405 /// .map(|mat| mat.pattern()) 406 /// .collect(); 407 /// assert_eq!(vec![0], matches); 408 /// # Some(()) } 409 /// # if cfg!(target_arch = "x86_64") { 410 /// # example().unwrap() 411 /// # } else { 412 /// # assert!(example().is_none()); 413 /// # } 414 /// ``` new<I, P>(patterns: I) -> Option<Searcher> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,415 pub fn new<I, P>(patterns: I) -> Option<Searcher> 416 where 417 I: IntoIterator<Item = P>, 418 P: AsRef<[u8]>, 419 { 420 Builder::new().extend(patterns).build() 421 } 422 423 /// Return the first occurrence of any of the patterns in this searcher, 424 /// according to its match semantics, in the given haystack. The `Match` 425 /// returned will include the identifier of the pattern that matched, which 426 /// corresponds to the index of the pattern (starting from `0`) in which it 427 /// was added. 428 /// 429 /// # Example 430 /// 431 /// Basic usage: 432 /// 433 /// ``` 434 /// use aho_corasick::packed::{MatchKind, Searcher}; 435 /// 436 /// # fn example() -> Option<()> { 437 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 438 /// let mat = searcher.find("foobar")?; 439 /// assert_eq!(0, mat.pattern()); 440 /// assert_eq!(0, mat.start()); 441 /// assert_eq!(6, mat.end()); 442 /// # Some(()) } 443 /// # if cfg!(target_arch = "x86_64") { 444 /// # example().unwrap() 445 /// # } else { 446 /// # assert!(example().is_none()); 447 /// # } 448 /// ``` find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>449 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { 450 self.find_at(haystack, 0) 451 } 452 453 /// Return the first occurrence of any of the patterns in this searcher, 454 /// according to its match semantics, in the given haystack starting from 455 /// the given position. 456 /// 457 /// The `Match` returned will include the identifier of the pattern that 458 /// matched, which corresponds to the index of the pattern (starting from 459 /// `0`) in which it was added. The offsets in the `Match` will be relative 460 /// to the start of `haystack` (and not `at`). 461 /// 462 /// # Example 463 /// 464 /// Basic usage: 465 /// 466 /// ``` 467 /// use aho_corasick::packed::{MatchKind, Searcher}; 468 /// 469 /// # fn example() -> Option<()> { 470 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 471 /// let mat = searcher.find_at("foofoobar", 3)?; 472 /// assert_eq!(0, mat.pattern()); 473 /// assert_eq!(3, mat.start()); 474 /// assert_eq!(9, mat.end()); 475 /// # Some(()) } 476 /// # if cfg!(target_arch = "x86_64") { 477 /// # example().unwrap() 478 /// # } else { 479 /// # assert!(example().is_none()); 480 /// # } 481 /// ``` find_at<B: AsRef<[u8]>>( &self, haystack: B, at: usize, ) -> Option<Match>482 pub fn find_at<B: AsRef<[u8]>>( 483 &self, 484 haystack: B, 485 at: usize, 486 ) -> Option<Match> { 487 let haystack = haystack.as_ref(); 488 match self.search_kind { 489 SearchKind::Teddy(ref teddy) => { 490 if haystack[at..].len() < teddy.minimum_len() { 491 return self.slow_at(haystack, at); 492 } 493 teddy.find_at(&self.patterns, haystack, at) 494 } 495 SearchKind::RabinKarp => { 496 self.rabinkarp.find_at(&self.patterns, haystack, at) 497 } 498 } 499 } 500 501 /// Return an iterator of non-overlapping occurrences of the patterns in 502 /// this searcher, according to its match semantics, in the given haystack. 503 /// 504 /// # Example 505 /// 506 /// Basic usage: 507 /// 508 /// ``` 509 /// use aho_corasick::packed::{MatchKind, Searcher}; 510 /// 511 /// # fn example() -> Option<()> { 512 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 513 /// let matches: Vec<usize> = searcher 514 /// .find_iter("foobar fooba foofoo") 515 /// .map(|mat| mat.pattern()) 516 /// .collect(); 517 /// assert_eq!(vec![0, 1, 1, 1], matches); 518 /// # Some(()) } 519 /// # if cfg!(target_arch = "x86_64") { 520 /// # example().unwrap() 521 /// # } else { 522 /// # assert!(example().is_none()); 523 /// # } 524 /// ``` find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b>525 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( 526 &'a self, 527 haystack: &'b B, 528 ) -> FindIter<'a, 'b> { 529 FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 } 530 } 531 532 /// Returns the match kind used by this packed searcher. 533 /// 534 /// # Examples 535 /// 536 /// Basic usage: 537 /// 538 /// ``` 539 /// use aho_corasick::packed::{MatchKind, Searcher}; 540 /// 541 /// # fn example() -> Option<()> { 542 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 543 /// // leftmost-first is the default. 544 /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); 545 /// # Some(()) } 546 /// # if cfg!(target_arch = "x86_64") { 547 /// # example().unwrap() 548 /// # } else { 549 /// # assert!(example().is_none()); 550 /// # } 551 /// ``` match_kind(&self) -> &MatchKind552 pub fn match_kind(&self) -> &MatchKind { 553 self.patterns.match_kind() 554 } 555 556 /// Returns the minimum length of a haystack that is required in order for 557 /// packed searching to be effective. 558 /// 559 /// In some cases, the underlying packed searcher may not be able to search 560 /// very short haystacks. When that occurs, the implementation will defer 561 /// to a slower non-packed searcher (which is still generally faster than 562 /// Aho-Corasick for a small number of patterns). However, callers may 563 /// want to avoid ever using the slower variant, which one can do by 564 /// never passing a haystack shorter than the minimum length returned by 565 /// this method. minimum_len(&self) -> usize566 pub fn minimum_len(&self) -> usize { 567 self.minimum_len 568 } 569 570 /// Returns the approximate total amount of heap used by this searcher, in 571 /// units of bytes. heap_bytes(&self) -> usize572 pub fn heap_bytes(&self) -> usize { 573 self.patterns.heap_bytes() 574 + self.rabinkarp.heap_bytes() 575 + self.search_kind.heap_bytes() 576 } 577 578 /// Use a slow (non-packed) searcher. 579 /// 580 /// This is useful when a packed searcher could be constructed, but could 581 /// not be used to search a specific haystack. For example, if Teddy was 582 /// built but the haystack is smaller than ~34 bytes, then Teddy might not 583 /// be able to run. slow_at(&self, haystack: &[u8], at: usize) -> Option<Match>584 fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> { 585 self.rabinkarp.find_at(&self.patterns, haystack, at) 586 } 587 } 588 589 impl SearchKind { heap_bytes(&self) -> usize590 fn heap_bytes(&self) -> usize { 591 match *self { 592 SearchKind::Teddy(ref ted) => ted.heap_bytes(), 593 SearchKind::RabinKarp => 0, 594 } 595 } 596 } 597 598 /// An iterator over non-overlapping matches from a packed searcher. 599 /// 600 /// The lifetime `'s` refers to the lifetime of the underlying 601 /// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the 602 /// lifetime of the haystack being searched. 603 #[derive(Debug)] 604 pub struct FindIter<'s, 'h> { 605 searcher: &'s Searcher, 606 haystack: &'h [u8], 607 at: usize, 608 } 609 610 impl<'s, 'h> Iterator for FindIter<'s, 'h> { 611 type Item = Match; 612 next(&mut self) -> Option<Match>613 fn next(&mut self) -> Option<Match> { 614 if self.at > self.haystack.len() { 615 return None; 616 } 617 match self.searcher.find_at(&self.haystack, self.at) { 618 None => None, 619 Some(c) => { 620 self.at = c.end; 621 Some(c) 622 } 623 } 624 } 625 } 626