1 use std::u16; 2 3 use packed::pattern::Patterns; 4 use packed::rabinkarp::RabinKarp; 5 use packed::teddy::{self, Teddy}; 6 use Match; 7 8 /// This is a limit placed on the total number of patterns we're willing to try 9 /// and match at once. As more sophisticated algorithms are added, this number 10 /// may be increased. 11 const PATTERN_LIMIT: usize = 128; 12 13 /// A knob for controlling the match semantics of a packed multiple string 14 /// searcher. 15 /// 16 /// This differs from the 17 /// [`MatchKind`](../enum.MatchKind.html) 18 /// type in the top-level crate module in that it doesn't support 19 /// "standard" match semantics, and instead only supports leftmost-first or 20 /// leftmost-longest. Namely, "standard" semantics cannot be easily supported 21 /// by packed searchers. 22 /// 23 /// For more information on the distinction between leftmost-first and 24 /// leftmost-longest, see the docs on the top-level `MatchKind` type. 25 /// 26 /// Unlike the top-level `MatchKind` type, the default match semantics for this 27 /// type are leftmost-first. 28 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 29 pub enum MatchKind { 30 /// Use leftmost-first match semantics, which reports leftmost matches. 31 /// When there are multiple possible leftmost matches, the match 32 /// corresponding to the pattern that appeared earlier when constructing 33 /// the automaton is reported. 34 /// 35 /// This is the default. 36 LeftmostFirst, 37 /// Use leftmost-longest match semantics, which reports leftmost matches. 38 /// When there are multiple possible leftmost matches, the longest match 39 /// is chosen. 40 LeftmostLongest, 41 /// Hints that destructuring should not be exhaustive. 42 /// 43 /// This enum may grow additional variants, so this makes sure clients 44 /// don't count on exhaustive matching. (Otherwise, adding a new variant 45 /// could break existing code.) 46 #[doc(hidden)] 47 __Nonexhaustive, 48 } 49 50 impl Default for MatchKind { default() -> MatchKind51 fn default() -> MatchKind { 52 MatchKind::LeftmostFirst 53 } 54 } 55 56 /// The configuration for a packed multiple pattern searcher. 57 /// 58 /// The configuration is currently limited only to being able to select the 59 /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the 60 /// future, more knobs may be made available. 61 /// 62 /// A configuration produces a [`packed::Builder`](struct.Builder.html), which 63 /// in turn can be used to construct a 64 /// [`packed::Searcher`](struct.Searcher.html) for searching. 65 /// 66 /// # Example 67 /// 68 /// This example shows how to use leftmost-longest semantics instead of the 69 /// default (leftmost-first). 70 /// 71 /// ``` 72 /// use aho_corasick::packed::{Config, MatchKind}; 73 /// 74 /// # fn example() -> Option<()> { 75 /// let searcher = Config::new() 76 /// .match_kind(MatchKind::LeftmostLongest) 77 /// .builder() 78 /// .add("foo") 79 /// .add("foobar") 80 /// .build()?; 81 /// let matches: Vec<usize> = searcher 82 /// .find_iter("foobar") 83 /// .map(|mat| mat.pattern()) 84 /// .collect(); 85 /// assert_eq!(vec![1], matches); 86 /// # Some(()) } 87 /// # if cfg!(target_arch = "x86_64") { 88 /// # example().unwrap() 89 /// # } else { 90 /// # assert!(example().is_none()); 91 /// # } 92 /// ``` 93 #[derive(Clone, Debug)] 94 pub struct Config { 95 kind: MatchKind, 96 force: Option<ForceAlgorithm>, 97 force_teddy_fat: Option<bool>, 98 force_avx: Option<bool>, 99 } 100 101 /// An internal option for forcing the use of a particular packed algorithm. 102 /// 103 /// When an algorithm is forced, if a searcher could not be constructed for it, 104 /// then no searcher will be returned even if an alternative algorithm would 105 /// work. 106 #[derive(Clone, Debug)] 107 enum ForceAlgorithm { 108 Teddy, 109 RabinKarp, 110 } 111 112 impl Default for Config { default() -> Config113 fn default() -> Config { 114 Config::new() 115 } 116 } 117 118 impl Config { 119 /// Create a new default configuration. A default configuration uses 120 /// leftmost-first match semantics. new() -> Config121 pub fn new() -> Config { 122 Config { 123 kind: MatchKind::LeftmostFirst, 124 force: None, 125 force_teddy_fat: None, 126 force_avx: None, 127 } 128 } 129 130 /// Create a packed builder from this configuration. The builder can be 131 /// used to accumulate patterns and create a 132 /// [`Searcher`](struct.Searcher.html) 133 /// from them. builder(&self) -> Builder134 pub fn builder(&self) -> Builder { 135 Builder::from_config(self.clone()) 136 } 137 138 /// Set the match semantics for this configuration. match_kind(&mut self, kind: MatchKind) -> &mut Config139 pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { 140 self.kind = kind; 141 self 142 } 143 144 /// An undocumented method for forcing the use of the Teddy algorithm. 145 /// 146 /// This is only exposed for more precise testing and benchmarks. Callers 147 /// should not use it as it is not part of the API stability guarantees of 148 /// this crate. 149 #[doc(hidden)] force_teddy(&mut self, yes: bool) -> &mut Config150 pub fn force_teddy(&mut self, yes: bool) -> &mut Config { 151 if yes { 152 self.force = Some(ForceAlgorithm::Teddy); 153 } else { 154 self.force = None; 155 } 156 self 157 } 158 159 /// An undocumented method for forcing the use of the Fat Teddy algorithm. 160 /// 161 /// This is only exposed for more precise testing and benchmarks. Callers 162 /// should not use it as it is not part of the API stability guarantees of 163 /// this crate. 164 #[doc(hidden)] force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config165 pub fn force_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config { 166 self.force_teddy_fat = yes; 167 self 168 } 169 170 /// An undocumented method for forcing the use of SSE (`Some(false)`) or 171 /// AVX (`Some(true)`) algorithms. 172 /// 173 /// This is only exposed for more precise testing and benchmarks. Callers 174 /// should not use it as it is not part of the API stability guarantees of 175 /// this crate. 176 #[doc(hidden)] force_avx(&mut self, yes: Option<bool>) -> &mut Config177 pub fn force_avx(&mut self, yes: Option<bool>) -> &mut Config { 178 self.force_avx = yes; 179 self 180 } 181 182 /// An undocumented method for forcing the use of the Rabin-Karp algorithm. 183 /// 184 /// This is only exposed for more precise testing and benchmarks. Callers 185 /// should not use it as it is not part of the API stability guarantees of 186 /// this crate. 187 #[doc(hidden)] force_rabin_karp(&mut self, yes: bool) -> &mut Config188 pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config { 189 if yes { 190 self.force = Some(ForceAlgorithm::RabinKarp); 191 } else { 192 self.force = None; 193 } 194 self 195 } 196 } 197 198 /// A builder for constructing a packed searcher from a collection of patterns. 199 /// 200 /// # Example 201 /// 202 /// This example shows how to use a builder to construct a searcher. By 203 /// default, leftmost-first match semantics are used. 204 /// 205 /// ``` 206 /// use aho_corasick::packed::{Builder, MatchKind}; 207 /// 208 /// # fn example() -> Option<()> { 209 /// let searcher = Builder::new() 210 /// .add("foobar") 211 /// .add("foo") 212 /// .build()?; 213 /// let matches: Vec<usize> = searcher 214 /// .find_iter("foobar") 215 /// .map(|mat| mat.pattern()) 216 /// .collect(); 217 /// assert_eq!(vec![0], matches); 218 /// # Some(()) } 219 /// # if cfg!(target_arch = "x86_64") { 220 /// # example().unwrap() 221 /// # } else { 222 /// # assert!(example().is_none()); 223 /// # } 224 /// ``` 225 #[derive(Clone, Debug)] 226 pub struct Builder { 227 /// The configuration of this builder and subsequent matcher. 228 config: Config, 229 /// Set to true if the builder detects that a matcher cannot be built. 230 inert: bool, 231 /// The patterns provided by the caller. 232 patterns: Patterns, 233 } 234 235 impl Builder { 236 /// Create a new builder for constructing a multi-pattern searcher. This 237 /// constructor uses the default configuration. new() -> Builder238 pub fn new() -> Builder { 239 Builder::from_config(Config::new()) 240 } 241 from_config(config: Config) -> Builder242 fn from_config(config: Config) -> Builder { 243 Builder { config, inert: false, patterns: Patterns::new() } 244 } 245 246 /// Build a searcher from the patterns added to this builder so far. build(&self) -> Option<Searcher>247 pub fn build(&self) -> Option<Searcher> { 248 if self.inert || self.patterns.is_empty() { 249 return None; 250 } 251 let mut patterns = self.patterns.clone(); 252 patterns.set_match_kind(self.config.kind); 253 let rabinkarp = RabinKarp::new(&patterns); 254 // Effectively, we only want to return a searcher if we can use Teddy, 255 // since Teddy is our only fast packed searcher at the moment. 256 // Rabin-Karp is only used when searching haystacks smaller than what 257 // Teddy can support. Thus, the only way to get a Rabin-Karp searcher 258 // is to force it using undocumented APIs (for tests/benchmarks). 259 let (search_kind, minimum_len) = match self.config.force { 260 None | Some(ForceAlgorithm::Teddy) => { 261 let teddy = match self.build_teddy(&patterns) { 262 None => return None, 263 Some(teddy) => teddy, 264 }; 265 let minimum_len = teddy.minimum_len(); 266 (SearchKind::Teddy(teddy), minimum_len) 267 } 268 Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0), 269 }; 270 Some(Searcher { 271 config: self.config.clone(), 272 patterns, 273 rabinkarp, 274 search_kind, 275 minimum_len, 276 }) 277 } 278 build_teddy(&self, patterns: &Patterns) -> Option<Teddy>279 fn build_teddy(&self, patterns: &Patterns) -> Option<Teddy> { 280 teddy::Builder::new() 281 .avx(self.config.force_avx) 282 .fat(self.config.force_teddy_fat) 283 .build(&patterns) 284 } 285 286 /// Add the given pattern to this set to match. 287 /// 288 /// The order in which patterns are added is significant. Namely, when 289 /// using leftmost-first match semantics, then when multiple patterns can 290 /// match at a particular location, the pattern that was added first is 291 /// used as the match. 292 /// 293 /// If the number of patterns added exceeds the amount supported by packed 294 /// searchers, then the builder will stop accumulating patterns and render 295 /// itself inert. At this point, constructing a searcher will always return 296 /// `None`. add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder297 pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder { 298 if self.inert { 299 return self; 300 } else if self.patterns.len() >= PATTERN_LIMIT { 301 self.inert = true; 302 self.patterns.reset(); 303 return self; 304 } 305 // Just in case PATTERN_LIMIT increases beyond u16::MAX. 306 assert!(self.patterns.len() <= u16::MAX as usize); 307 308 let pattern = pattern.as_ref(); 309 if pattern.is_empty() { 310 self.inert = true; 311 self.patterns.reset(); 312 return self; 313 } 314 self.patterns.add(pattern); 315 self 316 } 317 318 /// Add the given iterator of patterns to this set to match. 319 /// 320 /// The iterator must yield elements that can be converted into a `&[u8]`. 321 /// 322 /// The order in which patterns are added is significant. Namely, when 323 /// using leftmost-first match semantics, then when multiple patterns can 324 /// match at a particular location, the pattern that was added first is 325 /// used as the match. 326 /// 327 /// If the number of patterns added exceeds the amount supported by packed 328 /// searchers, then the builder will stop accumulating patterns and render 329 /// itself inert. At this point, constructing a searcher will always return 330 /// `None`. extend<I, P>(&mut self, patterns: I) -> &mut Builder where I: IntoIterator<Item = P>, P: AsRef<[u8]>,331 pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder 332 where 333 I: IntoIterator<Item = P>, 334 P: AsRef<[u8]>, 335 { 336 for p in patterns { 337 self.add(p); 338 } 339 self 340 } 341 } 342 343 impl Default for Builder { default() -> Builder344 fn default() -> Builder { 345 Builder::new() 346 } 347 } 348 349 /// A packed searcher for quickly finding occurrences of multiple patterns. 350 /// 351 /// If callers need more flexible construction, or if one wants to change the 352 /// match semantics (either leftmost-first or leftmost-longest), then one can 353 /// use the [`Config`](struct.Config.html) and/or 354 /// [`Builder`](struct.Builder.html) types for more fine grained control. 355 /// 356 /// # Example 357 /// 358 /// This example shows how to create a searcher from an iterator of patterns. 359 /// By default, leftmost-first match semantics are used. 360 /// 361 /// ``` 362 /// use aho_corasick::packed::{MatchKind, Searcher}; 363 /// 364 /// # fn example() -> Option<()> { 365 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 366 /// let matches: Vec<usize> = searcher 367 /// .find_iter("foobar") 368 /// .map(|mat| mat.pattern()) 369 /// .collect(); 370 /// assert_eq!(vec![0], matches); 371 /// # Some(()) } 372 /// # if cfg!(target_arch = "x86_64") { 373 /// # example().unwrap() 374 /// # } else { 375 /// # assert!(example().is_none()); 376 /// # } 377 /// ``` 378 #[derive(Clone, Debug)] 379 pub struct Searcher { 380 config: Config, 381 patterns: Patterns, 382 rabinkarp: RabinKarp, 383 search_kind: SearchKind, 384 minimum_len: usize, 385 } 386 387 #[derive(Clone, Debug)] 388 enum SearchKind { 389 Teddy(Teddy), 390 RabinKarp, 391 } 392 393 impl Searcher { 394 /// A convenience function for constructing a searcher from an iterator 395 /// of things that can be converted to a `&[u8]`. 396 /// 397 /// If a searcher could not be constructed (either because of an 398 /// unsupported CPU or because there are too many patterns), then `None` 399 /// is returned. 400 /// 401 /// # Example 402 /// 403 /// Basic usage: 404 /// 405 /// ``` 406 /// use aho_corasick::packed::{MatchKind, Searcher}; 407 /// 408 /// # fn example() -> Option<()> { 409 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 410 /// let matches: Vec<usize> = searcher 411 /// .find_iter("foobar") 412 /// .map(|mat| mat.pattern()) 413 /// .collect(); 414 /// assert_eq!(vec![0], matches); 415 /// # Some(()) } 416 /// # if cfg!(target_arch = "x86_64") { 417 /// # example().unwrap() 418 /// # } else { 419 /// # assert!(example().is_none()); 420 /// # } 421 /// ``` new<I, P>(patterns: I) -> Option<Searcher> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,422 pub fn new<I, P>(patterns: I) -> Option<Searcher> 423 where 424 I: IntoIterator<Item = P>, 425 P: AsRef<[u8]>, 426 { 427 Builder::new().extend(patterns).build() 428 } 429 430 /// Return the first occurrence of any of the patterns in this searcher, 431 /// according to its match semantics, in the given haystack. The `Match` 432 /// returned will include the identifier of the pattern that matched, which 433 /// corresponds to the index of the pattern (starting from `0`) in which it 434 /// was added. 435 /// 436 /// # Example 437 /// 438 /// Basic usage: 439 /// 440 /// ``` 441 /// use aho_corasick::packed::{MatchKind, Searcher}; 442 /// 443 /// # fn example() -> Option<()> { 444 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 445 /// let mat = searcher.find("foobar")?; 446 /// assert_eq!(0, mat.pattern()); 447 /// assert_eq!(0, mat.start()); 448 /// assert_eq!(6, mat.end()); 449 /// # Some(()) } 450 /// # if cfg!(target_arch = "x86_64") { 451 /// # example().unwrap() 452 /// # } else { 453 /// # assert!(example().is_none()); 454 /// # } 455 /// ``` find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>456 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { 457 self.find_at(haystack, 0) 458 } 459 460 /// Return the first occurrence of any of the patterns in this searcher, 461 /// according to its match semantics, in the given haystack starting from 462 /// the given position. 463 /// 464 /// The `Match` returned will include the identifier of the pattern that 465 /// matched, which corresponds to the index of the pattern (starting from 466 /// `0`) in which it was added. The offsets in the `Match` will be relative 467 /// to the start of `haystack` (and not `at`). 468 /// 469 /// # Example 470 /// 471 /// Basic usage: 472 /// 473 /// ``` 474 /// use aho_corasick::packed::{MatchKind, Searcher}; 475 /// 476 /// # fn example() -> Option<()> { 477 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 478 /// let mat = searcher.find_at("foofoobar", 3)?; 479 /// assert_eq!(0, mat.pattern()); 480 /// assert_eq!(3, mat.start()); 481 /// assert_eq!(9, mat.end()); 482 /// # Some(()) } 483 /// # if cfg!(target_arch = "x86_64") { 484 /// # example().unwrap() 485 /// # } else { 486 /// # assert!(example().is_none()); 487 /// # } 488 /// ``` find_at<B: AsRef<[u8]>>( &self, haystack: B, at: usize, ) -> Option<Match>489 pub fn find_at<B: AsRef<[u8]>>( 490 &self, 491 haystack: B, 492 at: usize, 493 ) -> Option<Match> { 494 let haystack = haystack.as_ref(); 495 match self.search_kind { 496 SearchKind::Teddy(ref teddy) => { 497 if haystack[at..].len() < teddy.minimum_len() { 498 return self.slow_at(haystack, at); 499 } 500 teddy.find_at(&self.patterns, haystack, at) 501 } 502 SearchKind::RabinKarp => { 503 self.rabinkarp.find_at(&self.patterns, haystack, at) 504 } 505 } 506 } 507 508 /// Return an iterator of non-overlapping occurrences of the patterns in 509 /// this searcher, according to its match semantics, in the given haystack. 510 /// 511 /// # Example 512 /// 513 /// Basic usage: 514 /// 515 /// ``` 516 /// use aho_corasick::packed::{MatchKind, Searcher}; 517 /// 518 /// # fn example() -> Option<()> { 519 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 520 /// let matches: Vec<usize> = searcher 521 /// .find_iter("foobar fooba foofoo") 522 /// .map(|mat| mat.pattern()) 523 /// .collect(); 524 /// assert_eq!(vec![0, 1, 1, 1], matches); 525 /// # Some(()) } 526 /// # if cfg!(target_arch = "x86_64") { 527 /// # example().unwrap() 528 /// # } else { 529 /// # assert!(example().is_none()); 530 /// # } 531 /// ``` find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b>532 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( 533 &'a self, 534 haystack: &'b B, 535 ) -> FindIter<'a, 'b> { 536 FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 } 537 } 538 539 /// Returns the match kind used by this packed searcher. 540 /// 541 /// # Examples 542 /// 543 /// Basic usage: 544 /// 545 /// ``` 546 /// use aho_corasick::packed::{MatchKind, Searcher}; 547 /// 548 /// # fn example() -> Option<()> { 549 /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; 550 /// // leftmost-first is the default. 551 /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); 552 /// # Some(()) } 553 /// # if cfg!(target_arch = "x86_64") { 554 /// # example().unwrap() 555 /// # } else { 556 /// # assert!(example().is_none()); 557 /// # } 558 /// ``` match_kind(&self) -> &MatchKind559 pub fn match_kind(&self) -> &MatchKind { 560 self.patterns.match_kind() 561 } 562 563 /// Returns the minimum length of a haystack that is required in order for 564 /// packed searching to be effective. 565 /// 566 /// In some cases, the underlying packed searcher may not be able to search 567 /// very short haystacks. When that occurs, the implementation will defer 568 /// to a slower non-packed searcher (which is still generally faster than 569 /// Aho-Corasick for a small number of patterns). However, callers may 570 /// want to avoid ever using the slower variant, which one can do by 571 /// never passing a haystack shorter than the minimum length returned by 572 /// this method. minimum_len(&self) -> usize573 pub fn minimum_len(&self) -> usize { 574 self.minimum_len 575 } 576 577 /// Returns the approximate total amount of heap used by this searcher, in 578 /// units of bytes. heap_bytes(&self) -> usize579 pub fn heap_bytes(&self) -> usize { 580 self.patterns.heap_bytes() 581 + self.rabinkarp.heap_bytes() 582 + self.search_kind.heap_bytes() 583 } 584 585 /// Use a slow (non-packed) searcher. 586 /// 587 /// This is useful when a packed searcher could be constructed, but could 588 /// not be used to search a specific haystack. For example, if Teddy was 589 /// built but the haystack is smaller than ~34 bytes, then Teddy might not 590 /// be able to run. slow_at(&self, haystack: &[u8], at: usize) -> Option<Match>591 fn slow_at(&self, haystack: &[u8], at: usize) -> Option<Match> { 592 self.rabinkarp.find_at(&self.patterns, haystack, at) 593 } 594 } 595 596 impl SearchKind { heap_bytes(&self) -> usize597 fn heap_bytes(&self) -> usize { 598 match *self { 599 SearchKind::Teddy(ref ted) => ted.heap_bytes(), 600 SearchKind::RabinKarp => 0, 601 } 602 } 603 } 604 605 /// An iterator over non-overlapping matches from a packed searcher. 606 /// 607 /// The lifetime `'s` refers to the lifetime of the underlying 608 /// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the 609 /// lifetime of the haystack being searched. 610 #[derive(Debug)] 611 pub struct FindIter<'s, 'h> { 612 searcher: &'s Searcher, 613 haystack: &'h [u8], 614 at: usize, 615 } 616 617 impl<'s, 'h> Iterator for FindIter<'s, 'h> { 618 type Item = Match; 619 next(&mut self) -> Option<Match>620 fn next(&mut self) -> Option<Match> { 621 if self.at > self.haystack.len() { 622 return None; 623 } 624 match self.searcher.find_at(&self.haystack, self.at) { 625 None => None, 626 Some(c) => { 627 self.at = c.end; 628 Some(c) 629 } 630 } 631 } 632 } 633