1 use std::io; 2 3 use crate::automaton::Automaton; 4 use crate::buffer::Buffer; 5 use crate::dfa::{self, DFA}; 6 use crate::error::Result; 7 use crate::nfa::{self, NFA}; 8 use crate::packed; 9 use crate::prefilter::{Prefilter, PrefilterState}; 10 use crate::state_id::StateID; 11 use crate::Match; 12 13 /// An automaton for searching multiple strings in linear time. 14 /// 15 /// The `AhoCorasick` type supports a few basic ways of constructing an 16 /// automaton, including 17 /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new) 18 /// and 19 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). 20 /// However, there are a fair number of configurable options that can be set 21 /// by using 22 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) 23 /// instead. Such options include, but are not limited to, how matches are 24 /// determined, simple case insensitivity, whether to use a DFA or not and 25 /// various knobs for controlling the space-vs-time trade offs taken when 26 /// building the automaton. 27 /// 28 /// If you aren't sure where to start, try beginning with 29 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). 30 /// 31 /// # Resource usage 32 /// 33 /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p` 34 /// is the combined length of all patterns being searched. With that said, 35 /// building an automaton can be fairly costly because of high constant 36 /// factors, particularly when enabling the 37 /// [DFA](struct.AhoCorasickBuilder.html#method.dfa) 38 /// option (which is disabled by default). For this reason, it's generally a 39 /// good idea to build an automaton once and reuse it as much as possible. 40 /// 41 /// Aho-Corasick automatons can also use a fair bit of memory. To get a 42 /// concrete idea of how much memory is being used, try using the 43 /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes) 44 /// method. 45 /// 46 /// # Examples 47 /// 48 /// This example shows how to search for occurrences of multiple patterns 49 /// simultaneously in a case insensitive fashion. Each match includes the 50 /// pattern that matched along with the byte offsets of the match. 51 /// 52 /// ``` 53 /// use aho_corasick::AhoCorasickBuilder; 54 /// 55 /// let patterns = &["apple", "maple", "snapple"]; 56 /// let haystack = "Nobody likes maple in their apple flavored Snapple."; 57 /// 58 /// let ac = AhoCorasickBuilder::new() 59 /// .ascii_case_insensitive(true) 60 /// .build(patterns); 61 /// let mut matches = vec![]; 62 /// for mat in ac.find_iter(haystack) { 63 /// matches.push((mat.pattern(), mat.start(), mat.end())); 64 /// } 65 /// assert_eq!(matches, vec![ 66 /// (1, 13, 18), 67 /// (0, 28, 33), 68 /// (2, 43, 50), 69 /// ]); 70 /// ``` 71 /// 72 /// This example shows how to replace matches with some other string: 73 /// 74 /// ``` 75 /// use aho_corasick::AhoCorasick; 76 /// 77 /// let patterns = &["fox", "brown", "quick"]; 78 /// let haystack = "The quick brown fox."; 79 /// let replace_with = &["sloth", "grey", "slow"]; 80 /// 81 /// let ac = AhoCorasick::new(patterns); 82 /// let result = ac.replace_all(haystack, replace_with); 83 /// assert_eq!(result, "The slow grey sloth."); 84 /// ``` 85 #[derive(Clone, Debug)] 86 pub struct AhoCorasick<S: StateID = usize> { 87 imp: Imp<S>, 88 match_kind: MatchKind, 89 } 90 91 impl AhoCorasick { 92 /// Create a new Aho-Corasick automaton using the default configuration. 93 /// 94 /// The default configuration optimizes for less space usage, but at the 95 /// expense of longer search times. To change the configuration, use 96 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) 97 /// for fine-grained control, or 98 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) 99 /// for automatic configuration if you aren't sure which settings to pick. 100 /// 101 /// This uses the default 102 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) 103 /// match semantics, which reports a match as soon as it is found. This 104 /// corresponds to the standard match semantics supported by textbook 105 /// descriptions of the Aho-Corasick algorithm. 106 /// 107 /// # Examples 108 /// 109 /// Basic usage: 110 /// 111 /// ``` 112 /// use aho_corasick::AhoCorasick; 113 /// 114 /// let ac = AhoCorasick::new(&[ 115 /// "foo", "bar", "baz", 116 /// ]); 117 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 118 /// ``` new<I, P>(patterns: I) -> AhoCorasick where I: IntoIterator<Item = P>, P: AsRef<[u8]>,119 pub fn new<I, P>(patterns: I) -> AhoCorasick 120 where 121 I: IntoIterator<Item = P>, 122 P: AsRef<[u8]>, 123 { 124 AhoCorasickBuilder::new().build(patterns) 125 } 126 127 /// Build an Aho-Corasick automaton with an automatically determined 128 /// configuration. 129 /// 130 /// Specifically, this requires a slice of patterns instead of an iterator 131 /// since the configuration is determined by looking at the patterns before 132 /// constructing the automaton. The idea here is to balance space and time 133 /// automatically. That is, when searching a small number of patterns, this 134 /// will attempt to use the fastest possible configuration since the total 135 /// space required will be small anyway. As the number of patterns grows, 136 /// this will fall back to slower configurations that use less space. 137 /// 138 /// If you want auto configuration but with match semantics different from 139 /// the default `MatchKind::Standard`, then use 140 /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure). 141 /// 142 /// # Examples 143 /// 144 /// Basic usage is just like `new`, except you must provide a slice: 145 /// 146 /// ``` 147 /// use aho_corasick::AhoCorasick; 148 /// 149 /// let ac = AhoCorasick::new_auto_configured(&[ 150 /// "foo", "bar", "baz", 151 /// ]); 152 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 153 /// ``` new_auto_configured<B>(patterns: &[B]) -> AhoCorasick where B: AsRef<[u8]>,154 pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick 155 where 156 B: AsRef<[u8]>, 157 { 158 AhoCorasickBuilder::new().auto_configure(patterns).build(patterns) 159 } 160 } 161 162 impl<S: StateID> AhoCorasick<S> { 163 /// Returns true if and only if this automaton matches the haystack at any 164 /// position. 165 /// 166 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 167 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 168 /// `&[u8]` itself. 169 /// 170 /// # Examples 171 /// 172 /// Basic usage: 173 /// 174 /// ``` 175 /// use aho_corasick::AhoCorasick; 176 /// 177 /// let ac = AhoCorasick::new(&[ 178 /// "foo", "bar", "quux", "baz", 179 /// ]); 180 /// assert!(ac.is_match("xxx bar xxx")); 181 /// assert!(!ac.is_match("xxx qux xxx")); 182 /// ``` is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool183 pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool { 184 self.earliest_find(haystack).is_some() 185 } 186 187 /// Returns the location of the first detected match in `haystack`. 188 /// 189 /// This method has the same behavior regardless of the 190 /// [`MatchKind`](enum.MatchKind.html) 191 /// of this automaton. 192 /// 193 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 194 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 195 /// `&[u8]` itself. 196 /// 197 /// # Examples 198 /// 199 /// Basic usage: 200 /// 201 /// ``` 202 /// use aho_corasick::AhoCorasick; 203 /// 204 /// let ac = AhoCorasick::new(&[ 205 /// "abc", "b", 206 /// ]); 207 /// let mat = ac.earliest_find("abcd").expect("should have match"); 208 /// assert_eq!(1, mat.pattern()); 209 /// assert_eq!((1, 2), (mat.start(), mat.end())); 210 /// ``` earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>211 pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { 212 let mut prestate = PrefilterState::new(self.max_pattern_len()); 213 let mut start = self.imp.start_state(); 214 self.imp.earliest_find_at( 215 &mut prestate, 216 haystack.as_ref(), 217 0, 218 &mut start, 219 ) 220 } 221 222 /// Returns the location of the first match according to the match 223 /// semantics that this automaton was constructed with. 224 /// 225 /// When using `MatchKind::Standard`, this corresponds precisely to the 226 /// same behavior as 227 /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find). 228 /// Otherwise, match semantics correspond to either 229 /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) 230 /// or 231 /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest). 232 /// 233 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 234 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 235 /// `&[u8]` itself. 236 /// 237 /// # Examples 238 /// 239 /// Basic usage, with standard semantics: 240 /// 241 /// ``` 242 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 243 /// 244 /// let patterns = &["b", "abc", "abcd"]; 245 /// let haystack = "abcd"; 246 /// 247 /// let ac = AhoCorasickBuilder::new() 248 /// .match_kind(MatchKind::Standard) // default, not necessary 249 /// .build(patterns); 250 /// let mat = ac.find(haystack).expect("should have a match"); 251 /// assert_eq!("b", &haystack[mat.start()..mat.end()]); 252 /// ``` 253 /// 254 /// Now with leftmost-first semantics: 255 /// 256 /// ``` 257 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 258 /// 259 /// let patterns = &["b", "abc", "abcd"]; 260 /// let haystack = "abcd"; 261 /// 262 /// let ac = AhoCorasickBuilder::new() 263 /// .match_kind(MatchKind::LeftmostFirst) 264 /// .build(patterns); 265 /// let mat = ac.find(haystack).expect("should have a match"); 266 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); 267 /// ``` 268 /// 269 /// And finally, leftmost-longest semantics: 270 /// 271 /// ``` 272 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 273 /// 274 /// let patterns = &["b", "abc", "abcd"]; 275 /// let haystack = "abcd"; 276 /// 277 /// let ac = AhoCorasickBuilder::new() 278 /// .match_kind(MatchKind::LeftmostLongest) 279 /// .build(patterns); 280 /// let mat = ac.find(haystack).expect("should have a match"); 281 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); 282 /// ``` find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>283 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { 284 let mut prestate = PrefilterState::new(self.max_pattern_len()); 285 self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0) 286 } 287 288 /// Returns an iterator of non-overlapping matches, using the match 289 /// semantics that this automaton was constructed with. 290 /// 291 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 292 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 293 /// `&[u8]` itself. 294 /// 295 /// # Examples 296 /// 297 /// Basic usage, with standard semantics: 298 /// 299 /// ``` 300 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 301 /// 302 /// let patterns = &["append", "appendage", "app"]; 303 /// let haystack = "append the app to the appendage"; 304 /// 305 /// let ac = AhoCorasickBuilder::new() 306 /// .match_kind(MatchKind::Standard) // default, not necessary 307 /// .build(patterns); 308 /// let matches: Vec<usize> = ac 309 /// .find_iter(haystack) 310 /// .map(|mat| mat.pattern()) 311 /// .collect(); 312 /// assert_eq!(vec![2, 2, 2], matches); 313 /// ``` 314 /// 315 /// Now with leftmost-first semantics: 316 /// 317 /// ``` 318 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 319 /// 320 /// let patterns = &["append", "appendage", "app"]; 321 /// let haystack = "append the app to the appendage"; 322 /// 323 /// let ac = AhoCorasickBuilder::new() 324 /// .match_kind(MatchKind::LeftmostFirst) 325 /// .build(patterns); 326 /// let matches: Vec<usize> = ac 327 /// .find_iter(haystack) 328 /// .map(|mat| mat.pattern()) 329 /// .collect(); 330 /// assert_eq!(vec![0, 2, 0], matches); 331 /// ``` 332 /// 333 /// And finally, leftmost-longest semantics: 334 /// 335 /// ``` 336 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 337 /// 338 /// let patterns = &["append", "appendage", "app"]; 339 /// let haystack = "append the app to the appendage"; 340 /// 341 /// let ac = AhoCorasickBuilder::new() 342 /// .match_kind(MatchKind::LeftmostLongest) 343 /// .build(patterns); 344 /// let matches: Vec<usize> = ac 345 /// .find_iter(haystack) 346 /// .map(|mat| mat.pattern()) 347 /// .collect(); 348 /// assert_eq!(vec![0, 2, 1], matches); 349 /// ``` find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b, S>350 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( 351 &'a self, 352 haystack: &'b B, 353 ) -> FindIter<'a, 'b, S> { 354 FindIter::new(self, haystack.as_ref()) 355 } 356 357 /// Returns an iterator of overlapping matches in the given `haystack`. 358 /// 359 /// Overlapping matches can _only_ be detected using 360 /// `MatchKind::Standard` semantics. If this automaton was constructed with 361 /// leftmost semantics, then this method will panic. To determine whether 362 /// this will panic at runtime, use the 363 /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping) 364 /// method. 365 /// 366 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 367 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 368 /// `&[u8]` itself. 369 /// 370 /// # Panics 371 /// 372 /// This panics when `AhoCorasick::supports_overlapping` returns `false`. 373 /// That is, this panics when this automaton's match semantics are not 374 /// `MatchKind::Standard`. 375 /// 376 /// # Examples 377 /// 378 /// Basic usage, with standard semantics: 379 /// 380 /// ``` 381 /// use aho_corasick::AhoCorasick; 382 /// 383 /// let patterns = &["append", "appendage", "app"]; 384 /// let haystack = "append the app to the appendage"; 385 /// 386 /// let ac = AhoCorasick::new(patterns); 387 /// let matches: Vec<usize> = ac 388 /// .find_overlapping_iter(haystack) 389 /// .map(|mat| mat.pattern()) 390 /// .collect(); 391 /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches); 392 /// ``` find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindOverlappingIter<'a, 'b, S>393 pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( 394 &'a self, 395 haystack: &'b B, 396 ) -> FindOverlappingIter<'a, 'b, S> { 397 FindOverlappingIter::new(self, haystack.as_ref()) 398 } 399 400 /// Replace all matches with a corresponding value in the `replace_with` 401 /// slice given. Matches correspond to the same matches as reported by 402 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 403 /// 404 /// Replacements are determined by the index of the matching pattern. 405 /// For example, if the pattern with index `2` is found, then it is 406 /// replaced by `replace_with[2]`. 407 /// 408 /// # Panics 409 /// 410 /// This panics when `replace_with.len()` does not equal the total number 411 /// of patterns that are matched by this automaton. 412 /// 413 /// # Examples 414 /// 415 /// Basic usage: 416 /// 417 /// ``` 418 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 419 /// 420 /// let patterns = &["append", "appendage", "app"]; 421 /// let haystack = "append the app to the appendage"; 422 /// 423 /// let ac = AhoCorasickBuilder::new() 424 /// .match_kind(MatchKind::LeftmostFirst) 425 /// .build(patterns); 426 /// let result = ac.replace_all(haystack, &["x", "y", "z"]); 427 /// assert_eq!("x the z to the xage", result); 428 /// ``` replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String where B: AsRef<str>,429 pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String 430 where 431 B: AsRef<str>, 432 { 433 assert_eq!( 434 replace_with.len(), 435 self.pattern_count(), 436 "replace_all requires a replacement for every pattern \ 437 in the automaton" 438 ); 439 let mut dst = String::with_capacity(haystack.len()); 440 self.replace_all_with(haystack, &mut dst, |mat, _, dst| { 441 dst.push_str(replace_with[mat.pattern()].as_ref()); 442 true 443 }); 444 dst 445 } 446 447 /// Replace all matches using raw bytes with a corresponding value in the 448 /// `replace_with` slice given. Matches correspond to the same matches as 449 /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter). 450 /// 451 /// Replacements are determined by the index of the matching pattern. 452 /// For example, if the pattern with index `2` is found, then it is 453 /// replaced by `replace_with[2]`. 454 /// 455 /// # Panics 456 /// 457 /// This panics when `replace_with.len()` does not equal the total number 458 /// of patterns that are matched by this automaton. 459 /// 460 /// # Examples 461 /// 462 /// Basic usage: 463 /// 464 /// ``` 465 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 466 /// 467 /// let patterns = &["append", "appendage", "app"]; 468 /// let haystack = b"append the app to the appendage"; 469 /// 470 /// let ac = AhoCorasickBuilder::new() 471 /// .match_kind(MatchKind::LeftmostFirst) 472 /// .build(patterns); 473 /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); 474 /// assert_eq!(b"x the z to the xage".to_vec(), result); 475 /// ``` replace_all_bytes<B>( &self, haystack: &[u8], replace_with: &[B], ) -> Vec<u8> where B: AsRef<[u8]>,476 pub fn replace_all_bytes<B>( 477 &self, 478 haystack: &[u8], 479 replace_with: &[B], 480 ) -> Vec<u8> 481 where 482 B: AsRef<[u8]>, 483 { 484 assert_eq!( 485 replace_with.len(), 486 self.pattern_count(), 487 "replace_all_bytes requires a replacement for every pattern \ 488 in the automaton" 489 ); 490 let mut dst = Vec::with_capacity(haystack.len()); 491 self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { 492 dst.extend(replace_with[mat.pattern()].as_ref()); 493 true 494 }); 495 dst 496 } 497 498 /// Replace all matches using a closure called on each match. 499 /// Matches correspond to the same matches as reported by 500 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 501 /// 502 /// The closure accepts three parameters: the match found, the text of 503 /// the match and a string buffer with which to write the replaced text 504 /// (if any). If the closure returns `true`, then it continues to the next 505 /// match. If the closure returns `false`, then searching is stopped. 506 /// 507 /// # Examples 508 /// 509 /// Basic usage: 510 /// 511 /// ``` 512 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 513 /// 514 /// let patterns = &["append", "appendage", "app"]; 515 /// let haystack = "append the app to the appendage"; 516 /// 517 /// let ac = AhoCorasickBuilder::new() 518 /// .match_kind(MatchKind::LeftmostFirst) 519 /// .build(patterns); 520 /// let mut result = String::new(); 521 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { 522 /// dst.push_str(&mat.pattern().to_string()); 523 /// true 524 /// }); 525 /// assert_eq!("0 the 2 to the 0age", result); 526 /// ``` 527 /// 528 /// Stopping the replacement by returning `false` (continued from the 529 /// example above): 530 /// 531 /// ``` 532 /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; 533 /// # let patterns = &["append", "appendage", "app"]; 534 /// # let haystack = "append the app to the appendage"; 535 /// # let ac = AhoCorasickBuilder::new() 536 /// # .match_kind(MatchKind::LeftmostFirst) 537 /// # .build(patterns); 538 /// let mut result = String::new(); 539 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { 540 /// dst.push_str(&mat.pattern().to_string()); 541 /// mat.pattern() != 2 542 /// }); 543 /// assert_eq!("0 the 2 to the appendage", result); 544 /// ``` replace_all_with<F>( &self, haystack: &str, dst: &mut String, mut replace_with: F, ) where F: FnMut(&Match, &str, &mut String) -> bool,545 pub fn replace_all_with<F>( 546 &self, 547 haystack: &str, 548 dst: &mut String, 549 mut replace_with: F, 550 ) where 551 F: FnMut(&Match, &str, &mut String) -> bool, 552 { 553 let mut last_match = 0; 554 for mat in self.find_iter(haystack) { 555 dst.push_str(&haystack[last_match..mat.start()]); 556 last_match = mat.end(); 557 if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { 558 break; 559 }; 560 } 561 dst.push_str(&haystack[last_match..]); 562 } 563 564 /// Replace all matches using raw bytes with a closure called on each 565 /// match. Matches correspond to the same matches as reported by 566 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 567 /// 568 /// The closure accepts three parameters: the match found, the text of 569 /// the match and a byte buffer with which to write the replaced text 570 /// (if any). If the closure returns `true`, then it continues to the next 571 /// match. If the closure returns `false`, then searching is stopped. 572 /// 573 /// # Examples 574 /// 575 /// Basic usage: 576 /// 577 /// ``` 578 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 579 /// 580 /// let patterns = &["append", "appendage", "app"]; 581 /// let haystack = b"append the app to the appendage"; 582 /// 583 /// let ac = AhoCorasickBuilder::new() 584 /// .match_kind(MatchKind::LeftmostFirst) 585 /// .build(patterns); 586 /// let mut result = vec![]; 587 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { 588 /// dst.extend(mat.pattern().to_string().bytes()); 589 /// true 590 /// }); 591 /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); 592 /// ``` 593 /// 594 /// Stopping the replacement by returning `false` (continued from the 595 /// example above): 596 /// 597 /// ``` 598 /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; 599 /// # let patterns = &["append", "appendage", "app"]; 600 /// # let haystack = b"append the app to the appendage"; 601 /// # let ac = AhoCorasickBuilder::new() 602 /// # .match_kind(MatchKind::LeftmostFirst) 603 /// # .build(patterns); 604 /// let mut result = vec![]; 605 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { 606 /// dst.extend(mat.pattern().to_string().bytes()); 607 /// mat.pattern() != 2 608 /// }); 609 /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); 610 /// ``` replace_all_with_bytes<F>( &self, haystack: &[u8], dst: &mut Vec<u8>, mut replace_with: F, ) where F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,611 pub fn replace_all_with_bytes<F>( 612 &self, 613 haystack: &[u8], 614 dst: &mut Vec<u8>, 615 mut replace_with: F, 616 ) where 617 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, 618 { 619 let mut last_match = 0; 620 for mat in self.find_iter(haystack) { 621 dst.extend(&haystack[last_match..mat.start()]); 622 last_match = mat.end(); 623 if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { 624 break; 625 }; 626 } 627 dst.extend(&haystack[last_match..]); 628 } 629 630 /// Returns an iterator of non-overlapping matches in the given 631 /// stream. Matches correspond to the same matches as reported by 632 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 633 /// 634 /// The matches yielded by this iterator use absolute position offsets in 635 /// the stream given, where the first byte has index `0`. Matches are 636 /// yieled until the stream is exhausted. 637 /// 638 /// Each item yielded by the iterator is an `io::Result<Match>`, where an 639 /// error is yielded if there was a problem reading from the reader given. 640 /// 641 /// When searching a stream, an internal buffer is used. Therefore, callers 642 /// should avoiding providing a buffered reader, if possible. 643 /// 644 /// Searching a stream requires that the automaton was built with 645 /// `MatchKind::Standard` semantics. If this automaton was constructed 646 /// with leftmost semantics, then this method will panic. To determine 647 /// whether this will panic at runtime, use the 648 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) 649 /// method. 650 /// 651 /// # Memory usage 652 /// 653 /// In general, searching streams will use a constant amount of memory for 654 /// its internal buffer. The one requirement is that the internal buffer 655 /// must be at least the size of the longest possible match. In most use 656 /// cases, the default buffer size will be much larger than any individual 657 /// match. 658 /// 659 /// # Panics 660 /// 661 /// This panics when `AhoCorasick::supports_stream` returns `false`. 662 /// That is, this panics when this automaton's match semantics are not 663 /// `MatchKind::Standard`. This restriction may be lifted in the future. 664 /// 665 /// # Examples 666 /// 667 /// Basic usage: 668 /// 669 /// ``` 670 /// use aho_corasick::AhoCorasick; 671 /// 672 /// # fn example() -> Result<(), ::std::io::Error> { 673 /// let patterns = &["append", "appendage", "app"]; 674 /// let haystack = "append the app to the appendage"; 675 /// 676 /// let ac = AhoCorasick::new(patterns); 677 /// let mut matches = vec![]; 678 /// for result in ac.stream_find_iter(haystack.as_bytes()) { 679 /// let mat = result?; 680 /// matches.push(mat.pattern()); 681 /// } 682 /// assert_eq!(vec![2, 2, 2], matches); 683 /// # Ok(()) }; example().unwrap() 684 /// ``` stream_find_iter<'a, R: io::Read>( &'a self, rdr: R, ) -> StreamFindIter<'a, R, S>685 pub fn stream_find_iter<'a, R: io::Read>( 686 &'a self, 687 rdr: R, 688 ) -> StreamFindIter<'a, R, S> { 689 StreamFindIter::new(self, rdr) 690 } 691 692 /// Search for and replace all matches of this automaton in 693 /// the given reader, and write the replacements to the given 694 /// writer. Matches correspond to the same matches as reported by 695 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 696 /// 697 /// Replacements are determined by the index of the matching pattern. 698 /// For example, if the pattern with index `2` is found, then it is 699 /// replaced by `replace_with[2]`. 700 /// 701 /// After all matches are replaced, the writer is _not_ flushed. 702 /// 703 /// If there was a problem reading from the given reader or writing to the 704 /// given writer, then the corresponding `io::Error` is returned and all 705 /// replacement is stopped. 706 /// 707 /// When searching a stream, an internal buffer is used. Therefore, callers 708 /// should avoiding providing a buffered reader, if possible. However, 709 /// callers may want to provide a buffered writer. 710 /// 711 /// Searching a stream requires that the automaton was built with 712 /// `MatchKind::Standard` semantics. If this automaton was constructed 713 /// with leftmost semantics, then this method will panic. To determine 714 /// whether this will panic at runtime, use the 715 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) 716 /// method. 717 /// 718 /// # Memory usage 719 /// 720 /// In general, searching streams will use a constant amount of memory for 721 /// its internal buffer. The one requirement is that the internal buffer 722 /// must be at least the size of the longest possible match. In most use 723 /// cases, the default buffer size will be much larger than any individual 724 /// match. 725 /// 726 /// # Panics 727 /// 728 /// This panics when `AhoCorasick::supports_stream` returns `false`. 729 /// That is, this panics when this automaton's match semantics are not 730 /// `MatchKind::Standard`. This restriction may be lifted in the future. 731 /// 732 /// # Examples 733 /// 734 /// Basic usage: 735 /// 736 /// ``` 737 /// use aho_corasick::AhoCorasick; 738 /// 739 /// # fn example() -> Result<(), ::std::io::Error> { 740 /// let patterns = &["fox", "brown", "quick"]; 741 /// let haystack = "The quick brown fox."; 742 /// let replace_with = &["sloth", "grey", "slow"]; 743 /// 744 /// let ac = AhoCorasick::new(patterns); 745 /// let mut result = vec![]; 746 /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?; 747 /// assert_eq!(b"The slow grey sloth.".to_vec(), result); 748 /// # Ok(()) }; example().unwrap() 749 /// ``` stream_replace_all<R, W, B>( &self, rdr: R, wtr: W, replace_with: &[B], ) -> io::Result<()> where R: io::Read, W: io::Write, B: AsRef<[u8]>,750 pub fn stream_replace_all<R, W, B>( 751 &self, 752 rdr: R, 753 wtr: W, 754 replace_with: &[B], 755 ) -> io::Result<()> 756 where 757 R: io::Read, 758 W: io::Write, 759 B: AsRef<[u8]>, 760 { 761 assert_eq!( 762 replace_with.len(), 763 self.pattern_count(), 764 "stream_replace_all requires a replacement for every pattern \ 765 in the automaton" 766 ); 767 self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| { 768 wtr.write_all(replace_with[mat.pattern()].as_ref()) 769 }) 770 } 771 772 /// Search the given reader and replace all matches of this automaton 773 /// using the given closure. The result is written to the given 774 /// writer. Matches correspond to the same matches as reported by 775 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 776 /// 777 /// The closure accepts three parameters: the match found, the text of 778 /// the match and the writer with which to write the replaced text (if any). 779 /// 780 /// After all matches are replaced, the writer is _not_ flushed. 781 /// 782 /// If there was a problem reading from the given reader or writing to the 783 /// given writer, then the corresponding `io::Error` is returned and all 784 /// replacement is stopped. 785 /// 786 /// When searching a stream, an internal buffer is used. Therefore, callers 787 /// should avoiding providing a buffered reader, if possible. However, 788 /// callers may want to provide a buffered writer. 789 /// 790 /// Searching a stream requires that the automaton was built with 791 /// `MatchKind::Standard` semantics. If this automaton was constructed 792 /// with leftmost semantics, then this method will panic. To determine 793 /// whether this will panic at runtime, use the 794 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) 795 /// method. 796 /// 797 /// # Memory usage 798 /// 799 /// In general, searching streams will use a constant amount of memory for 800 /// its internal buffer. The one requirement is that the internal buffer 801 /// must be at least the size of the longest possible match. In most use 802 /// cases, the default buffer size will be much larger than any individual 803 /// match. 804 /// 805 /// # Panics 806 /// 807 /// This panics when `AhoCorasick::supports_stream` returns `false`. 808 /// That is, this panics when this automaton's match semantics are not 809 /// `MatchKind::Standard`. This restriction may be lifted in the future. 810 /// 811 /// # Examples 812 /// 813 /// Basic usage: 814 /// 815 /// ``` 816 /// use std::io::Write; 817 /// use aho_corasick::AhoCorasick; 818 /// 819 /// # fn example() -> Result<(), ::std::io::Error> { 820 /// let patterns = &["fox", "brown", "quick"]; 821 /// let haystack = "The quick brown fox."; 822 /// 823 /// let ac = AhoCorasick::new(patterns); 824 /// let mut result = vec![]; 825 /// ac.stream_replace_all_with( 826 /// haystack.as_bytes(), 827 /// &mut result, 828 /// |mat, _, wtr| { 829 /// wtr.write_all(mat.pattern().to_string().as_bytes()) 830 /// }, 831 /// )?; 832 /// assert_eq!(b"The 2 1 0.".to_vec(), result); 833 /// # Ok(()) }; example().unwrap() 834 /// ``` stream_replace_all_with<R, W, F>( &self, rdr: R, mut wtr: W, mut replace_with: F, ) -> io::Result<()> where R: io::Read, W: io::Write, F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,835 pub fn stream_replace_all_with<R, W, F>( 836 &self, 837 rdr: R, 838 mut wtr: W, 839 mut replace_with: F, 840 ) -> io::Result<()> 841 where 842 R: io::Read, 843 W: io::Write, 844 F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>, 845 { 846 let mut it = StreamChunkIter::new(self, rdr); 847 while let Some(result) = it.next() { 848 let chunk = result?; 849 match chunk { 850 StreamChunk::NonMatch { bytes, .. } => { 851 wtr.write_all(bytes)?; 852 } 853 StreamChunk::Match { bytes, mat } => { 854 replace_with(&mat, bytes, &mut wtr)?; 855 } 856 } 857 } 858 Ok(()) 859 } 860 861 /// Returns the match kind used by this automaton. 862 /// 863 /// # Examples 864 /// 865 /// Basic usage: 866 /// 867 /// ``` 868 /// use aho_corasick::{AhoCorasick, MatchKind}; 869 /// 870 /// let ac = AhoCorasick::new(&[ 871 /// "foo", "bar", "quux", "baz", 872 /// ]); 873 /// assert_eq!(&MatchKind::Standard, ac.match_kind()); 874 /// ``` match_kind(&self) -> &MatchKind875 pub fn match_kind(&self) -> &MatchKind { 876 self.imp.match_kind() 877 } 878 879 /// Returns the length of the longest pattern matched by this automaton. 880 /// 881 /// # Examples 882 /// 883 /// Basic usage: 884 /// 885 /// ``` 886 /// use aho_corasick::AhoCorasick; 887 /// 888 /// let ac = AhoCorasick::new(&[ 889 /// "foo", "bar", "quux", "baz", 890 /// ]); 891 /// assert_eq!(4, ac.max_pattern_len()); 892 /// ``` max_pattern_len(&self) -> usize893 pub fn max_pattern_len(&self) -> usize { 894 self.imp.max_pattern_len() 895 } 896 897 /// Return the total number of patterns matched by this automaton. 898 /// 899 /// This includes patterns that may never participate in a match. For 900 /// example, if 901 /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst) 902 /// match semantics are used, and the patterns `Sam` and `Samwise` were 903 /// used to build the automaton, then `Samwise` can never participate in a 904 /// match because `Sam` will always take priority. 905 /// 906 /// # Examples 907 /// 908 /// Basic usage: 909 /// 910 /// ``` 911 /// use aho_corasick::AhoCorasick; 912 /// 913 /// let ac = AhoCorasick::new(&[ 914 /// "foo", "bar", "baz", 915 /// ]); 916 /// assert_eq!(3, ac.pattern_count()); 917 /// ``` pattern_count(&self) -> usize918 pub fn pattern_count(&self) -> usize { 919 self.imp.pattern_count() 920 } 921 922 /// Returns true if and only if this automaton supports reporting 923 /// overlapping matches. 924 /// 925 /// If this returns false and overlapping matches are requested, then it 926 /// will result in a panic. 927 /// 928 /// Since leftmost matching is inherently incompatible with overlapping 929 /// matches, only 930 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) 931 /// supports overlapping matches. This is unlikely to change in the future. 932 /// 933 /// # Examples 934 /// 935 /// Basic usage: 936 /// 937 /// ``` 938 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 939 /// 940 /// let ac = AhoCorasickBuilder::new() 941 /// .match_kind(MatchKind::Standard) 942 /// .build(&["foo", "bar", "baz"]); 943 /// assert!(ac.supports_overlapping()); 944 /// 945 /// let ac = AhoCorasickBuilder::new() 946 /// .match_kind(MatchKind::LeftmostFirst) 947 /// .build(&["foo", "bar", "baz"]); 948 /// assert!(!ac.supports_overlapping()); 949 /// ``` supports_overlapping(&self) -> bool950 pub fn supports_overlapping(&self) -> bool { 951 self.match_kind.supports_overlapping() 952 } 953 954 /// Returns true if and only if this automaton supports stream searching. 955 /// 956 /// If this returns false and stream searching (or replacing) is attempted, 957 /// then it will result in a panic. 958 /// 959 /// Currently, only 960 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) 961 /// supports streaming. This may be expanded in the future. 962 /// 963 /// # Examples 964 /// 965 /// Basic usage: 966 /// 967 /// ``` 968 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 969 /// 970 /// let ac = AhoCorasickBuilder::new() 971 /// .match_kind(MatchKind::Standard) 972 /// .build(&["foo", "bar", "baz"]); 973 /// assert!(ac.supports_stream()); 974 /// 975 /// let ac = AhoCorasickBuilder::new() 976 /// .match_kind(MatchKind::LeftmostFirst) 977 /// .build(&["foo", "bar", "baz"]); 978 /// assert!(!ac.supports_stream()); 979 /// ``` supports_stream(&self) -> bool980 pub fn supports_stream(&self) -> bool { 981 self.match_kind.supports_stream() 982 } 983 984 /// Returns the approximate total amount of heap used by this automaton, in 985 /// units of bytes. 986 /// 987 /// # Examples 988 /// 989 /// This example shows the difference in heap usage between a few 990 /// configurations: 991 /// 992 /// ```ignore 993 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 994 /// 995 /// let ac = AhoCorasickBuilder::new() 996 /// .dfa(false) // default 997 /// .build(&["foo", "bar", "baz"]); 998 /// assert_eq!(10_336, ac.heap_bytes()); 999 /// 1000 /// let ac = AhoCorasickBuilder::new() 1001 /// .dfa(false) // default 1002 /// .ascii_case_insensitive(true) 1003 /// .build(&["foo", "bar", "baz"]); 1004 /// assert_eq!(10_384, ac.heap_bytes()); 1005 /// 1006 /// let ac = AhoCorasickBuilder::new() 1007 /// .dfa(true) 1008 /// .ascii_case_insensitive(true) 1009 /// .build(&["foo", "bar", "baz"]); 1010 /// assert_eq!(1_248, ac.heap_bytes()); 1011 /// ``` heap_bytes(&self) -> usize1012 pub fn heap_bytes(&self) -> usize { 1013 match self.imp { 1014 Imp::NFA(ref nfa) => nfa.heap_bytes(), 1015 Imp::DFA(ref dfa) => dfa.heap_bytes(), 1016 } 1017 } 1018 } 1019 1020 /// The internal implementation of Aho-Corasick, which is either an NFA or 1021 /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses 1022 /// more memory. 1023 #[derive(Clone, Debug)] 1024 enum Imp<S: StateID> { 1025 NFA(NFA<S>), 1026 DFA(DFA<S>), 1027 } 1028 1029 impl<S: StateID> Imp<S> { 1030 /// Returns the type of match semantics implemented by this automaton. match_kind(&self) -> &MatchKind1031 fn match_kind(&self) -> &MatchKind { 1032 match *self { 1033 Imp::NFA(ref nfa) => nfa.match_kind(), 1034 Imp::DFA(ref dfa) => dfa.match_kind(), 1035 } 1036 } 1037 1038 /// Returns the identifier of the start state. start_state(&self) -> S1039 fn start_state(&self) -> S { 1040 match *self { 1041 Imp::NFA(ref nfa) => nfa.start_state(), 1042 Imp::DFA(ref dfa) => dfa.start_state(), 1043 } 1044 } 1045 1046 /// The length, in bytes, of the longest pattern in this automaton. This 1047 /// information is useful for maintaining correct buffer sizes when 1048 /// searching on streams. max_pattern_len(&self) -> usize1049 fn max_pattern_len(&self) -> usize { 1050 match *self { 1051 Imp::NFA(ref nfa) => nfa.max_pattern_len(), 1052 Imp::DFA(ref dfa) => dfa.max_pattern_len(), 1053 } 1054 } 1055 1056 /// The total number of patterns added to this automaton. This includes 1057 /// patterns that may never match. The maximum matching pattern that can be 1058 /// reported is exactly one less than this number. pattern_count(&self) -> usize1059 fn pattern_count(&self) -> usize { 1060 match *self { 1061 Imp::NFA(ref nfa) => nfa.pattern_count(), 1062 Imp::DFA(ref dfa) => dfa.pattern_count(), 1063 } 1064 } 1065 1066 /// Returns the prefilter object, if one exists, for the underlying 1067 /// automaton. prefilter(&self) -> Option<&dyn Prefilter>1068 fn prefilter(&self) -> Option<&dyn Prefilter> { 1069 match *self { 1070 Imp::NFA(ref nfa) => nfa.prefilter(), 1071 Imp::DFA(ref dfa) => dfa.prefilter(), 1072 } 1073 } 1074 1075 /// Returns true if and only if we should attempt to use a prefilter. use_prefilter(&self) -> bool1076 fn use_prefilter(&self) -> bool { 1077 let p = match self.prefilter() { 1078 None => return false, 1079 Some(p) => p, 1080 }; 1081 !p.looks_for_non_start_of_match() 1082 } 1083 1084 #[inline(always)] overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, match_index: &mut usize, ) -> Option<Match>1085 fn overlapping_find_at( 1086 &self, 1087 prestate: &mut PrefilterState, 1088 haystack: &[u8], 1089 at: usize, 1090 state_id: &mut S, 1091 match_index: &mut usize, 1092 ) -> Option<Match> { 1093 match *self { 1094 Imp::NFA(ref nfa) => nfa.overlapping_find_at( 1095 prestate, 1096 haystack, 1097 at, 1098 state_id, 1099 match_index, 1100 ), 1101 Imp::DFA(ref dfa) => dfa.overlapping_find_at( 1102 prestate, 1103 haystack, 1104 at, 1105 state_id, 1106 match_index, 1107 ), 1108 } 1109 } 1110 1111 #[inline(always)] earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, ) -> Option<Match>1112 fn earliest_find_at( 1113 &self, 1114 prestate: &mut PrefilterState, 1115 haystack: &[u8], 1116 at: usize, 1117 state_id: &mut S, 1118 ) -> Option<Match> { 1119 match *self { 1120 Imp::NFA(ref nfa) => { 1121 nfa.earliest_find_at(prestate, haystack, at, state_id) 1122 } 1123 Imp::DFA(ref dfa) => { 1124 dfa.earliest_find_at(prestate, haystack, at, state_id) 1125 } 1126 } 1127 } 1128 1129 #[inline(always)] find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>1130 fn find_at_no_state( 1131 &self, 1132 prestate: &mut PrefilterState, 1133 haystack: &[u8], 1134 at: usize, 1135 ) -> Option<Match> { 1136 match *self { 1137 Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at), 1138 Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at), 1139 } 1140 } 1141 } 1142 1143 /// An iterator of non-overlapping matches in a particular haystack. 1144 /// 1145 /// This iterator yields matches according to the 1146 /// [`MatchKind`](enum.MatchKind.html) 1147 /// used by this automaton. 1148 /// 1149 /// This iterator is constructed via the 1150 /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter) 1151 /// method. 1152 /// 1153 /// The type variable `S` refers to the representation used for state 1154 /// identifiers. (By default, this is `usize`.) 1155 /// 1156 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. 1157 /// 1158 /// The lifetime `'b` refers to the lifetime of the haystack being searched. 1159 #[derive(Debug)] 1160 pub struct FindIter<'a, 'b, S: StateID> { 1161 fsm: &'a Imp<S>, 1162 prestate: PrefilterState, 1163 haystack: &'b [u8], 1164 pos: usize, 1165 } 1166 1167 impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> { new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S>1168 fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> { 1169 let prestate = PrefilterState::new(ac.max_pattern_len()); 1170 FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 } 1171 } 1172 } 1173 1174 impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> { 1175 type Item = Match; 1176 next(&mut self) -> Option<Match>1177 fn next(&mut self) -> Option<Match> { 1178 if self.pos > self.haystack.len() { 1179 return None; 1180 } 1181 let result = self.fsm.find_at_no_state( 1182 &mut self.prestate, 1183 self.haystack, 1184 self.pos, 1185 ); 1186 let mat = match result { 1187 None => return None, 1188 Some(mat) => mat, 1189 }; 1190 if mat.end() == self.pos { 1191 // If the automaton can match the empty string and if we found an 1192 // empty match, then we need to forcefully move the position. 1193 self.pos += 1; 1194 } else { 1195 self.pos = mat.end(); 1196 } 1197 Some(mat) 1198 } 1199 } 1200 1201 /// An iterator of overlapping matches in a particular haystack. 1202 /// 1203 /// This iterator will report all possible matches in a particular haystack, 1204 /// even when the matches overlap. 1205 /// 1206 /// This iterator is constructed via the 1207 /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter) 1208 /// method. 1209 /// 1210 /// The type variable `S` refers to the representation used for state 1211 /// identifiers. (By default, this is `usize`.) 1212 /// 1213 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. 1214 /// 1215 /// The lifetime `'b` refers to the lifetime of the haystack being searched. 1216 #[derive(Debug)] 1217 pub struct FindOverlappingIter<'a, 'b, S: StateID> { 1218 fsm: &'a Imp<S>, 1219 prestate: PrefilterState, 1220 haystack: &'b [u8], 1221 pos: usize, 1222 last_match_end: usize, 1223 state_id: S, 1224 match_index: usize, 1225 } 1226 1227 impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> { new( ac: &'a AhoCorasick<S>, haystack: &'b [u8], ) -> FindOverlappingIter<'a, 'b, S>1228 fn new( 1229 ac: &'a AhoCorasick<S>, 1230 haystack: &'b [u8], 1231 ) -> FindOverlappingIter<'a, 'b, S> { 1232 assert!( 1233 ac.supports_overlapping(), 1234 "automaton does not support overlapping searches" 1235 ); 1236 let prestate = PrefilterState::new(ac.max_pattern_len()); 1237 FindOverlappingIter { 1238 fsm: &ac.imp, 1239 prestate, 1240 haystack, 1241 pos: 0, 1242 last_match_end: 0, 1243 state_id: ac.imp.start_state(), 1244 match_index: 0, 1245 } 1246 } 1247 } 1248 1249 impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> { 1250 type Item = Match; 1251 next(&mut self) -> Option<Match>1252 fn next(&mut self) -> Option<Match> { 1253 let result = self.fsm.overlapping_find_at( 1254 &mut self.prestate, 1255 self.haystack, 1256 self.pos, 1257 &mut self.state_id, 1258 &mut self.match_index, 1259 ); 1260 match result { 1261 None => return None, 1262 Some(m) => { 1263 self.pos = m.end(); 1264 Some(m) 1265 } 1266 } 1267 } 1268 } 1269 1270 /// An iterator that reports Aho-Corasick matches in a stream. 1271 /// 1272 /// This iterator yields elements of type `io::Result<Match>`, where an error 1273 /// is reported if there was a problem reading from the underlying stream. 1274 /// The iterator terminates only when the underlying stream reaches `EOF`. 1275 /// 1276 /// This iterator is constructed via the 1277 /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter) 1278 /// method. 1279 /// 1280 /// The type variable `R` refers to the `io::Read` stream that is being read 1281 /// from. 1282 /// 1283 /// The type variable `S` refers to the representation used for state 1284 /// identifiers. (By default, this is `usize`.) 1285 /// 1286 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. 1287 #[derive(Debug)] 1288 pub struct StreamFindIter<'a, R, S: StateID> { 1289 it: StreamChunkIter<'a, R, S>, 1290 } 1291 1292 impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> { new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S>1293 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> { 1294 StreamFindIter { it: StreamChunkIter::new(ac, rdr) } 1295 } 1296 } 1297 1298 impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> { 1299 type Item = io::Result<Match>; 1300 next(&mut self) -> Option<io::Result<Match>>1301 fn next(&mut self) -> Option<io::Result<Match>> { 1302 loop { 1303 match self.it.next() { 1304 None => return None, 1305 Some(Err(err)) => return Some(Err(err)), 1306 Some(Ok(StreamChunk::NonMatch { .. })) => {} 1307 Some(Ok(StreamChunk::Match { mat, .. })) => { 1308 return Some(Ok(mat)); 1309 } 1310 } 1311 } 1312 } 1313 } 1314 1315 /// An iterator over chunks in an underlying reader. Each chunk either 1316 /// corresponds to non-matching bytes or matching bytes, but all bytes from 1317 /// the underlying reader are reported in sequence. There may be an arbitrary 1318 /// number of non-matching chunks before seeing a matching chunk. 1319 /// 1320 /// N.B. This does not actually implement Iterator because we need to borrow 1321 /// from the underlying reader. But conceptually, it's still an iterator. 1322 #[derive(Debug)] 1323 struct StreamChunkIter<'a, R, S: StateID> { 1324 /// The AC automaton. 1325 fsm: &'a Imp<S>, 1326 /// State associated with this automaton's prefilter. It is a heuristic 1327 /// for stopping the prefilter if it's deemed ineffective. 1328 prestate: PrefilterState, 1329 /// The source of bytes we read from. 1330 rdr: R, 1331 /// A fixed size buffer. This is what we actually search. There are some 1332 /// invariants around the buffer's size, namely, it must be big enough to 1333 /// contain the longest possible match. 1334 buf: Buffer, 1335 /// The ID of the FSM state we're currently in. 1336 state_id: S, 1337 /// The current position at which to start the next search in `buf`. 1338 search_pos: usize, 1339 /// The absolute position of `search_pos`, where `0` corresponds to the 1340 /// position of the first byte read from `rdr`. 1341 absolute_pos: usize, 1342 /// The ending position of the last StreamChunk that was returned to the 1343 /// caller. This position is used to determine whether we need to emit 1344 /// non-matching bytes before emitting a match. 1345 report_pos: usize, 1346 /// A match that should be reported on the next call. 1347 pending_match: Option<Match>, 1348 /// Enabled only when the automaton can match the empty string. When 1349 /// enabled, we need to execute one final search after consuming the 1350 /// reader to find the trailing empty match. 1351 has_empty_match_at_end: bool, 1352 } 1353 1354 /// A single chunk yielded by the stream chunk iterator. 1355 /// 1356 /// The `'r` lifetime refers to the lifetime of the stream chunk iterator. 1357 #[derive(Debug)] 1358 enum StreamChunk<'r> { 1359 /// A chunk that does not contain any matches. 1360 NonMatch { bytes: &'r [u8], start: usize }, 1361 /// A chunk that precisely contains a match. 1362 Match { bytes: &'r [u8], mat: Match }, 1363 } 1364 1365 impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S>1366 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> { 1367 assert!( 1368 ac.supports_stream(), 1369 "stream searching is only supported for Standard match semantics" 1370 ); 1371 1372 let prestate = if ac.imp.use_prefilter() { 1373 PrefilterState::new(ac.max_pattern_len()) 1374 } else { 1375 PrefilterState::disabled() 1376 }; 1377 let buf = Buffer::new(ac.imp.max_pattern_len()); 1378 let state_id = ac.imp.start_state(); 1379 StreamChunkIter { 1380 fsm: &ac.imp, 1381 prestate, 1382 rdr, 1383 buf, 1384 state_id, 1385 absolute_pos: 0, 1386 report_pos: 0, 1387 search_pos: 0, 1388 pending_match: None, 1389 has_empty_match_at_end: ac.is_match(""), 1390 } 1391 } 1392 next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>>1393 fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> { 1394 loop { 1395 if let Some(mut mat) = self.pending_match.take() { 1396 let bytes = &self.buf.buffer()[mat.start()..mat.end()]; 1397 self.report_pos = mat.end(); 1398 mat = mat.increment(self.absolute_pos); 1399 return Some(Ok(StreamChunk::Match { bytes, mat })); 1400 } 1401 if self.search_pos >= self.buf.len() { 1402 if let Some(end) = self.unreported() { 1403 let bytes = &self.buf.buffer()[self.report_pos..end]; 1404 let start = self.absolute_pos + self.report_pos; 1405 self.report_pos = end; 1406 return Some(Ok(StreamChunk::NonMatch { bytes, start })); 1407 } 1408 if self.buf.len() >= self.buf.min_buffer_len() { 1409 // This is the point at which we roll our buffer, which we 1410 // only do if our buffer has at least the minimum amount of 1411 // bytes in it. Before rolling, we update our various 1412 // positions to be consistent with the buffer after it has 1413 // been rolled. 1414 1415 self.report_pos -= 1416 self.buf.len() - self.buf.min_buffer_len(); 1417 self.absolute_pos += 1418 self.search_pos - self.buf.min_buffer_len(); 1419 self.search_pos = self.buf.min_buffer_len(); 1420 self.buf.roll(); 1421 } 1422 match self.buf.fill(&mut self.rdr) { 1423 Err(err) => return Some(Err(err)), 1424 Ok(false) => { 1425 // We've hit EOF, but if there are still some 1426 // unreported bytes remaining, return them now. 1427 if self.report_pos < self.buf.len() { 1428 let bytes = &self.buf.buffer()[self.report_pos..]; 1429 let start = self.absolute_pos + self.report_pos; 1430 self.report_pos = self.buf.len(); 1431 1432 let chunk = StreamChunk::NonMatch { bytes, start }; 1433 return Some(Ok(chunk)); 1434 } else { 1435 // We've reported everything, but there might still 1436 // be a match at the very last position. 1437 if !self.has_empty_match_at_end { 1438 return None; 1439 } 1440 // fallthrough for another search to get trailing 1441 // empty matches 1442 self.has_empty_match_at_end = false; 1443 } 1444 } 1445 Ok(true) => {} 1446 } 1447 } 1448 let result = self.fsm.earliest_find_at( 1449 &mut self.prestate, 1450 self.buf.buffer(), 1451 self.search_pos, 1452 &mut self.state_id, 1453 ); 1454 match result { 1455 None => { 1456 self.search_pos = self.buf.len(); 1457 } 1458 Some(mat) => { 1459 self.state_id = self.fsm.start_state(); 1460 if mat.end() == self.search_pos { 1461 // If the automaton can match the empty string and if 1462 // we found an empty match, then we need to forcefully 1463 // move the position. 1464 self.search_pos += 1; 1465 } else { 1466 self.search_pos = mat.end(); 1467 } 1468 self.pending_match = Some(mat.clone()); 1469 if self.report_pos < mat.start() { 1470 let bytes = 1471 &self.buf.buffer()[self.report_pos..mat.start()]; 1472 let start = self.absolute_pos + self.report_pos; 1473 self.report_pos = mat.start(); 1474 1475 let chunk = StreamChunk::NonMatch { bytes, start }; 1476 return Some(Ok(chunk)); 1477 } 1478 } 1479 } 1480 } 1481 } 1482 unreported(&self) -> Option<usize>1483 fn unreported(&self) -> Option<usize> { 1484 let end = self.search_pos.saturating_sub(self.buf.min_buffer_len()); 1485 if self.report_pos < end { 1486 Some(end) 1487 } else { 1488 None 1489 } 1490 } 1491 } 1492 1493 /// A builder for configuring an Aho-Corasick automaton. 1494 #[derive(Clone, Debug)] 1495 pub struct AhoCorasickBuilder { 1496 nfa_builder: nfa::Builder, 1497 dfa_builder: dfa::Builder, 1498 dfa: bool, 1499 } 1500 1501 impl Default for AhoCorasickBuilder { default() -> AhoCorasickBuilder1502 fn default() -> AhoCorasickBuilder { 1503 AhoCorasickBuilder::new() 1504 } 1505 } 1506 1507 impl AhoCorasickBuilder { 1508 /// Create a new builder for configuring an Aho-Corasick automaton. 1509 /// 1510 /// If you don't need fine grained configuration or aren't sure which knobs 1511 /// to set, try using 1512 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) 1513 /// instead. new() -> AhoCorasickBuilder1514 pub fn new() -> AhoCorasickBuilder { 1515 AhoCorasickBuilder { 1516 nfa_builder: nfa::Builder::new(), 1517 dfa_builder: dfa::Builder::new(), 1518 dfa: false, 1519 } 1520 } 1521 1522 /// Build an Aho-Corasick automaton using the configuration set on this 1523 /// builder. 1524 /// 1525 /// A builder may be reused to create more automatons. 1526 /// 1527 /// This method will use the default for representing internal state 1528 /// identifiers, which is `usize`. This guarantees that building the 1529 /// automaton will succeed and is generally a good default, but can make 1530 /// the size of the automaton 2-8 times bigger than it needs to be, 1531 /// depending on your target platform. 1532 /// 1533 /// # Examples 1534 /// 1535 /// Basic usage: 1536 /// 1537 /// ``` 1538 /// use aho_corasick::AhoCorasickBuilder; 1539 /// 1540 /// let patterns = &["foo", "bar", "baz"]; 1541 /// let ac = AhoCorasickBuilder::new() 1542 /// .build(patterns); 1543 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1544 /// ``` build<I, P>(&self, patterns: I) -> AhoCorasick where I: IntoIterator<Item = P>, P: AsRef<[u8]>,1545 pub fn build<I, P>(&self, patterns: I) -> AhoCorasick 1546 where 1547 I: IntoIterator<Item = P>, 1548 P: AsRef<[u8]>, 1549 { 1550 // The builder only returns an error if the chosen state ID 1551 // representation is too small to fit all of the given patterns. In 1552 // this case, since we fix the representation to usize, it will always 1553 // work because it's impossible to overflow usize since the underlying 1554 // storage would OOM long before that happens. 1555 self.build_with_size::<usize, I, P>(patterns) 1556 .expect("usize state ID type should always work") 1557 } 1558 1559 /// Build an Aho-Corasick automaton using the configuration set on this 1560 /// builder with a specific state identifier representation. This only has 1561 /// an effect when the `dfa` option is enabled. 1562 /// 1563 /// Generally, the choices for a state identifier representation are 1564 /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default. 1565 /// The advantage of choosing a smaller state identifier representation 1566 /// is that the automaton produced will be smaller. This might be 1567 /// beneficial for just generally using less space, or might even allow it 1568 /// to fit more of the automaton in your CPU's cache, leading to overall 1569 /// better search performance. 1570 /// 1571 /// Unlike the standard `build` method, this can report an error if the 1572 /// state identifier representation cannot support the size of the 1573 /// automaton. 1574 /// 1575 /// Note that the state identifier representation is determined by the 1576 /// `S` type variable. This requires a type hint of some sort, either 1577 /// by specifying the return type or using the turbofish, e.g., 1578 /// `build_with_size::<u16, _, _>(...)`. 1579 /// 1580 /// # Examples 1581 /// 1582 /// Basic usage: 1583 /// 1584 /// ``` 1585 /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder}; 1586 /// 1587 /// # fn example() -> Result<(), ::aho_corasick::Error> { 1588 /// let patterns = &["foo", "bar", "baz"]; 1589 /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new() 1590 /// .build_with_size(patterns)?; 1591 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1592 /// # Ok(()) }; example().unwrap() 1593 /// ``` 1594 /// 1595 /// Or alternatively, with turbofish: 1596 /// 1597 /// ``` 1598 /// use aho_corasick::AhoCorasickBuilder; 1599 /// 1600 /// # fn example() -> Result<(), ::aho_corasick::Error> { 1601 /// let patterns = &["foo", "bar", "baz"]; 1602 /// let ac = AhoCorasickBuilder::new() 1603 /// .build_with_size::<u8, _, _>(patterns)?; 1604 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1605 /// # Ok(()) }; example().unwrap() 1606 /// ``` build_with_size<S, I, P>( &self, patterns: I, ) -> Result<AhoCorasick<S>> where S: StateID, I: IntoIterator<Item = P>, P: AsRef<[u8]>,1607 pub fn build_with_size<S, I, P>( 1608 &self, 1609 patterns: I, 1610 ) -> Result<AhoCorasick<S>> 1611 where 1612 S: StateID, 1613 I: IntoIterator<Item = P>, 1614 P: AsRef<[u8]>, 1615 { 1616 let nfa = self.nfa_builder.build(patterns)?; 1617 let match_kind = nfa.match_kind().clone(); 1618 let imp = if self.dfa { 1619 let dfa = self.dfa_builder.build(&nfa)?; 1620 Imp::DFA(dfa) 1621 } else { 1622 Imp::NFA(nfa) 1623 }; 1624 Ok(AhoCorasick { imp, match_kind }) 1625 } 1626 1627 /// Automatically configure the settings on this builder according to the 1628 /// patterns that will be used to construct the automaton. 1629 /// 1630 /// The idea here is to balance space and time automatically. That is, when 1631 /// searching a small number of patterns, this will attempt to use the 1632 /// fastest possible configuration since the total space required will be 1633 /// small anyway. As the number of patterns grows, this will fall back to 1634 /// slower configurations that use less space. 1635 /// 1636 /// This is guaranteed to never set `match_kind`, but any other option may 1637 /// be overridden. 1638 /// 1639 /// # Examples 1640 /// 1641 /// Basic usage: 1642 /// 1643 /// ``` 1644 /// use aho_corasick::AhoCorasickBuilder; 1645 /// 1646 /// let patterns = &["foo", "bar", "baz"]; 1647 /// let ac = AhoCorasickBuilder::new() 1648 /// .auto_configure(patterns) 1649 /// .build(patterns); 1650 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1651 /// ``` auto_configure<B: AsRef<[u8]>>( &mut self, patterns: &[B], ) -> &mut AhoCorasickBuilder1652 pub fn auto_configure<B: AsRef<[u8]>>( 1653 &mut self, 1654 patterns: &[B], 1655 ) -> &mut AhoCorasickBuilder { 1656 // N.B. Currently we only use the length of `patterns` to make a 1657 // decision here, and could therefore ask for an `ExactSizeIterator` 1658 // instead. But it's conceivable that we might adapt this to look at 1659 // the total number of bytes, which would requires a second pass. 1660 // 1661 // The logic here is fairly rudimentary at the moment, but probably 1662 // OK. The idea here is to use the fastest thing possible for a small 1663 // number of patterns. That is, a DFA with no byte classes, since byte 1664 // classes require an extra indirection for every byte searched. With a 1665 // moderate number of patterns, we still want a DFA, but save on both 1666 // space and compilation time by enabling byte classes. Finally, fall 1667 // back to the slower but smaller NFA. 1668 if patterns.len() <= 100 { 1669 // N.B. Using byte classes can actually be faster by improving 1670 // locality, but this only really applies for multi-megabyte 1671 // automata (i.e., automata that don't fit in your CPU's cache). 1672 self.dfa(true); 1673 } else if patterns.len() <= 5000 { 1674 self.dfa(true); 1675 } 1676 self 1677 } 1678 1679 /// Set the desired match semantics. 1680 /// 1681 /// The default is `MatchKind::Standard`, which corresponds to the match 1682 /// semantics supported by the standard textbook description of the 1683 /// Aho-Corasick algorithm. Namely, matches are reported as soon as they 1684 /// are found. Moreover, this is the only way to get overlapping matches 1685 /// or do stream searching. 1686 /// 1687 /// The other kinds of match semantics that are supported are 1688 /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former 1689 /// corresponds to the match you would get if you were to try to match 1690 /// each pattern at each position in the haystack in the same order that 1691 /// you give to the automaton. That is, it returns the leftmost match 1692 /// corresponding the earliest pattern given to the automaton. The latter 1693 /// corresponds to finding the longest possible match among all leftmost 1694 /// matches. 1695 /// 1696 /// For more details on match semantics, see the 1697 /// [documentation for `MatchKind`](enum.MatchKind.html). 1698 /// 1699 /// # Examples 1700 /// 1701 /// In these examples, we demonstrate the differences between match 1702 /// semantics for a particular set of patterns in a specific order: 1703 /// `b`, `abc`, `abcd`. 1704 /// 1705 /// Standard semantics: 1706 /// 1707 /// ``` 1708 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 1709 /// 1710 /// let patterns = &["b", "abc", "abcd"]; 1711 /// let haystack = "abcd"; 1712 /// 1713 /// let ac = AhoCorasickBuilder::new() 1714 /// .match_kind(MatchKind::Standard) // default, not necessary 1715 /// .build(patterns); 1716 /// let mat = ac.find(haystack).expect("should have a match"); 1717 /// assert_eq!("b", &haystack[mat.start()..mat.end()]); 1718 /// ``` 1719 /// 1720 /// Leftmost-first semantics: 1721 /// 1722 /// ``` 1723 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 1724 /// 1725 /// let patterns = &["b", "abc", "abcd"]; 1726 /// let haystack = "abcd"; 1727 /// 1728 /// let ac = AhoCorasickBuilder::new() 1729 /// .match_kind(MatchKind::LeftmostFirst) 1730 /// .build(patterns); 1731 /// let mat = ac.find(haystack).expect("should have a match"); 1732 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); 1733 /// ``` 1734 /// 1735 /// Leftmost-longest semantics: 1736 /// 1737 /// ``` 1738 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 1739 /// 1740 /// let patterns = &["b", "abc", "abcd"]; 1741 /// let haystack = "abcd"; 1742 /// 1743 /// let ac = AhoCorasickBuilder::new() 1744 /// .match_kind(MatchKind::LeftmostLongest) 1745 /// .build(patterns); 1746 /// let mat = ac.find(haystack).expect("should have a match"); 1747 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); 1748 /// ``` match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder1749 pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { 1750 self.nfa_builder.match_kind(kind); 1751 self 1752 } 1753 1754 /// Enable anchored mode, which requires all matches to start at the 1755 /// first position in a haystack. 1756 /// 1757 /// This option is disabled by default. 1758 /// 1759 /// # Examples 1760 /// 1761 /// Basic usage: 1762 /// 1763 /// ``` 1764 /// use aho_corasick::AhoCorasickBuilder; 1765 /// 1766 /// let patterns = &["foo", "bar"]; 1767 /// let haystack = "foobar"; 1768 /// 1769 /// let ac = AhoCorasickBuilder::new() 1770 /// .anchored(true) 1771 /// .build(patterns); 1772 /// assert_eq!(1, ac.find_iter(haystack).count()); 1773 /// ``` 1774 /// 1775 /// When searching for overlapping matches, all matches that start at 1776 /// the beginning of a haystack will be reported: 1777 /// 1778 /// ``` 1779 /// use aho_corasick::AhoCorasickBuilder; 1780 /// 1781 /// let patterns = &["foo", "foofoo"]; 1782 /// let haystack = "foofoo"; 1783 /// 1784 /// let ac = AhoCorasickBuilder::new() 1785 /// .anchored(true) 1786 /// .build(patterns); 1787 /// assert_eq!(2, ac.find_overlapping_iter(haystack).count()); 1788 /// // A non-anchored search would return 3 matches. 1789 /// ``` anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder1790 pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1791 self.nfa_builder.anchored(yes); 1792 self 1793 } 1794 1795 /// Enable ASCII-aware case insensitive matching. 1796 /// 1797 /// When this option is enabled, searching will be performed without 1798 /// respect to case for ASCII letters (`a-z` and `A-Z`) only. 1799 /// 1800 /// Enabling this option does not change the search algorithm, but it may 1801 /// increase the size of the automaton. 1802 /// 1803 /// **NOTE:** In the future, support for full Unicode case insensitivity 1804 /// may be added, but ASCII case insensitivity is comparatively much 1805 /// simpler to add. 1806 /// 1807 /// # Examples 1808 /// 1809 /// Basic usage: 1810 /// 1811 /// ``` 1812 /// use aho_corasick::AhoCorasickBuilder; 1813 /// 1814 /// let patterns = &["FOO", "bAr", "BaZ"]; 1815 /// let haystack = "foo bar baz"; 1816 /// 1817 /// let ac = AhoCorasickBuilder::new() 1818 /// .ascii_case_insensitive(true) 1819 /// .build(patterns); 1820 /// assert_eq!(3, ac.find_iter(haystack).count()); 1821 /// ``` ascii_case_insensitive( &mut self, yes: bool, ) -> &mut AhoCorasickBuilder1822 pub fn ascii_case_insensitive( 1823 &mut self, 1824 yes: bool, 1825 ) -> &mut AhoCorasickBuilder { 1826 self.nfa_builder.ascii_case_insensitive(yes); 1827 self 1828 } 1829 1830 /// Set the limit on how many NFA states use a dense representation for 1831 /// their transitions. 1832 /// 1833 /// A dense representation uses more space, but supports faster access to 1834 /// transitions at search time. Thus, this setting permits the control of a 1835 /// space vs time trade off when using the NFA variant of Aho-Corasick. 1836 /// 1837 /// This limit is expressed in terms of the depth of a state, i.e., the 1838 /// number of transitions from the starting state of the NFA. The idea is 1839 /// that most of the time searching will be spent near the starting state 1840 /// of the automaton, so states near the start state should use a dense 1841 /// representation. States further away from the start state would then use 1842 /// a sparse representation, which uses less space but is slower to access 1843 /// transitions at search time. 1844 /// 1845 /// By default, this is set to a low but non-zero number. 1846 /// 1847 /// This setting has no effect if the `dfa` option is enabled. dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder1848 pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder { 1849 self.nfa_builder.dense_depth(depth); 1850 self 1851 } 1852 1853 /// Compile the standard Aho-Corasick automaton into a deterministic finite 1854 /// automaton (DFA). 1855 /// 1856 /// When this is disabled (which is the default), then a non-deterministic 1857 /// finite automaton (NFA) is used instead. 1858 /// 1859 /// The main benefit to a DFA is that it can execute searches more quickly 1860 /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the 1861 /// DFA uses more space and can take much longer to build. 1862 /// 1863 /// Enabling this option does not change the time complexity for 1864 /// constructing the Aho-Corasick automaton (which is `O(p)` where 1865 /// `p` is the total number of patterns being compiled). Enabling this 1866 /// option does however reduce the time complexity of non-overlapping 1867 /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the 1868 /// haystack. 1869 /// 1870 /// In general, it's a good idea to enable this if you're searching a 1871 /// small number of fairly short patterns (~1000), or if you want the 1872 /// fastest possible search without regard to compilation time or space 1873 /// usage. dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder1874 pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1875 self.dfa = yes; 1876 self 1877 } 1878 1879 /// Enable heuristic prefilter optimizations. 1880 /// 1881 /// When enabled, searching will attempt to quickly skip to match 1882 /// candidates using specialized literal search routines. A prefilter 1883 /// cannot always be used, and is generally treated as a heuristic. It 1884 /// can be useful to disable this if the prefilter is observed to be 1885 /// sub-optimal for a particular workload. 1886 /// 1887 /// This is enabled by default. prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder1888 pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1889 self.nfa_builder.prefilter(yes); 1890 self 1891 } 1892 1893 /// Shrink the size of the transition alphabet by mapping bytes to their 1894 /// equivalence classes. This only has an effect when the `dfa` option is 1895 /// enabled. 1896 /// 1897 /// When enabled, each a DFA will use a map from all possible bytes 1898 /// to their corresponding equivalence class. Each equivalence class 1899 /// represents a set of bytes that does not discriminate between a match 1900 /// and a non-match in the DFA. For example, the patterns `bar` and `baz` 1901 /// have at least five equivalence classes: singleton sets of `b`, `a`, `r` 1902 /// and `z`, and a final set that contains every other byte. 1903 /// 1904 /// The advantage of this map is that the size of the transition table can 1905 /// be reduced drastically from `#states * 256 * sizeof(id)` to 1906 /// `#states * k * sizeof(id)` where `k` is the number of equivalence 1907 /// classes. As a result, total space usage can decrease substantially. 1908 /// Moreover, since a smaller alphabet is used, compilation becomes faster 1909 /// as well. 1910 /// 1911 /// The disadvantage of this map is that every byte searched must be 1912 /// passed through this map before it can be used to determine the next 1913 /// transition. This has a small match time performance cost. However, if 1914 /// the DFA is otherwise very large without byte classes, then using byte 1915 /// classes can greatly improve memory locality and thus lead to better 1916 /// overall performance. 1917 /// 1918 /// This option is enabled by default. 1919 #[deprecated( 1920 since = "0.7.16", 1921 note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" 1922 )] byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder1923 pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1924 self.dfa_builder.byte_classes(yes); 1925 self 1926 } 1927 1928 /// Premultiply state identifiers in the transition table. This only has 1929 /// an effect when the `dfa` option is enabled. 1930 /// 1931 /// When enabled, state identifiers are premultiplied to point to their 1932 /// corresponding row in the transition table. That is, given the `i`th 1933 /// state, its corresponding premultiplied identifier is `i * k` where `k` 1934 /// is the alphabet size of the automaton. (The alphabet size is at most 1935 /// 256, but is in practice smaller if byte classes is enabled.) 1936 /// 1937 /// When state identifiers are not premultiplied, then the identifier of 1938 /// the `i`th state is `i`. 1939 /// 1940 /// The advantage of premultiplying state identifiers is that is saves a 1941 /// multiplication instruction per byte when searching with a DFA. This has 1942 /// been observed to lead to a 20% performance benefit in micro-benchmarks. 1943 /// 1944 /// The primary disadvantage of premultiplying state identifiers is 1945 /// that they require a larger integer size to represent. For example, 1946 /// if the DFA has 200 states, then its premultiplied form requires 16 1947 /// bits to represent every possible state identifier, where as its 1948 /// non-premultiplied form only requires 8 bits. 1949 /// 1950 /// This option is enabled by default. 1951 #[deprecated( 1952 since = "0.7.16", 1953 note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" 1954 )] premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder1955 pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1956 self.dfa_builder.premultiply(yes); 1957 self 1958 } 1959 } 1960 1961 /// A knob for controlling the match semantics of an Aho-Corasick automaton. 1962 /// 1963 /// There are two generally different ways that Aho-Corasick automatons can 1964 /// report matches. The first way is the "standard" approach that results from 1965 /// implementing most textbook explanations of Aho-Corasick. The second way is 1966 /// to report only the leftmost non-overlapping matches. The leftmost approach 1967 /// is in turn split into two different ways of resolving ambiguous matches: 1968 /// leftmost-first and leftmost-longest. 1969 /// 1970 /// The `Standard` match kind is the default and is the only one that supports 1971 /// overlapping matches and stream searching. (Trying to find overlapping 1972 /// or streaming matches using leftmost match semantics will result in a 1973 /// panic.) The `Standard` match kind will report matches as they are seen. 1974 /// When searching for overlapping matches, then all possible matches are 1975 /// reported. When searching for non-overlapping matches, the first match seen 1976 /// is reported. For example, for non-overlapping matches, given the patterns 1977 /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is 1978 /// reported since it is detected first. The `abcd` match is never reported 1979 /// since it overlaps with the `b` match. 1980 /// 1981 /// In contrast, the leftmost match kind always prefers the leftmost match 1982 /// among all possible matches. Given the same example as above with `abcd` and 1983 /// `b` as patterns and `abcdef` as the subject string, the leftmost match is 1984 /// `abcd` since it begins before the `b` match, even though the `b` match is 1985 /// detected before the `abcd` match. In this case, the `b` match is not 1986 /// reported at all since it overlaps with the `abcd` match. 1987 /// 1988 /// The difference between leftmost-first and leftmost-longest is in how they 1989 /// resolve ambiguous matches when there are multiple leftmost matches to 1990 /// choose from. Leftmost-first always chooses the pattern that was provided 1991 /// earliest, where as leftmost-longest always chooses the longest matching 1992 /// pattern. For example, given the patterns `a` and `ab` and the subject 1993 /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match 1994 /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., 1995 /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches 1996 /// would be `ab`. Stated differently, the leftmost-first match depends on the 1997 /// order in which the patterns were given to the Aho-Corasick automaton. 1998 /// Because of that, when leftmost-first matching is used, if a pattern `A` 1999 /// that appears before a pattern `B` is a prefix of `B`, then it is impossible 2000 /// to ever observe a match of `B`. 2001 /// 2002 /// If you're not sure which match kind to pick, then stick with the standard 2003 /// kind, which is the default. In particular, if you need overlapping or 2004 /// streaming matches, then you _must_ use the standard kind. The leftmost 2005 /// kinds are useful in specific circumstances. For example, leftmost-first can 2006 /// be very useful as a way to implement match priority based on the order of 2007 /// patterns given and leftmost-longest can be useful for dictionary searching 2008 /// such that only the longest matching words are reported. 2009 /// 2010 /// # Relationship with regular expression alternations 2011 /// 2012 /// Understanding match semantics can be a little tricky, and one easy way 2013 /// to conceptualize non-overlapping matches from an Aho-Corasick automaton 2014 /// is to think about them as a simple alternation of literals in a regular 2015 /// expression. For example, let's say we wanted to match the strings 2016 /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It 2017 /// turns out that regular expression engines have two different ways of 2018 /// matching this alternation. The first way, leftmost-longest, is commonly 2019 /// found in POSIX compatible implementations of regular expressions (such as 2020 /// `grep`). The second way, leftmost-first, is commonly found in backtracking 2021 /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's 2022 /// regex engine do not use backtracking, but still implement leftmost-first 2023 /// semantics in an effort to match the behavior of dominant backtracking 2024 /// regex engines such as those found in Perl, Ruby, Python, Javascript and 2025 /// PHP.) 2026 /// 2027 /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex 2028 /// will match `Samwise` because it is the longest possible match, but a 2029 /// Perl-like regex will match `Sam` since it appears earlier in the 2030 /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine 2031 /// will never match `Samwise` since `Sam` will always have higher priority. 2032 /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to 2033 /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is 2034 /// still longest match, but it also appears earlier than `Sam`. 2035 /// 2036 /// The "standard" match semantics of Aho-Corasick generally don't correspond 2037 /// to the match semantics of any large group of regex implementations, so 2038 /// there's no direct analogy that can be made here. Standard match semantics 2039 /// are generally useful for overlapping matches, or if you just want to see 2040 /// matches as they are detected. 2041 /// 2042 /// The main conclusion to draw from this section is that the match semantics 2043 /// can be tweaked to precisely match either Perl-like regex alternations or 2044 /// POSIX regex alternations. 2045 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 2046 pub enum MatchKind { 2047 /// Use standard match semantics, which support overlapping matches. When 2048 /// used with non-overlapping matches, matches are reported as they are 2049 /// seen. 2050 Standard, 2051 /// Use leftmost-first match semantics, which reports leftmost matches. 2052 /// When there are multiple possible leftmost matches, the match 2053 /// corresponding to the pattern that appeared earlier when constructing 2054 /// the automaton is reported. 2055 /// 2056 /// This does **not** support overlapping matches or stream searching. If 2057 /// this match kind is used, attempting to find overlapping matches or 2058 /// stream matches will panic. 2059 LeftmostFirst, 2060 /// Use leftmost-longest match semantics, which reports leftmost matches. 2061 /// When there are multiple possible leftmost matches, the longest match 2062 /// is chosen. 2063 /// 2064 /// This does **not** support overlapping matches or stream searching. If 2065 /// this match kind is used, attempting to find overlapping matches or 2066 /// stream matches will panic. 2067 LeftmostLongest, 2068 /// Hints that destructuring should not be exhaustive. 2069 /// 2070 /// This enum may grow additional variants, so this makes sure clients 2071 /// don't count on exhaustive matching. (Otherwise, adding a new variant 2072 /// could break existing code.) 2073 #[doc(hidden)] 2074 __Nonexhaustive, 2075 } 2076 2077 /// The default match kind is `MatchKind::Standard`. 2078 impl Default for MatchKind { default() -> MatchKind2079 fn default() -> MatchKind { 2080 MatchKind::Standard 2081 } 2082 } 2083 2084 impl MatchKind { supports_overlapping(&self) -> bool2085 fn supports_overlapping(&self) -> bool { 2086 self.is_standard() 2087 } 2088 supports_stream(&self) -> bool2089 fn supports_stream(&self) -> bool { 2090 // TODO: It may be possible to support this. It's hard. 2091 // 2092 // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838 2093 self.is_standard() 2094 } 2095 is_standard(&self) -> bool2096 pub(crate) fn is_standard(&self) -> bool { 2097 *self == MatchKind::Standard 2098 } 2099 is_leftmost(&self) -> bool2100 pub(crate) fn is_leftmost(&self) -> bool { 2101 *self == MatchKind::LeftmostFirst 2102 || *self == MatchKind::LeftmostLongest 2103 } 2104 is_leftmost_first(&self) -> bool2105 pub(crate) fn is_leftmost_first(&self) -> bool { 2106 *self == MatchKind::LeftmostFirst 2107 } 2108 2109 /// Convert this match kind into a packed match kind. If this match kind 2110 /// corresponds to standard semantics, then this returns None, since 2111 /// packed searching does not support standard semantics. as_packed(&self) -> Option<packed::MatchKind>2112 pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> { 2113 match *self { 2114 MatchKind::Standard => None, 2115 MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst), 2116 MatchKind::LeftmostLongest => { 2117 Some(packed::MatchKind::LeftmostLongest) 2118 } 2119 MatchKind::__Nonexhaustive => unreachable!(), 2120 } 2121 } 2122 } 2123 2124 #[cfg(test)] 2125 mod tests { 2126 use super::*; 2127 2128 #[test] oibits()2129 fn oibits() { 2130 use std::panic::{RefUnwindSafe, UnwindSafe}; 2131 2132 fn assert_send<T: Send>() {} 2133 fn assert_sync<T: Sync>() {} 2134 fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {} 2135 2136 assert_send::<AhoCorasick>(); 2137 assert_sync::<AhoCorasick>(); 2138 assert_unwind_safe::<AhoCorasick>(); 2139 assert_send::<AhoCorasickBuilder>(); 2140 assert_sync::<AhoCorasickBuilder>(); 2141 assert_unwind_safe::<AhoCorasickBuilder>(); 2142 } 2143 } 2144