1 use std::io; 2 3 use crate::automaton::Automaton; 4 use crate::buffer::Buffer; 5 use crate::dfa::{self, DFA}; 6 use crate::error::Result; 7 use crate::nfa::{self, NFA}; 8 use crate::packed; 9 use crate::prefilter::{Prefilter, PrefilterState}; 10 use crate::state_id::StateID; 11 use crate::Match; 12 13 /// An automaton for searching multiple strings in linear time. 14 /// 15 /// The `AhoCorasick` type supports a few basic ways of constructing an 16 /// automaton, including 17 /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new) 18 /// and 19 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). 20 /// However, there are a fair number of configurable options that can be set 21 /// by using 22 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) 23 /// instead. Such options include, but are not limited to, how matches are 24 /// determined, simple case insensitivity, whether to use a DFA or not and 25 /// various knobs for controlling the space-vs-time trade offs taken when 26 /// building the automaton. 27 /// 28 /// If you aren't sure where to start, try beginning with 29 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). 30 /// 31 /// # Resource usage 32 /// 33 /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p` 34 /// is the combined length of all patterns being searched. With that said, 35 /// building an automaton can be fairly costly because of high constant 36 /// factors, particularly when enabling the 37 /// [DFA](struct.AhoCorasickBuilder.html#method.dfa) 38 /// option (which is disabled by default). For this reason, it's generally a 39 /// good idea to build an automaton once and reuse it as much as possible. 40 /// 41 /// Aho-Corasick automatons can also use a fair bit of memory. To get a 42 /// concrete idea of how much memory is being used, try using the 43 /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes) 44 /// method. 45 /// 46 /// # Examples 47 /// 48 /// This example shows how to search for occurrences of multiple patterns 49 /// simultaneously in a case insensitive fashion. Each match includes the 50 /// pattern that matched along with the byte offsets of the match. 51 /// 52 /// ``` 53 /// use aho_corasick::AhoCorasickBuilder; 54 /// 55 /// let patterns = &["apple", "maple", "snapple"]; 56 /// let haystack = "Nobody likes maple in their apple flavored Snapple."; 57 /// 58 /// let ac = AhoCorasickBuilder::new() 59 /// .ascii_case_insensitive(true) 60 /// .build(patterns); 61 /// let mut matches = vec![]; 62 /// for mat in ac.find_iter(haystack) { 63 /// matches.push((mat.pattern(), mat.start(), mat.end())); 64 /// } 65 /// assert_eq!(matches, vec![ 66 /// (1, 13, 18), 67 /// (0, 28, 33), 68 /// (2, 43, 50), 69 /// ]); 70 /// ``` 71 /// 72 /// This example shows how to replace matches with some other string: 73 /// 74 /// ``` 75 /// use aho_corasick::AhoCorasick; 76 /// 77 /// let patterns = &["fox", "brown", "quick"]; 78 /// let haystack = "The quick brown fox."; 79 /// let replace_with = &["sloth", "grey", "slow"]; 80 /// 81 /// let ac = AhoCorasick::new(patterns); 82 /// let result = ac.replace_all(haystack, replace_with); 83 /// assert_eq!(result, "The slow grey sloth."); 84 /// ``` 85 #[derive(Clone, Debug)] 86 pub struct AhoCorasick<S: StateID = usize> { 87 imp: Imp<S>, 88 match_kind: MatchKind, 89 } 90 91 impl AhoCorasick { 92 /// Create a new Aho-Corasick automaton using the default configuration. 93 /// 94 /// The default configuration optimizes for less space usage, but at the 95 /// expense of longer search times. To change the configuration, use 96 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) 97 /// for fine-grained control, or 98 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) 99 /// for automatic configuration if you aren't sure which settings to pick. 100 /// 101 /// This uses the default 102 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) 103 /// match semantics, which reports a match as soon as it is found. This 104 /// corresponds to the standard match semantics supported by textbook 105 /// descriptions of the Aho-Corasick algorithm. 106 /// 107 /// # Examples 108 /// 109 /// Basic usage: 110 /// 111 /// ``` 112 /// use aho_corasick::AhoCorasick; 113 /// 114 /// let ac = AhoCorasick::new(&[ 115 /// "foo", "bar", "baz", 116 /// ]); 117 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 118 /// ``` new<I, P>(patterns: I) -> AhoCorasick where I: IntoIterator<Item = P>, P: AsRef<[u8]>,119 pub fn new<I, P>(patterns: I) -> AhoCorasick 120 where 121 I: IntoIterator<Item = P>, 122 P: AsRef<[u8]>, 123 { 124 AhoCorasickBuilder::new().build(patterns) 125 } 126 127 /// Build an Aho-Corasick automaton with an automatically determined 128 /// configuration. 129 /// 130 /// Specifically, this requires a slice of patterns instead of an iterator 131 /// since the configuration is determined by looking at the patterns before 132 /// constructing the automaton. The idea here is to balance space and time 133 /// automatically. That is, when searching a small number of patterns, this 134 /// will attempt to use the fastest possible configuration since the total 135 /// space required will be small anyway. As the number of patterns grows, 136 /// this will fall back to slower configurations that use less space. 137 /// 138 /// If you want auto configuration but with match semantics different from 139 /// the default `MatchKind::Standard`, then use 140 /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure). 141 /// 142 /// # Examples 143 /// 144 /// Basic usage is just like `new`, except you must provide a slice: 145 /// 146 /// ``` 147 /// use aho_corasick::AhoCorasick; 148 /// 149 /// let ac = AhoCorasick::new_auto_configured(&[ 150 /// "foo", "bar", "baz", 151 /// ]); 152 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 153 /// ``` new_auto_configured<B>(patterns: &[B]) -> AhoCorasick where B: AsRef<[u8]>,154 pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick 155 where 156 B: AsRef<[u8]>, 157 { 158 AhoCorasickBuilder::new().auto_configure(patterns).build(patterns) 159 } 160 } 161 162 impl<S: StateID> AhoCorasick<S> { 163 /// Returns true if and only if this automaton matches the haystack at any 164 /// position. 165 /// 166 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 167 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 168 /// `&[u8]` itself. 169 /// 170 /// # Examples 171 /// 172 /// Basic usage: 173 /// 174 /// ``` 175 /// use aho_corasick::AhoCorasick; 176 /// 177 /// let ac = AhoCorasick::new(&[ 178 /// "foo", "bar", "quux", "baz", 179 /// ]); 180 /// assert!(ac.is_match("xxx bar xxx")); 181 /// assert!(!ac.is_match("xxx qux xxx")); 182 /// ``` is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool183 pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool { 184 self.earliest_find(haystack).is_some() 185 } 186 187 /// Returns the location of the first detected match in `haystack`. 188 /// 189 /// This method has the same behavior regardless of the 190 /// [`MatchKind`](enum.MatchKind.html) 191 /// of this automaton. 192 /// 193 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 194 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 195 /// `&[u8]` itself. 196 /// 197 /// # Examples 198 /// 199 /// Basic usage: 200 /// 201 /// ``` 202 /// use aho_corasick::AhoCorasick; 203 /// 204 /// let ac = AhoCorasick::new(&[ 205 /// "abc", "b", 206 /// ]); 207 /// let mat = ac.earliest_find("abcd").expect("should have match"); 208 /// assert_eq!(1, mat.pattern()); 209 /// assert_eq!((1, 2), (mat.start(), mat.end())); 210 /// ``` earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>211 pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { 212 let mut prestate = PrefilterState::new(self.max_pattern_len()); 213 let mut start = self.imp.start_state(); 214 self.imp.earliest_find_at( 215 &mut prestate, 216 haystack.as_ref(), 217 0, 218 &mut start, 219 ) 220 } 221 222 /// Returns the location of the first match according to the match 223 /// semantics that this automaton was constructed with. 224 /// 225 /// When using `MatchKind::Standard`, this corresponds precisely to the 226 /// same behavior as 227 /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find). 228 /// Otherwise, match semantics correspond to either 229 /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) 230 /// or 231 /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest). 232 /// 233 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 234 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 235 /// `&[u8]` itself. 236 /// 237 /// # Examples 238 /// 239 /// Basic usage, with standard semantics: 240 /// 241 /// ``` 242 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 243 /// 244 /// let patterns = &["b", "abc", "abcd"]; 245 /// let haystack = "abcd"; 246 /// 247 /// let ac = AhoCorasickBuilder::new() 248 /// .match_kind(MatchKind::Standard) // default, not necessary 249 /// .build(patterns); 250 /// let mat = ac.find(haystack).expect("should have a match"); 251 /// assert_eq!("b", &haystack[mat.start()..mat.end()]); 252 /// ``` 253 /// 254 /// Now with leftmost-first semantics: 255 /// 256 /// ``` 257 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 258 /// 259 /// let patterns = &["b", "abc", "abcd"]; 260 /// let haystack = "abcd"; 261 /// 262 /// let ac = AhoCorasickBuilder::new() 263 /// .match_kind(MatchKind::LeftmostFirst) 264 /// .build(patterns); 265 /// let mat = ac.find(haystack).expect("should have a match"); 266 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); 267 /// ``` 268 /// 269 /// And finally, leftmost-longest semantics: 270 /// 271 /// ``` 272 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 273 /// 274 /// let patterns = &["b", "abc", "abcd"]; 275 /// let haystack = "abcd"; 276 /// 277 /// let ac = AhoCorasickBuilder::new() 278 /// .match_kind(MatchKind::LeftmostLongest) 279 /// .build(patterns); 280 /// let mat = ac.find(haystack).expect("should have a match"); 281 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); 282 /// ``` find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match>283 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> { 284 let mut prestate = PrefilterState::new(self.max_pattern_len()); 285 self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0) 286 } 287 288 /// Returns an iterator of non-overlapping matches, using the match 289 /// semantics that this automaton was constructed with. 290 /// 291 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 292 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 293 /// `&[u8]` itself. 294 /// 295 /// # Examples 296 /// 297 /// Basic usage, with standard semantics: 298 /// 299 /// ``` 300 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 301 /// 302 /// let patterns = &["append", "appendage", "app"]; 303 /// let haystack = "append the app to the appendage"; 304 /// 305 /// let ac = AhoCorasickBuilder::new() 306 /// .match_kind(MatchKind::Standard) // default, not necessary 307 /// .build(patterns); 308 /// let matches: Vec<usize> = ac 309 /// .find_iter(haystack) 310 /// .map(|mat| mat.pattern()) 311 /// .collect(); 312 /// assert_eq!(vec![2, 2, 2], matches); 313 /// ``` 314 /// 315 /// Now with leftmost-first semantics: 316 /// 317 /// ``` 318 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 319 /// 320 /// let patterns = &["append", "appendage", "app"]; 321 /// let haystack = "append the app to the appendage"; 322 /// 323 /// let ac = AhoCorasickBuilder::new() 324 /// .match_kind(MatchKind::LeftmostFirst) 325 /// .build(patterns); 326 /// let matches: Vec<usize> = ac 327 /// .find_iter(haystack) 328 /// .map(|mat| mat.pattern()) 329 /// .collect(); 330 /// assert_eq!(vec![0, 2, 0], matches); 331 /// ``` 332 /// 333 /// And finally, leftmost-longest semantics: 334 /// 335 /// ``` 336 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 337 /// 338 /// let patterns = &["append", "appendage", "app"]; 339 /// let haystack = "append the app to the appendage"; 340 /// 341 /// let ac = AhoCorasickBuilder::new() 342 /// .match_kind(MatchKind::LeftmostLongest) 343 /// .build(patterns); 344 /// let matches: Vec<usize> = ac 345 /// .find_iter(haystack) 346 /// .map(|mat| mat.pattern()) 347 /// .collect(); 348 /// assert_eq!(vec![0, 2, 1], matches); 349 /// ``` find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b, S>350 pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( 351 &'a self, 352 haystack: &'b B, 353 ) -> FindIter<'a, 'b, S> { 354 FindIter::new(self, haystack.as_ref()) 355 } 356 357 /// Returns an iterator of overlapping matches in the given `haystack`. 358 /// 359 /// Overlapping matches can _only_ be detected using 360 /// `MatchKind::Standard` semantics. If this automaton was constructed with 361 /// leftmost semantics, then this method will panic. To determine whether 362 /// this will panic at runtime, use the 363 /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping) 364 /// method. 365 /// 366 /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. 367 /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and 368 /// `&[u8]` itself. 369 /// 370 /// # Panics 371 /// 372 /// This panics when `AhoCorasick::supports_overlapping` returns `false`. 373 /// That is, this panics when this automaton's match semantics are not 374 /// `MatchKind::Standard`. 375 /// 376 /// # Examples 377 /// 378 /// Basic usage, with standard semantics: 379 /// 380 /// ``` 381 /// use aho_corasick::AhoCorasick; 382 /// 383 /// let patterns = &["append", "appendage", "app"]; 384 /// let haystack = "append the app to the appendage"; 385 /// 386 /// let ac = AhoCorasick::new(patterns); 387 /// let matches: Vec<usize> = ac 388 /// .find_overlapping_iter(haystack) 389 /// .map(|mat| mat.pattern()) 390 /// .collect(); 391 /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches); 392 /// ``` find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindOverlappingIter<'a, 'b, S>393 pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( 394 &'a self, 395 haystack: &'b B, 396 ) -> FindOverlappingIter<'a, 'b, S> { 397 FindOverlappingIter::new(self, haystack.as_ref()) 398 } 399 400 /// Replace all matches with a corresponding value in the `replace_with` 401 /// slice given. Matches correspond to the same matches as reported by 402 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 403 /// 404 /// Replacements are determined by the index of the matching pattern. 405 /// For example, if the pattern with index `2` is found, then it is 406 /// replaced by `replace_with[2]`. 407 /// 408 /// # Panics 409 /// 410 /// This panics when `replace_with.len()` does not equal the total number 411 /// of patterns that are matched by this automaton. 412 /// 413 /// # Examples 414 /// 415 /// Basic usage: 416 /// 417 /// ``` 418 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 419 /// 420 /// let patterns = &["append", "appendage", "app"]; 421 /// let haystack = "append the app to the appendage"; 422 /// 423 /// let ac = AhoCorasickBuilder::new() 424 /// .match_kind(MatchKind::LeftmostFirst) 425 /// .build(patterns); 426 /// let result = ac.replace_all(haystack, &["x", "y", "z"]); 427 /// assert_eq!("x the z to the xage", result); 428 /// ``` replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String where B: AsRef<str>,429 pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String 430 where 431 B: AsRef<str>, 432 { 433 assert_eq!( 434 replace_with.len(), 435 self.pattern_count(), 436 "replace_all requires a replacement for every pattern \ 437 in the automaton" 438 ); 439 let mut dst = String::with_capacity(haystack.len()); 440 self.replace_all_with(haystack, &mut dst, |mat, _, dst| { 441 dst.push_str(replace_with[mat.pattern()].as_ref()); 442 true 443 }); 444 dst 445 } 446 447 /// Replace all matches using raw bytes with a corresponding value in the 448 /// `replace_with` slice given. Matches correspond to the same matches as 449 /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter). 450 /// 451 /// Replacements are determined by the index of the matching pattern. 452 /// For example, if the pattern with index `2` is found, then it is 453 /// replaced by `replace_with[2]`. 454 /// 455 /// # Panics 456 /// 457 /// This panics when `replace_with.len()` does not equal the total number 458 /// of patterns that are matched by this automaton. 459 /// 460 /// # Examples 461 /// 462 /// Basic usage: 463 /// 464 /// ``` 465 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 466 /// 467 /// let patterns = &["append", "appendage", "app"]; 468 /// let haystack = b"append the app to the appendage"; 469 /// 470 /// let ac = AhoCorasickBuilder::new() 471 /// .match_kind(MatchKind::LeftmostFirst) 472 /// .build(patterns); 473 /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); 474 /// assert_eq!(b"x the z to the xage".to_vec(), result); 475 /// ``` replace_all_bytes<B>( &self, haystack: &[u8], replace_with: &[B], ) -> Vec<u8> where B: AsRef<[u8]>,476 pub fn replace_all_bytes<B>( 477 &self, 478 haystack: &[u8], 479 replace_with: &[B], 480 ) -> Vec<u8> 481 where 482 B: AsRef<[u8]>, 483 { 484 assert_eq!( 485 replace_with.len(), 486 self.pattern_count(), 487 "replace_all_bytes requires a replacement for every pattern \ 488 in the automaton" 489 ); 490 let mut dst = Vec::with_capacity(haystack.len()); 491 self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { 492 dst.extend(replace_with[mat.pattern()].as_ref()); 493 true 494 }); 495 dst 496 } 497 498 /// Replace all matches using a closure called on each match. 499 /// Matches correspond to the same matches as reported by 500 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 501 /// 502 /// The closure accepts three parameters: the match found, the text of 503 /// the match and a string buffer with which to write the replaced text 504 /// (if any). If the closure returns `true`, then it continues to the next 505 /// match. If the closure returns `false`, then searching is stopped. 506 /// 507 /// # Examples 508 /// 509 /// Basic usage: 510 /// 511 /// ``` 512 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 513 /// 514 /// let patterns = &["append", "appendage", "app"]; 515 /// let haystack = "append the app to the appendage"; 516 /// 517 /// let ac = AhoCorasickBuilder::new() 518 /// .match_kind(MatchKind::LeftmostFirst) 519 /// .build(patterns); 520 /// let mut result = String::new(); 521 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { 522 /// dst.push_str(&mat.pattern().to_string()); 523 /// true 524 /// }); 525 /// assert_eq!("0 the 2 to the 0age", result); 526 /// ``` 527 /// 528 /// Stopping the replacement by returning `false` (continued from the 529 /// example above): 530 /// 531 /// ``` 532 /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; 533 /// # let patterns = &["append", "appendage", "app"]; 534 /// # let haystack = "append the app to the appendage"; 535 /// # let ac = AhoCorasickBuilder::new() 536 /// # .match_kind(MatchKind::LeftmostFirst) 537 /// # .build(patterns); 538 /// let mut result = String::new(); 539 /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { 540 /// dst.push_str(&mat.pattern().to_string()); 541 /// mat.pattern() != 2 542 /// }); 543 /// assert_eq!("0 the 2 to the appendage", result); 544 /// ``` replace_all_with<F>( &self, haystack: &str, dst: &mut String, mut replace_with: F, ) where F: FnMut(&Match, &str, &mut String) -> bool,545 pub fn replace_all_with<F>( 546 &self, 547 haystack: &str, 548 dst: &mut String, 549 mut replace_with: F, 550 ) where 551 F: FnMut(&Match, &str, &mut String) -> bool, 552 { 553 let mut last_match = 0; 554 for mat in self.find_iter(haystack) { 555 dst.push_str(&haystack[last_match..mat.start()]); 556 last_match = mat.end(); 557 if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { 558 break; 559 }; 560 } 561 dst.push_str(&haystack[last_match..]); 562 } 563 564 /// Replace all matches using raw bytes with a closure called on each 565 /// match. Matches correspond to the same matches as reported by 566 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 567 /// 568 /// The closure accepts three parameters: the match found, the text of 569 /// the match and a byte buffer with which to write the replaced text 570 /// (if any). If the closure returns `true`, then it continues to the next 571 /// match. If the closure returns `false`, then searching is stopped. 572 /// 573 /// # Examples 574 /// 575 /// Basic usage: 576 /// 577 /// ``` 578 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 579 /// 580 /// let patterns = &["append", "appendage", "app"]; 581 /// let haystack = b"append the app to the appendage"; 582 /// 583 /// let ac = AhoCorasickBuilder::new() 584 /// .match_kind(MatchKind::LeftmostFirst) 585 /// .build(patterns); 586 /// let mut result = vec![]; 587 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { 588 /// dst.extend(mat.pattern().to_string().bytes()); 589 /// true 590 /// }); 591 /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); 592 /// ``` 593 /// 594 /// Stopping the replacement by returning `false` (continued from the 595 /// example above): 596 /// 597 /// ``` 598 /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; 599 /// # let patterns = &["append", "appendage", "app"]; 600 /// # let haystack = b"append the app to the appendage"; 601 /// # let ac = AhoCorasickBuilder::new() 602 /// # .match_kind(MatchKind::LeftmostFirst) 603 /// # .build(patterns); 604 /// let mut result = vec![]; 605 /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { 606 /// dst.extend(mat.pattern().to_string().bytes()); 607 /// mat.pattern() != 2 608 /// }); 609 /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); 610 /// ``` replace_all_with_bytes<F>( &self, haystack: &[u8], dst: &mut Vec<u8>, mut replace_with: F, ) where F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,611 pub fn replace_all_with_bytes<F>( 612 &self, 613 haystack: &[u8], 614 dst: &mut Vec<u8>, 615 mut replace_with: F, 616 ) where 617 F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, 618 { 619 let mut last_match = 0; 620 for mat in self.find_iter(haystack) { 621 dst.extend(&haystack[last_match..mat.start()]); 622 last_match = mat.end(); 623 if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { 624 break; 625 }; 626 } 627 dst.extend(&haystack[last_match..]); 628 } 629 630 /// Returns an iterator of non-overlapping matches in the given 631 /// stream. Matches correspond to the same matches as reported by 632 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 633 /// 634 /// The matches yielded by this iterator use absolute position offsets in 635 /// the stream given, where the first byte has index `0`. Matches are 636 /// yieled until the stream is exhausted. 637 /// 638 /// Each item yielded by the iterator is an `io::Result<Match>`, where an 639 /// error is yielded if there was a problem reading from the reader given. 640 /// 641 /// When searching a stream, an internal buffer is used. Therefore, callers 642 /// should avoiding providing a buffered reader, if possible. 643 /// 644 /// Searching a stream requires that the automaton was built with 645 /// `MatchKind::Standard` semantics. If this automaton was constructed 646 /// with leftmost semantics, then this method will panic. To determine 647 /// whether this will panic at runtime, use the 648 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) 649 /// method. 650 /// 651 /// # Memory usage 652 /// 653 /// In general, searching streams will use a constant amount of memory for 654 /// its internal buffer. The one requirement is that the internal buffer 655 /// must be at least the size of the longest possible match. In most use 656 /// cases, the default buffer size will be much larger than any individual 657 /// match. 658 /// 659 /// # Panics 660 /// 661 /// This panics when `AhoCorasick::supports_stream` returns `false`. 662 /// That is, this panics when this automaton's match semantics are not 663 /// `MatchKind::Standard`. This restriction may be lifted in the future. 664 /// 665 /// # Examples 666 /// 667 /// Basic usage: 668 /// 669 /// ``` 670 /// use aho_corasick::AhoCorasick; 671 /// 672 /// # fn example() -> Result<(), ::std::io::Error> { 673 /// let patterns = &["append", "appendage", "app"]; 674 /// let haystack = "append the app to the appendage"; 675 /// 676 /// let ac = AhoCorasick::new(patterns); 677 /// let mut matches = vec![]; 678 /// for result in ac.stream_find_iter(haystack.as_bytes()) { 679 /// let mat = result?; 680 /// matches.push(mat.pattern()); 681 /// } 682 /// assert_eq!(vec![2, 2, 2], matches); 683 /// # Ok(()) }; example().unwrap() 684 /// ``` stream_find_iter<'a, R: io::Read>( &'a self, rdr: R, ) -> StreamFindIter<'a, R, S>685 pub fn stream_find_iter<'a, R: io::Read>( 686 &'a self, 687 rdr: R, 688 ) -> StreamFindIter<'a, R, S> { 689 StreamFindIter::new(self, rdr) 690 } 691 692 /// Search for and replace all matches of this automaton in 693 /// the given reader, and write the replacements to the given 694 /// writer. Matches correspond to the same matches as reported by 695 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 696 /// 697 /// Replacements are determined by the index of the matching pattern. 698 /// For example, if the pattern with index `2` is found, then it is 699 /// replaced by `replace_with[2]`. 700 /// 701 /// After all matches are replaced, the writer is _not_ flushed. 702 /// 703 /// If there was a problem reading from the given reader or writing to the 704 /// given writer, then the corresponding `io::Error` is returned and all 705 /// replacement is stopped. 706 /// 707 /// When searching a stream, an internal buffer is used. Therefore, callers 708 /// should avoiding providing a buffered reader, if possible. However, 709 /// callers may want to provide a buffered writer. 710 /// 711 /// Searching a stream requires that the automaton was built with 712 /// `MatchKind::Standard` semantics. If this automaton was constructed 713 /// with leftmost semantics, then this method will panic. To determine 714 /// whether this will panic at runtime, use the 715 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) 716 /// method. 717 /// 718 /// # Memory usage 719 /// 720 /// In general, searching streams will use a constant amount of memory for 721 /// its internal buffer. The one requirement is that the internal buffer 722 /// must be at least the size of the longest possible match. In most use 723 /// cases, the default buffer size will be much larger than any individual 724 /// match. 725 /// 726 /// # Panics 727 /// 728 /// This panics when `AhoCorasick::supports_stream` returns `false`. 729 /// That is, this panics when this automaton's match semantics are not 730 /// `MatchKind::Standard`. This restriction may be lifted in the future. 731 /// 732 /// # Examples 733 /// 734 /// Basic usage: 735 /// 736 /// ``` 737 /// use aho_corasick::AhoCorasick; 738 /// 739 /// # fn example() -> Result<(), ::std::io::Error> { 740 /// let patterns = &["fox", "brown", "quick"]; 741 /// let haystack = "The quick brown fox."; 742 /// let replace_with = &["sloth", "grey", "slow"]; 743 /// 744 /// let ac = AhoCorasick::new(patterns); 745 /// let mut result = vec![]; 746 /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?; 747 /// assert_eq!(b"The slow grey sloth.".to_vec(), result); 748 /// # Ok(()) }; example().unwrap() 749 /// ``` stream_replace_all<R, W, B>( &self, rdr: R, wtr: W, replace_with: &[B], ) -> io::Result<()> where R: io::Read, W: io::Write, B: AsRef<[u8]>,750 pub fn stream_replace_all<R, W, B>( 751 &self, 752 rdr: R, 753 wtr: W, 754 replace_with: &[B], 755 ) -> io::Result<()> 756 where 757 R: io::Read, 758 W: io::Write, 759 B: AsRef<[u8]>, 760 { 761 assert_eq!( 762 replace_with.len(), 763 self.pattern_count(), 764 "stream_replace_all requires a replacement for every pattern \ 765 in the automaton" 766 ); 767 self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| { 768 wtr.write_all(replace_with[mat.pattern()].as_ref()) 769 }) 770 } 771 772 /// Search the given reader and replace all matches of this automaton 773 /// using the given closure. The result is written to the given 774 /// writer. Matches correspond to the same matches as reported by 775 /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). 776 /// 777 /// The closure accepts three parameters: the match found, the text of 778 /// the match and the writer with which to write the replaced text (if any). 779 /// 780 /// After all matches are replaced, the writer is _not_ flushed. 781 /// 782 /// If there was a problem reading from the given reader or writing to the 783 /// given writer, then the corresponding `io::Error` is returned and all 784 /// replacement is stopped. 785 /// 786 /// When searching a stream, an internal buffer is used. Therefore, callers 787 /// should avoiding providing a buffered reader, if possible. However, 788 /// callers may want to provide a buffered writer. 789 /// 790 /// Searching a stream requires that the automaton was built with 791 /// `MatchKind::Standard` semantics. If this automaton was constructed 792 /// with leftmost semantics, then this method will panic. To determine 793 /// whether this will panic at runtime, use the 794 /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) 795 /// method. 796 /// 797 /// # Memory usage 798 /// 799 /// In general, searching streams will use a constant amount of memory for 800 /// its internal buffer. The one requirement is that the internal buffer 801 /// must be at least the size of the longest possible match. In most use 802 /// cases, the default buffer size will be much larger than any individual 803 /// match. 804 /// 805 /// # Panics 806 /// 807 /// This panics when `AhoCorasick::supports_stream` returns `false`. 808 /// That is, this panics when this automaton's match semantics are not 809 /// `MatchKind::Standard`. This restriction may be lifted in the future. 810 /// 811 /// # Examples 812 /// 813 /// Basic usage: 814 /// 815 /// ``` 816 /// use std::io::Write; 817 /// use aho_corasick::AhoCorasick; 818 /// 819 /// # fn example() -> Result<(), ::std::io::Error> { 820 /// let patterns = &["fox", "brown", "quick"]; 821 /// let haystack = "The quick brown fox."; 822 /// 823 /// let ac = AhoCorasick::new(patterns); 824 /// let mut result = vec![]; 825 /// ac.stream_replace_all_with( 826 /// haystack.as_bytes(), 827 /// &mut result, 828 /// |mat, _, wtr| { 829 /// wtr.write_all(mat.pattern().to_string().as_bytes()) 830 /// }, 831 /// )?; 832 /// assert_eq!(b"The 2 1 0.".to_vec(), result); 833 /// # Ok(()) }; example().unwrap() 834 /// ``` stream_replace_all_with<R, W, F>( &self, rdr: R, mut wtr: W, mut replace_with: F, ) -> io::Result<()> where R: io::Read, W: io::Write, F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,835 pub fn stream_replace_all_with<R, W, F>( 836 &self, 837 rdr: R, 838 mut wtr: W, 839 mut replace_with: F, 840 ) -> io::Result<()> 841 where 842 R: io::Read, 843 W: io::Write, 844 F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>, 845 { 846 let mut it = StreamChunkIter::new(self, rdr); 847 while let Some(result) = it.next() { 848 let chunk = result?; 849 match chunk { 850 StreamChunk::NonMatch { bytes, .. } => { 851 wtr.write_all(bytes)?; 852 } 853 StreamChunk::Match { bytes, mat } => { 854 replace_with(&mat, bytes, &mut wtr)?; 855 } 856 } 857 } 858 Ok(()) 859 } 860 861 /// Returns the match kind used by this automaton. 862 /// 863 /// # Examples 864 /// 865 /// Basic usage: 866 /// 867 /// ``` 868 /// use aho_corasick::{AhoCorasick, MatchKind}; 869 /// 870 /// let ac = AhoCorasick::new(&[ 871 /// "foo", "bar", "quux", "baz", 872 /// ]); 873 /// assert_eq!(&MatchKind::Standard, ac.match_kind()); 874 /// ``` match_kind(&self) -> &MatchKind875 pub fn match_kind(&self) -> &MatchKind { 876 self.imp.match_kind() 877 } 878 879 /// Returns the length of the longest pattern matched by this automaton. 880 /// 881 /// # Examples 882 /// 883 /// Basic usage: 884 /// 885 /// ``` 886 /// use aho_corasick::AhoCorasick; 887 /// 888 /// let ac = AhoCorasick::new(&[ 889 /// "foo", "bar", "quux", "baz", 890 /// ]); 891 /// assert_eq!(4, ac.max_pattern_len()); 892 /// ``` max_pattern_len(&self) -> usize893 pub fn max_pattern_len(&self) -> usize { 894 self.imp.max_pattern_len() 895 } 896 897 /// Return the total number of patterns matched by this automaton. 898 /// 899 /// This includes patterns that may never participate in a match. For 900 /// example, if 901 /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst) 902 /// match semantics are used, and the patterns `Sam` and `Samwise` were 903 /// used to build the automaton, then `Samwise` can never participate in a 904 /// match because `Sam` will always take priority. 905 /// 906 /// # Examples 907 /// 908 /// Basic usage: 909 /// 910 /// ``` 911 /// use aho_corasick::AhoCorasick; 912 /// 913 /// let ac = AhoCorasick::new(&[ 914 /// "foo", "bar", "baz", 915 /// ]); 916 /// assert_eq!(3, ac.pattern_count()); 917 /// ``` pattern_count(&self) -> usize918 pub fn pattern_count(&self) -> usize { 919 self.imp.pattern_count() 920 } 921 922 /// Returns true if and only if this automaton supports reporting 923 /// overlapping matches. 924 /// 925 /// If this returns false and overlapping matches are requested, then it 926 /// will result in a panic. 927 /// 928 /// Since leftmost matching is inherently incompatible with overlapping 929 /// matches, only 930 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) 931 /// supports overlapping matches. This is unlikely to change in the future. 932 /// 933 /// # Examples 934 /// 935 /// Basic usage: 936 /// 937 /// ``` 938 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 939 /// 940 /// let ac = AhoCorasickBuilder::new() 941 /// .match_kind(MatchKind::Standard) 942 /// .build(&["foo", "bar", "baz"]); 943 /// assert!(ac.supports_overlapping()); 944 /// 945 /// let ac = AhoCorasickBuilder::new() 946 /// .match_kind(MatchKind::LeftmostFirst) 947 /// .build(&["foo", "bar", "baz"]); 948 /// assert!(!ac.supports_overlapping()); 949 /// ``` supports_overlapping(&self) -> bool950 pub fn supports_overlapping(&self) -> bool { 951 self.match_kind.supports_overlapping() 952 } 953 954 /// Returns true if and only if this automaton supports stream searching. 955 /// 956 /// If this returns false and stream searching (or replacing) is attempted, 957 /// then it will result in a panic. 958 /// 959 /// Currently, only 960 /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) 961 /// supports streaming. This may be expanded in the future. 962 /// 963 /// # Examples 964 /// 965 /// Basic usage: 966 /// 967 /// ``` 968 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 969 /// 970 /// let ac = AhoCorasickBuilder::new() 971 /// .match_kind(MatchKind::Standard) 972 /// .build(&["foo", "bar", "baz"]); 973 /// assert!(ac.supports_stream()); 974 /// 975 /// let ac = AhoCorasickBuilder::new() 976 /// .match_kind(MatchKind::LeftmostFirst) 977 /// .build(&["foo", "bar", "baz"]); 978 /// assert!(!ac.supports_stream()); 979 /// ``` supports_stream(&self) -> bool980 pub fn supports_stream(&self) -> bool { 981 self.match_kind.supports_stream() 982 } 983 984 /// Returns the approximate total amount of heap used by this automaton, in 985 /// units of bytes. 986 /// 987 /// # Examples 988 /// 989 /// This example shows the difference in heap usage between a few 990 /// configurations: 991 /// 992 /// ```ignore 993 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 994 /// 995 /// let ac = AhoCorasickBuilder::new() 996 /// .dfa(false) // default 997 /// .build(&["foo", "bar", "baz"]); 998 /// assert_eq!(10_336, ac.heap_bytes()); 999 /// 1000 /// let ac = AhoCorasickBuilder::new() 1001 /// .dfa(false) // default 1002 /// .ascii_case_insensitive(true) 1003 /// .build(&["foo", "bar", "baz"]); 1004 /// assert_eq!(10_384, ac.heap_bytes()); 1005 /// 1006 /// let ac = AhoCorasickBuilder::new() 1007 /// .dfa(true) 1008 /// .ascii_case_insensitive(true) 1009 /// .build(&["foo", "bar", "baz"]); 1010 /// assert_eq!(1_248, ac.heap_bytes()); 1011 /// ``` heap_bytes(&self) -> usize1012 pub fn heap_bytes(&self) -> usize { 1013 match self.imp { 1014 Imp::NFA(ref nfa) => nfa.heap_bytes(), 1015 Imp::DFA(ref dfa) => dfa.heap_bytes(), 1016 } 1017 } 1018 } 1019 1020 /// The internal implementation of Aho-Corasick, which is either an NFA or 1021 /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses 1022 /// more memory. 1023 #[derive(Clone, Debug)] 1024 enum Imp<S: StateID> { 1025 NFA(NFA<S>), 1026 DFA(DFA<S>), 1027 } 1028 1029 impl<S: StateID> Imp<S> { 1030 /// Returns the type of match semantics implemented by this automaton. match_kind(&self) -> &MatchKind1031 fn match_kind(&self) -> &MatchKind { 1032 match *self { 1033 Imp::NFA(ref nfa) => nfa.match_kind(), 1034 Imp::DFA(ref dfa) => dfa.match_kind(), 1035 } 1036 } 1037 1038 /// Returns the identifier of the start state. start_state(&self) -> S1039 fn start_state(&self) -> S { 1040 match *self { 1041 Imp::NFA(ref nfa) => nfa.start_state(), 1042 Imp::DFA(ref dfa) => dfa.start_state(), 1043 } 1044 } 1045 1046 /// The length, in bytes, of the longest pattern in this automaton. This 1047 /// information is useful for maintaining correct buffer sizes when 1048 /// searching on streams. max_pattern_len(&self) -> usize1049 fn max_pattern_len(&self) -> usize { 1050 match *self { 1051 Imp::NFA(ref nfa) => nfa.max_pattern_len(), 1052 Imp::DFA(ref dfa) => dfa.max_pattern_len(), 1053 } 1054 } 1055 1056 /// The total number of patterns added to this automaton. This includes 1057 /// patterns that may never match. The maximum matching pattern that can be 1058 /// reported is exactly one less than this number. pattern_count(&self) -> usize1059 fn pattern_count(&self) -> usize { 1060 match *self { 1061 Imp::NFA(ref nfa) => nfa.pattern_count(), 1062 Imp::DFA(ref dfa) => dfa.pattern_count(), 1063 } 1064 } 1065 1066 /// Returns the prefilter object, if one exists, for the underlying 1067 /// automaton. prefilter(&self) -> Option<&dyn Prefilter>1068 fn prefilter(&self) -> Option<&dyn Prefilter> { 1069 match *self { 1070 Imp::NFA(ref nfa) => nfa.prefilter(), 1071 Imp::DFA(ref dfa) => dfa.prefilter(), 1072 } 1073 } 1074 1075 /// Returns true if and only if we should attempt to use a prefilter. use_prefilter(&self) -> bool1076 fn use_prefilter(&self) -> bool { 1077 let p = match self.prefilter() { 1078 None => return false, 1079 Some(p) => p, 1080 }; 1081 !p.looks_for_non_start_of_match() 1082 } 1083 1084 #[inline(always)] overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, match_index: &mut usize, ) -> Option<Match>1085 fn overlapping_find_at( 1086 &self, 1087 prestate: &mut PrefilterState, 1088 haystack: &[u8], 1089 at: usize, 1090 state_id: &mut S, 1091 match_index: &mut usize, 1092 ) -> Option<Match> { 1093 match *self { 1094 Imp::NFA(ref nfa) => nfa.overlapping_find_at( 1095 prestate, 1096 haystack, 1097 at, 1098 state_id, 1099 match_index, 1100 ), 1101 Imp::DFA(ref dfa) => dfa.overlapping_find_at( 1102 prestate, 1103 haystack, 1104 at, 1105 state_id, 1106 match_index, 1107 ), 1108 } 1109 } 1110 1111 #[inline(always)] earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, ) -> Option<Match>1112 fn earliest_find_at( 1113 &self, 1114 prestate: &mut PrefilterState, 1115 haystack: &[u8], 1116 at: usize, 1117 state_id: &mut S, 1118 ) -> Option<Match> { 1119 match *self { 1120 Imp::NFA(ref nfa) => { 1121 nfa.earliest_find_at(prestate, haystack, at, state_id) 1122 } 1123 Imp::DFA(ref dfa) => { 1124 dfa.earliest_find_at(prestate, haystack, at, state_id) 1125 } 1126 } 1127 } 1128 1129 #[inline(always)] find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>1130 fn find_at_no_state( 1131 &self, 1132 prestate: &mut PrefilterState, 1133 haystack: &[u8], 1134 at: usize, 1135 ) -> Option<Match> { 1136 match *self { 1137 Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at), 1138 Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at), 1139 } 1140 } 1141 } 1142 1143 /// An iterator of non-overlapping matches in a particular haystack. 1144 /// 1145 /// This iterator yields matches according to the 1146 /// [`MatchKind`](enum.MatchKind.html) 1147 /// used by this automaton. 1148 /// 1149 /// This iterator is constructed via the 1150 /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter) 1151 /// method. 1152 /// 1153 /// The type variable `S` refers to the representation used for state 1154 /// identifiers. (By default, this is `usize`.) 1155 /// 1156 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. 1157 /// 1158 /// The lifetime `'b` refers to the lifetime of the haystack being searched. 1159 #[derive(Debug)] 1160 pub struct FindIter<'a, 'b, S: StateID> { 1161 fsm: &'a Imp<S>, 1162 prestate: PrefilterState, 1163 haystack: &'b [u8], 1164 pos: usize, 1165 } 1166 1167 impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> { new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S>1168 fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> { 1169 let prestate = PrefilterState::new(ac.max_pattern_len()); 1170 FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 } 1171 } 1172 } 1173 1174 impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> { 1175 type Item = Match; 1176 next(&mut self) -> Option<Match>1177 fn next(&mut self) -> Option<Match> { 1178 if self.pos > self.haystack.len() { 1179 return None; 1180 } 1181 let result = self.fsm.find_at_no_state( 1182 &mut self.prestate, 1183 self.haystack, 1184 self.pos, 1185 ); 1186 let mat = match result { 1187 None => return None, 1188 Some(mat) => mat, 1189 }; 1190 if mat.end() == self.pos { 1191 // If the automaton can match the empty string and if we found an 1192 // empty match, then we need to forcefully move the position. 1193 self.pos += 1; 1194 } else { 1195 self.pos = mat.end(); 1196 } 1197 Some(mat) 1198 } 1199 } 1200 1201 /// An iterator of overlapping matches in a particular haystack. 1202 /// 1203 /// This iterator will report all possible matches in a particular haystack, 1204 /// even when the matches overlap. 1205 /// 1206 /// This iterator is constructed via the 1207 /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter) 1208 /// method. 1209 /// 1210 /// The type variable `S` refers to the representation used for state 1211 /// identifiers. (By default, this is `usize`.) 1212 /// 1213 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. 1214 /// 1215 /// The lifetime `'b` refers to the lifetime of the haystack being searched. 1216 #[derive(Debug)] 1217 pub struct FindOverlappingIter<'a, 'b, S: StateID> { 1218 fsm: &'a Imp<S>, 1219 prestate: PrefilterState, 1220 haystack: &'b [u8], 1221 pos: usize, 1222 state_id: S, 1223 match_index: usize, 1224 } 1225 1226 impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> { new( ac: &'a AhoCorasick<S>, haystack: &'b [u8], ) -> FindOverlappingIter<'a, 'b, S>1227 fn new( 1228 ac: &'a AhoCorasick<S>, 1229 haystack: &'b [u8], 1230 ) -> FindOverlappingIter<'a, 'b, S> { 1231 assert!( 1232 ac.supports_overlapping(), 1233 "automaton does not support overlapping searches" 1234 ); 1235 let prestate = PrefilterState::new(ac.max_pattern_len()); 1236 FindOverlappingIter { 1237 fsm: &ac.imp, 1238 prestate, 1239 haystack, 1240 pos: 0, 1241 state_id: ac.imp.start_state(), 1242 match_index: 0, 1243 } 1244 } 1245 } 1246 1247 impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> { 1248 type Item = Match; 1249 next(&mut self) -> Option<Match>1250 fn next(&mut self) -> Option<Match> { 1251 let result = self.fsm.overlapping_find_at( 1252 &mut self.prestate, 1253 self.haystack, 1254 self.pos, 1255 &mut self.state_id, 1256 &mut self.match_index, 1257 ); 1258 match result { 1259 None => return None, 1260 Some(m) => { 1261 self.pos = m.end(); 1262 Some(m) 1263 } 1264 } 1265 } 1266 } 1267 1268 /// An iterator that reports Aho-Corasick matches in a stream. 1269 /// 1270 /// This iterator yields elements of type `io::Result<Match>`, where an error 1271 /// is reported if there was a problem reading from the underlying stream. 1272 /// The iterator terminates only when the underlying stream reaches `EOF`. 1273 /// 1274 /// This iterator is constructed via the 1275 /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter) 1276 /// method. 1277 /// 1278 /// The type variable `R` refers to the `io::Read` stream that is being read 1279 /// from. 1280 /// 1281 /// The type variable `S` refers to the representation used for state 1282 /// identifiers. (By default, this is `usize`.) 1283 /// 1284 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. 1285 #[derive(Debug)] 1286 pub struct StreamFindIter<'a, R, S: StateID> { 1287 it: StreamChunkIter<'a, R, S>, 1288 } 1289 1290 impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> { new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S>1291 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> { 1292 StreamFindIter { it: StreamChunkIter::new(ac, rdr) } 1293 } 1294 } 1295 1296 impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> { 1297 type Item = io::Result<Match>; 1298 next(&mut self) -> Option<io::Result<Match>>1299 fn next(&mut self) -> Option<io::Result<Match>> { 1300 loop { 1301 match self.it.next() { 1302 None => return None, 1303 Some(Err(err)) => return Some(Err(err)), 1304 Some(Ok(StreamChunk::NonMatch { .. })) => {} 1305 Some(Ok(StreamChunk::Match { mat, .. })) => { 1306 return Some(Ok(mat)); 1307 } 1308 } 1309 } 1310 } 1311 } 1312 1313 /// An iterator over chunks in an underlying reader. Each chunk either 1314 /// corresponds to non-matching bytes or matching bytes, but all bytes from 1315 /// the underlying reader are reported in sequence. There may be an arbitrary 1316 /// number of non-matching chunks before seeing a matching chunk. 1317 /// 1318 /// N.B. This does not actually implement Iterator because we need to borrow 1319 /// from the underlying reader. But conceptually, it's still an iterator. 1320 #[derive(Debug)] 1321 struct StreamChunkIter<'a, R, S: StateID> { 1322 /// The AC automaton. 1323 fsm: &'a Imp<S>, 1324 /// State associated with this automaton's prefilter. It is a heuristic 1325 /// for stopping the prefilter if it's deemed ineffective. 1326 prestate: PrefilterState, 1327 /// The source of bytes we read from. 1328 rdr: R, 1329 /// A fixed size buffer. This is what we actually search. There are some 1330 /// invariants around the buffer's size, namely, it must be big enough to 1331 /// contain the longest possible match. 1332 buf: Buffer, 1333 /// The ID of the FSM state we're currently in. 1334 state_id: S, 1335 /// The current position at which to start the next search in `buf`. 1336 search_pos: usize, 1337 /// The absolute position of `search_pos`, where `0` corresponds to the 1338 /// position of the first byte read from `rdr`. 1339 absolute_pos: usize, 1340 /// The ending position of the last StreamChunk that was returned to the 1341 /// caller. This position is used to determine whether we need to emit 1342 /// non-matching bytes before emitting a match. 1343 report_pos: usize, 1344 /// A match that should be reported on the next call. 1345 pending_match: Option<Match>, 1346 /// Enabled only when the automaton can match the empty string. When 1347 /// enabled, we need to execute one final search after consuming the 1348 /// reader to find the trailing empty match. 1349 has_empty_match_at_end: bool, 1350 } 1351 1352 /// A single chunk yielded by the stream chunk iterator. 1353 /// 1354 /// The `'r` lifetime refers to the lifetime of the stream chunk iterator. 1355 #[derive(Debug)] 1356 enum StreamChunk<'r> { 1357 /// A chunk that does not contain any matches. 1358 NonMatch { bytes: &'r [u8] }, 1359 /// A chunk that precisely contains a match. 1360 Match { bytes: &'r [u8], mat: Match }, 1361 } 1362 1363 impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S>1364 fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> { 1365 assert!( 1366 ac.supports_stream(), 1367 "stream searching is only supported for Standard match semantics" 1368 ); 1369 1370 let prestate = if ac.imp.use_prefilter() { 1371 PrefilterState::new(ac.max_pattern_len()) 1372 } else { 1373 PrefilterState::disabled() 1374 }; 1375 let buf = Buffer::new(ac.imp.max_pattern_len()); 1376 let state_id = ac.imp.start_state(); 1377 StreamChunkIter { 1378 fsm: &ac.imp, 1379 prestate, 1380 rdr, 1381 buf, 1382 state_id, 1383 absolute_pos: 0, 1384 report_pos: 0, 1385 search_pos: 0, 1386 pending_match: None, 1387 has_empty_match_at_end: ac.is_match(""), 1388 } 1389 } 1390 next(&mut self) -> Option<io::Result<StreamChunk>>1391 fn next(&mut self) -> Option<io::Result<StreamChunk>> { 1392 loop { 1393 if let Some(mut mat) = self.pending_match.take() { 1394 let bytes = &self.buf.buffer()[mat.start()..mat.end()]; 1395 self.report_pos = mat.end(); 1396 mat = mat.increment(self.absolute_pos); 1397 return Some(Ok(StreamChunk::Match { bytes, mat })); 1398 } 1399 if self.search_pos >= self.buf.len() { 1400 if let Some(end) = self.unreported() { 1401 let bytes = &self.buf.buffer()[self.report_pos..end]; 1402 self.report_pos = end; 1403 return Some(Ok(StreamChunk::NonMatch { bytes })); 1404 } 1405 if self.buf.len() >= self.buf.min_buffer_len() { 1406 // This is the point at which we roll our buffer, which we 1407 // only do if our buffer has at least the minimum amount of 1408 // bytes in it. Before rolling, we update our various 1409 // positions to be consistent with the buffer after it has 1410 // been rolled. 1411 1412 self.report_pos -= 1413 self.buf.len() - self.buf.min_buffer_len(); 1414 self.absolute_pos += 1415 self.search_pos - self.buf.min_buffer_len(); 1416 self.search_pos = self.buf.min_buffer_len(); 1417 self.buf.roll(); 1418 } 1419 match self.buf.fill(&mut self.rdr) { 1420 Err(err) => return Some(Err(err)), 1421 Ok(false) => { 1422 // We've hit EOF, but if there are still some 1423 // unreported bytes remaining, return them now. 1424 if self.report_pos < self.buf.len() { 1425 let bytes = &self.buf.buffer()[self.report_pos..]; 1426 self.report_pos = self.buf.len(); 1427 1428 let chunk = StreamChunk::NonMatch { bytes }; 1429 return Some(Ok(chunk)); 1430 } else { 1431 // We've reported everything, but there might still 1432 // be a match at the very last position. 1433 if !self.has_empty_match_at_end { 1434 return None; 1435 } 1436 // fallthrough for another search to get trailing 1437 // empty matches 1438 self.has_empty_match_at_end = false; 1439 } 1440 } 1441 Ok(true) => {} 1442 } 1443 } 1444 let result = self.fsm.earliest_find_at( 1445 &mut self.prestate, 1446 self.buf.buffer(), 1447 self.search_pos, 1448 &mut self.state_id, 1449 ); 1450 match result { 1451 None => { 1452 self.search_pos = self.buf.len(); 1453 } 1454 Some(mat) => { 1455 self.state_id = self.fsm.start_state(); 1456 if mat.end() == self.search_pos { 1457 // If the automaton can match the empty string and if 1458 // we found an empty match, then we need to forcefully 1459 // move the position. 1460 self.search_pos += 1; 1461 } else { 1462 self.search_pos = mat.end(); 1463 } 1464 self.pending_match = Some(mat.clone()); 1465 if self.report_pos < mat.start() { 1466 let bytes = 1467 &self.buf.buffer()[self.report_pos..mat.start()]; 1468 self.report_pos = mat.start(); 1469 1470 let chunk = StreamChunk::NonMatch { bytes }; 1471 return Some(Ok(chunk)); 1472 } 1473 } 1474 } 1475 } 1476 } 1477 unreported(&self) -> Option<usize>1478 fn unreported(&self) -> Option<usize> { 1479 let end = self.search_pos.saturating_sub(self.buf.min_buffer_len()); 1480 if self.report_pos < end { 1481 Some(end) 1482 } else { 1483 None 1484 } 1485 } 1486 } 1487 1488 /// A builder for configuring an Aho-Corasick automaton. 1489 #[derive(Clone, Debug)] 1490 pub struct AhoCorasickBuilder { 1491 nfa_builder: nfa::Builder, 1492 dfa_builder: dfa::Builder, 1493 dfa: bool, 1494 } 1495 1496 impl Default for AhoCorasickBuilder { default() -> AhoCorasickBuilder1497 fn default() -> AhoCorasickBuilder { 1498 AhoCorasickBuilder::new() 1499 } 1500 } 1501 1502 impl AhoCorasickBuilder { 1503 /// Create a new builder for configuring an Aho-Corasick automaton. 1504 /// 1505 /// If you don't need fine grained configuration or aren't sure which knobs 1506 /// to set, try using 1507 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) 1508 /// instead. new() -> AhoCorasickBuilder1509 pub fn new() -> AhoCorasickBuilder { 1510 AhoCorasickBuilder { 1511 nfa_builder: nfa::Builder::new(), 1512 dfa_builder: dfa::Builder::new(), 1513 dfa: false, 1514 } 1515 } 1516 1517 /// Build an Aho-Corasick automaton using the configuration set on this 1518 /// builder. 1519 /// 1520 /// A builder may be reused to create more automatons. 1521 /// 1522 /// This method will use the default for representing internal state 1523 /// identifiers, which is `usize`. This guarantees that building the 1524 /// automaton will succeed and is generally a good default, but can make 1525 /// the size of the automaton 2-8 times bigger than it needs to be, 1526 /// depending on your target platform. 1527 /// 1528 /// # Examples 1529 /// 1530 /// Basic usage: 1531 /// 1532 /// ``` 1533 /// use aho_corasick::AhoCorasickBuilder; 1534 /// 1535 /// let patterns = &["foo", "bar", "baz"]; 1536 /// let ac = AhoCorasickBuilder::new() 1537 /// .build(patterns); 1538 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1539 /// ``` build<I, P>(&self, patterns: I) -> AhoCorasick where I: IntoIterator<Item = P>, P: AsRef<[u8]>,1540 pub fn build<I, P>(&self, patterns: I) -> AhoCorasick 1541 where 1542 I: IntoIterator<Item = P>, 1543 P: AsRef<[u8]>, 1544 { 1545 // The builder only returns an error if the chosen state ID 1546 // representation is too small to fit all of the given patterns. In 1547 // this case, since we fix the representation to usize, it will always 1548 // work because it's impossible to overflow usize since the underlying 1549 // storage would OOM long before that happens. 1550 self.build_with_size::<usize, I, P>(patterns) 1551 .expect("usize state ID type should always work") 1552 } 1553 1554 /// Build an Aho-Corasick automaton using the configuration set on this 1555 /// builder with a specific state identifier representation. This only has 1556 /// an effect when the `dfa` option is enabled. 1557 /// 1558 /// Generally, the choices for a state identifier representation are 1559 /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default. 1560 /// The advantage of choosing a smaller state identifier representation 1561 /// is that the automaton produced will be smaller. This might be 1562 /// beneficial for just generally using less space, or might even allow it 1563 /// to fit more of the automaton in your CPU's cache, leading to overall 1564 /// better search performance. 1565 /// 1566 /// Unlike the standard `build` method, this can report an error if the 1567 /// state identifier representation cannot support the size of the 1568 /// automaton. 1569 /// 1570 /// Note that the state identifier representation is determined by the 1571 /// `S` type variable. This requires a type hint of some sort, either 1572 /// by specifying the return type or using the turbofish, e.g., 1573 /// `build_with_size::<u16, _, _>(...)`. 1574 /// 1575 /// # Examples 1576 /// 1577 /// Basic usage: 1578 /// 1579 /// ``` 1580 /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder}; 1581 /// 1582 /// # fn example() -> Result<(), ::aho_corasick::Error> { 1583 /// let patterns = &["foo", "bar", "baz"]; 1584 /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new() 1585 /// .build_with_size(patterns)?; 1586 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1587 /// # Ok(()) }; example().unwrap() 1588 /// ``` 1589 /// 1590 /// Or alternatively, with turbofish: 1591 /// 1592 /// ``` 1593 /// use aho_corasick::AhoCorasickBuilder; 1594 /// 1595 /// # fn example() -> Result<(), ::aho_corasick::Error> { 1596 /// let patterns = &["foo", "bar", "baz"]; 1597 /// let ac = AhoCorasickBuilder::new() 1598 /// .build_with_size::<u8, _, _>(patterns)?; 1599 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1600 /// # Ok(()) }; example().unwrap() 1601 /// ``` build_with_size<S, I, P>( &self, patterns: I, ) -> Result<AhoCorasick<S>> where S: StateID, I: IntoIterator<Item = P>, P: AsRef<[u8]>,1602 pub fn build_with_size<S, I, P>( 1603 &self, 1604 patterns: I, 1605 ) -> Result<AhoCorasick<S>> 1606 where 1607 S: StateID, 1608 I: IntoIterator<Item = P>, 1609 P: AsRef<[u8]>, 1610 { 1611 let nfa = self.nfa_builder.build(patterns)?; 1612 let match_kind = nfa.match_kind().clone(); 1613 let imp = if self.dfa { 1614 let dfa = self.dfa_builder.build(&nfa)?; 1615 Imp::DFA(dfa) 1616 } else { 1617 Imp::NFA(nfa) 1618 }; 1619 Ok(AhoCorasick { imp, match_kind }) 1620 } 1621 1622 /// Automatically configure the settings on this builder according to the 1623 /// patterns that will be used to construct the automaton. 1624 /// 1625 /// The idea here is to balance space and time automatically. That is, when 1626 /// searching a small number of patterns, this will attempt to use the 1627 /// fastest possible configuration since the total space required will be 1628 /// small anyway. As the number of patterns grows, this will fall back to 1629 /// slower configurations that use less space. 1630 /// 1631 /// This is guaranteed to never set `match_kind`, but any other option may 1632 /// be overridden. 1633 /// 1634 /// # Examples 1635 /// 1636 /// Basic usage: 1637 /// 1638 /// ``` 1639 /// use aho_corasick::AhoCorasickBuilder; 1640 /// 1641 /// let patterns = &["foo", "bar", "baz"]; 1642 /// let ac = AhoCorasickBuilder::new() 1643 /// .auto_configure(patterns) 1644 /// .build(patterns); 1645 /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); 1646 /// ``` auto_configure<B: AsRef<[u8]>>( &mut self, patterns: &[B], ) -> &mut AhoCorasickBuilder1647 pub fn auto_configure<B: AsRef<[u8]>>( 1648 &mut self, 1649 patterns: &[B], 1650 ) -> &mut AhoCorasickBuilder { 1651 // N.B. Currently we only use the length of `patterns` to make a 1652 // decision here, and could therefore ask for an `ExactSizeIterator` 1653 // instead. But it's conceivable that we might adapt this to look at 1654 // the total number of bytes, which would requires a second pass. 1655 // 1656 // The logic here is fairly rudimentary at the moment, but probably 1657 // OK. The idea here is to use the fastest thing possible for a small 1658 // number of patterns. That is, a DFA with no byte classes, since byte 1659 // classes require an extra indirection for every byte searched. With a 1660 // moderate number of patterns, we still want a DFA, but save on both 1661 // space and compilation time by enabling byte classes. Finally, fall 1662 // back to the slower but smaller NFA. 1663 if patterns.len() <= 100 { 1664 // N.B. Using byte classes can actually be faster by improving 1665 // locality, but this only really applies for multi-megabyte 1666 // automata (i.e., automata that don't fit in your CPU's cache). 1667 self.dfa(true); 1668 } else if patterns.len() <= 5000 { 1669 self.dfa(true); 1670 } 1671 self 1672 } 1673 1674 /// Set the desired match semantics. 1675 /// 1676 /// The default is `MatchKind::Standard`, which corresponds to the match 1677 /// semantics supported by the standard textbook description of the 1678 /// Aho-Corasick algorithm. Namely, matches are reported as soon as they 1679 /// are found. Moreover, this is the only way to get overlapping matches 1680 /// or do stream searching. 1681 /// 1682 /// The other kinds of match semantics that are supported are 1683 /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former 1684 /// corresponds to the match you would get if you were to try to match 1685 /// each pattern at each position in the haystack in the same order that 1686 /// you give to the automaton. That is, it returns the leftmost match 1687 /// corresponding the earliest pattern given to the automaton. The latter 1688 /// corresponds to finding the longest possible match among all leftmost 1689 /// matches. 1690 /// 1691 /// For more details on match semantics, see the 1692 /// [documentation for `MatchKind`](enum.MatchKind.html). 1693 /// 1694 /// # Examples 1695 /// 1696 /// In these examples, we demonstrate the differences between match 1697 /// semantics for a particular set of patterns in a specific order: 1698 /// `b`, `abc`, `abcd`. 1699 /// 1700 /// Standard semantics: 1701 /// 1702 /// ``` 1703 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 1704 /// 1705 /// let patterns = &["b", "abc", "abcd"]; 1706 /// let haystack = "abcd"; 1707 /// 1708 /// let ac = AhoCorasickBuilder::new() 1709 /// .match_kind(MatchKind::Standard) // default, not necessary 1710 /// .build(patterns); 1711 /// let mat = ac.find(haystack).expect("should have a match"); 1712 /// assert_eq!("b", &haystack[mat.start()..mat.end()]); 1713 /// ``` 1714 /// 1715 /// Leftmost-first semantics: 1716 /// 1717 /// ``` 1718 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 1719 /// 1720 /// let patterns = &["b", "abc", "abcd"]; 1721 /// let haystack = "abcd"; 1722 /// 1723 /// let ac = AhoCorasickBuilder::new() 1724 /// .match_kind(MatchKind::LeftmostFirst) 1725 /// .build(patterns); 1726 /// let mat = ac.find(haystack).expect("should have a match"); 1727 /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); 1728 /// ``` 1729 /// 1730 /// Leftmost-longest semantics: 1731 /// 1732 /// ``` 1733 /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; 1734 /// 1735 /// let patterns = &["b", "abc", "abcd"]; 1736 /// let haystack = "abcd"; 1737 /// 1738 /// let ac = AhoCorasickBuilder::new() 1739 /// .match_kind(MatchKind::LeftmostLongest) 1740 /// .build(patterns); 1741 /// let mat = ac.find(haystack).expect("should have a match"); 1742 /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); 1743 /// ``` match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder1744 pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { 1745 self.nfa_builder.match_kind(kind); 1746 self 1747 } 1748 1749 /// Enable anchored mode, which requires all matches to start at the 1750 /// first position in a haystack. 1751 /// 1752 /// This option is disabled by default. 1753 /// 1754 /// # Examples 1755 /// 1756 /// Basic usage: 1757 /// 1758 /// ``` 1759 /// use aho_corasick::AhoCorasickBuilder; 1760 /// 1761 /// let patterns = &["foo", "bar"]; 1762 /// let haystack = "foobar"; 1763 /// 1764 /// let ac = AhoCorasickBuilder::new() 1765 /// .anchored(true) 1766 /// .build(patterns); 1767 /// assert_eq!(1, ac.find_iter(haystack).count()); 1768 /// ``` 1769 /// 1770 /// When searching for overlapping matches, all matches that start at 1771 /// the beginning of a haystack will be reported: 1772 /// 1773 /// ``` 1774 /// use aho_corasick::AhoCorasickBuilder; 1775 /// 1776 /// let patterns = &["foo", "foofoo"]; 1777 /// let haystack = "foofoo"; 1778 /// 1779 /// let ac = AhoCorasickBuilder::new() 1780 /// .anchored(true) 1781 /// .build(patterns); 1782 /// assert_eq!(2, ac.find_overlapping_iter(haystack).count()); 1783 /// // A non-anchored search would return 3 matches. 1784 /// ``` anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder1785 pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1786 self.nfa_builder.anchored(yes); 1787 self 1788 } 1789 1790 /// Enable ASCII-aware case insensitive matching. 1791 /// 1792 /// When this option is enabled, searching will be performed without 1793 /// respect to case for ASCII letters (`a-z` and `A-Z`) only. 1794 /// 1795 /// Enabling this option does not change the search algorithm, but it may 1796 /// increase the size of the automaton. 1797 /// 1798 /// **NOTE:** It is unlikely that support for Unicode case folding will 1799 /// be added in the future. The ASCII case works via a simple hack to the 1800 /// underlying automaton, but full Unicode handling requires a fair bit of 1801 /// sophistication. If you do need Unicode handling, you might consider 1802 /// using the [`regex` crate](https://docs.rs/regex) or the lower level 1803 /// [`regex-automata` crate](https://docs.rs/regex-automata). 1804 /// 1805 /// # Examples 1806 /// 1807 /// Basic usage: 1808 /// 1809 /// ``` 1810 /// use aho_corasick::AhoCorasickBuilder; 1811 /// 1812 /// let patterns = &["FOO", "bAr", "BaZ"]; 1813 /// let haystack = "foo bar baz"; 1814 /// 1815 /// let ac = AhoCorasickBuilder::new() 1816 /// .ascii_case_insensitive(true) 1817 /// .build(patterns); 1818 /// assert_eq!(3, ac.find_iter(haystack).count()); 1819 /// ``` ascii_case_insensitive( &mut self, yes: bool, ) -> &mut AhoCorasickBuilder1820 pub fn ascii_case_insensitive( 1821 &mut self, 1822 yes: bool, 1823 ) -> &mut AhoCorasickBuilder { 1824 self.nfa_builder.ascii_case_insensitive(yes); 1825 self 1826 } 1827 1828 /// Set the limit on how many NFA states use a dense representation for 1829 /// their transitions. 1830 /// 1831 /// A dense representation uses more space, but supports faster access to 1832 /// transitions at search time. Thus, this setting permits the control of a 1833 /// space vs time trade off when using the NFA variant of Aho-Corasick. 1834 /// 1835 /// This limit is expressed in terms of the depth of a state, i.e., the 1836 /// number of transitions from the starting state of the NFA. The idea is 1837 /// that most of the time searching will be spent near the starting state 1838 /// of the automaton, so states near the start state should use a dense 1839 /// representation. States further away from the start state would then use 1840 /// a sparse representation, which uses less space but is slower to access 1841 /// transitions at search time. 1842 /// 1843 /// By default, this is set to a low but non-zero number. 1844 /// 1845 /// This setting has no effect if the `dfa` option is enabled. dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder1846 pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder { 1847 self.nfa_builder.dense_depth(depth); 1848 self 1849 } 1850 1851 /// Compile the standard Aho-Corasick automaton into a deterministic finite 1852 /// automaton (DFA). 1853 /// 1854 /// When this is disabled (which is the default), then a non-deterministic 1855 /// finite automaton (NFA) is used instead. 1856 /// 1857 /// The main benefit to a DFA is that it can execute searches more quickly 1858 /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the 1859 /// DFA uses more space and can take much longer to build. 1860 /// 1861 /// Enabling this option does not change the time complexity for 1862 /// constructing the Aho-Corasick automaton (which is `O(p)` where 1863 /// `p` is the total number of patterns being compiled). Enabling this 1864 /// option does however reduce the time complexity of non-overlapping 1865 /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the 1866 /// haystack. 1867 /// 1868 /// In general, it's a good idea to enable this if you're searching a 1869 /// small number of fairly short patterns (~1000), or if you want the 1870 /// fastest possible search without regard to compilation time or space 1871 /// usage. dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder1872 pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1873 self.dfa = yes; 1874 self 1875 } 1876 1877 /// Enable heuristic prefilter optimizations. 1878 /// 1879 /// When enabled, searching will attempt to quickly skip to match 1880 /// candidates using specialized literal search routines. A prefilter 1881 /// cannot always be used, and is generally treated as a heuristic. It 1882 /// can be useful to disable this if the prefilter is observed to be 1883 /// sub-optimal for a particular workload. 1884 /// 1885 /// This is enabled by default. prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder1886 pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1887 self.nfa_builder.prefilter(yes); 1888 self 1889 } 1890 1891 /// Shrink the size of the transition alphabet by mapping bytes to their 1892 /// equivalence classes. This only has an effect when the `dfa` option is 1893 /// enabled. 1894 /// 1895 /// When enabled, each a DFA will use a map from all possible bytes 1896 /// to their corresponding equivalence class. Each equivalence class 1897 /// represents a set of bytes that does not discriminate between a match 1898 /// and a non-match in the DFA. For example, the patterns `bar` and `baz` 1899 /// have at least five equivalence classes: singleton sets of `b`, `a`, `r` 1900 /// and `z`, and a final set that contains every other byte. 1901 /// 1902 /// The advantage of this map is that the size of the transition table can 1903 /// be reduced drastically from `#states * 256 * sizeof(id)` to 1904 /// `#states * k * sizeof(id)` where `k` is the number of equivalence 1905 /// classes. As a result, total space usage can decrease substantially. 1906 /// Moreover, since a smaller alphabet is used, compilation becomes faster 1907 /// as well. 1908 /// 1909 /// The disadvantage of this map is that every byte searched must be 1910 /// passed through this map before it can be used to determine the next 1911 /// transition. This has a small match time performance cost. However, if 1912 /// the DFA is otherwise very large without byte classes, then using byte 1913 /// classes can greatly improve memory locality and thus lead to better 1914 /// overall performance. 1915 /// 1916 /// This option is enabled by default. 1917 #[deprecated( 1918 since = "0.7.16", 1919 note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" 1920 )] byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder1921 pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1922 self.dfa_builder.byte_classes(yes); 1923 self 1924 } 1925 1926 /// Premultiply state identifiers in the transition table. This only has 1927 /// an effect when the `dfa` option is enabled. 1928 /// 1929 /// When enabled, state identifiers are premultiplied to point to their 1930 /// corresponding row in the transition table. That is, given the `i`th 1931 /// state, its corresponding premultiplied identifier is `i * k` where `k` 1932 /// is the alphabet size of the automaton. (The alphabet size is at most 1933 /// 256, but is in practice smaller if byte classes is enabled.) 1934 /// 1935 /// When state identifiers are not premultiplied, then the identifier of 1936 /// the `i`th state is `i`. 1937 /// 1938 /// The advantage of premultiplying state identifiers is that is saves a 1939 /// multiplication instruction per byte when searching with a DFA. This has 1940 /// been observed to lead to a 20% performance benefit in micro-benchmarks. 1941 /// 1942 /// The primary disadvantage of premultiplying state identifiers is 1943 /// that they require a larger integer size to represent. For example, 1944 /// if the DFA has 200 states, then its premultiplied form requires 16 1945 /// bits to represent every possible state identifier, where as its 1946 /// non-premultiplied form only requires 8 bits. 1947 /// 1948 /// This option is enabled by default. 1949 #[deprecated( 1950 since = "0.7.16", 1951 note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" 1952 )] premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder1953 pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { 1954 self.dfa_builder.premultiply(yes); 1955 self 1956 } 1957 } 1958 1959 /// A knob for controlling the match semantics of an Aho-Corasick automaton. 1960 /// 1961 /// There are two generally different ways that Aho-Corasick automatons can 1962 /// report matches. The first way is the "standard" approach that results from 1963 /// implementing most textbook explanations of Aho-Corasick. The second way is 1964 /// to report only the leftmost non-overlapping matches. The leftmost approach 1965 /// is in turn split into two different ways of resolving ambiguous matches: 1966 /// leftmost-first and leftmost-longest. 1967 /// 1968 /// The `Standard` match kind is the default and is the only one that supports 1969 /// overlapping matches and stream searching. (Trying to find overlapping 1970 /// or streaming matches using leftmost match semantics will result in a 1971 /// panic.) The `Standard` match kind will report matches as they are seen. 1972 /// When searching for overlapping matches, then all possible matches are 1973 /// reported. When searching for non-overlapping matches, the first match seen 1974 /// is reported. For example, for non-overlapping matches, given the patterns 1975 /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is 1976 /// reported since it is detected first. The `abcd` match is never reported 1977 /// since it overlaps with the `b` match. 1978 /// 1979 /// In contrast, the leftmost match kind always prefers the leftmost match 1980 /// among all possible matches. Given the same example as above with `abcd` and 1981 /// `b` as patterns and `abcdef` as the subject string, the leftmost match is 1982 /// `abcd` since it begins before the `b` match, even though the `b` match is 1983 /// detected before the `abcd` match. In this case, the `b` match is not 1984 /// reported at all since it overlaps with the `abcd` match. 1985 /// 1986 /// The difference between leftmost-first and leftmost-longest is in how they 1987 /// resolve ambiguous matches when there are multiple leftmost matches to 1988 /// choose from. Leftmost-first always chooses the pattern that was provided 1989 /// earliest, where as leftmost-longest always chooses the longest matching 1990 /// pattern. For example, given the patterns `a` and `ab` and the subject 1991 /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match 1992 /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., 1993 /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches 1994 /// would be `ab`. Stated differently, the leftmost-first match depends on the 1995 /// order in which the patterns were given to the Aho-Corasick automaton. 1996 /// Because of that, when leftmost-first matching is used, if a pattern `A` 1997 /// that appears before a pattern `B` is a prefix of `B`, then it is impossible 1998 /// to ever observe a match of `B`. 1999 /// 2000 /// If you're not sure which match kind to pick, then stick with the standard 2001 /// kind, which is the default. In particular, if you need overlapping or 2002 /// streaming matches, then you _must_ use the standard kind. The leftmost 2003 /// kinds are useful in specific circumstances. For example, leftmost-first can 2004 /// be very useful as a way to implement match priority based on the order of 2005 /// patterns given and leftmost-longest can be useful for dictionary searching 2006 /// such that only the longest matching words are reported. 2007 /// 2008 /// # Relationship with regular expression alternations 2009 /// 2010 /// Understanding match semantics can be a little tricky, and one easy way 2011 /// to conceptualize non-overlapping matches from an Aho-Corasick automaton 2012 /// is to think about them as a simple alternation of literals in a regular 2013 /// expression. For example, let's say we wanted to match the strings 2014 /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It 2015 /// turns out that regular expression engines have two different ways of 2016 /// matching this alternation. The first way, leftmost-longest, is commonly 2017 /// found in POSIX compatible implementations of regular expressions (such as 2018 /// `grep`). The second way, leftmost-first, is commonly found in backtracking 2019 /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's 2020 /// regex engine do not use backtracking, but still implement leftmost-first 2021 /// semantics in an effort to match the behavior of dominant backtracking 2022 /// regex engines such as those found in Perl, Ruby, Python, Javascript and 2023 /// PHP.) 2024 /// 2025 /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex 2026 /// will match `Samwise` because it is the longest possible match, but a 2027 /// Perl-like regex will match `Sam` since it appears earlier in the 2028 /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine 2029 /// will never match `Samwise` since `Sam` will always have higher priority. 2030 /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to 2031 /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is 2032 /// still longest match, but it also appears earlier than `Sam`. 2033 /// 2034 /// The "standard" match semantics of Aho-Corasick generally don't correspond 2035 /// to the match semantics of any large group of regex implementations, so 2036 /// there's no direct analogy that can be made here. Standard match semantics 2037 /// are generally useful for overlapping matches, or if you just want to see 2038 /// matches as they are detected. 2039 /// 2040 /// The main conclusion to draw from this section is that the match semantics 2041 /// can be tweaked to precisely match either Perl-like regex alternations or 2042 /// POSIX regex alternations. 2043 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 2044 pub enum MatchKind { 2045 /// Use standard match semantics, which support overlapping matches. When 2046 /// used with non-overlapping matches, matches are reported as they are 2047 /// seen. 2048 Standard, 2049 /// Use leftmost-first match semantics, which reports leftmost matches. 2050 /// When there are multiple possible leftmost matches, the match 2051 /// corresponding to the pattern that appeared earlier when constructing 2052 /// the automaton is reported. 2053 /// 2054 /// This does **not** support overlapping matches or stream searching. If 2055 /// this match kind is used, attempting to find overlapping matches or 2056 /// stream matches will panic. 2057 LeftmostFirst, 2058 /// Use leftmost-longest match semantics, which reports leftmost matches. 2059 /// When there are multiple possible leftmost matches, the longest match 2060 /// is chosen. 2061 /// 2062 /// This does **not** support overlapping matches or stream searching. If 2063 /// this match kind is used, attempting to find overlapping matches or 2064 /// stream matches will panic. 2065 LeftmostLongest, 2066 /// Hints that destructuring should not be exhaustive. 2067 /// 2068 /// This enum may grow additional variants, so this makes sure clients 2069 /// don't count on exhaustive matching. (Otherwise, adding a new variant 2070 /// could break existing code.) 2071 #[doc(hidden)] 2072 __Nonexhaustive, 2073 } 2074 2075 /// The default match kind is `MatchKind::Standard`. 2076 impl Default for MatchKind { default() -> MatchKind2077 fn default() -> MatchKind { 2078 MatchKind::Standard 2079 } 2080 } 2081 2082 impl MatchKind { supports_overlapping(&self) -> bool2083 fn supports_overlapping(&self) -> bool { 2084 self.is_standard() 2085 } 2086 supports_stream(&self) -> bool2087 fn supports_stream(&self) -> bool { 2088 // TODO: It may be possible to support this. It's hard. 2089 // 2090 // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838 2091 self.is_standard() 2092 } 2093 is_standard(&self) -> bool2094 pub(crate) fn is_standard(&self) -> bool { 2095 *self == MatchKind::Standard 2096 } 2097 is_leftmost(&self) -> bool2098 pub(crate) fn is_leftmost(&self) -> bool { 2099 *self == MatchKind::LeftmostFirst 2100 || *self == MatchKind::LeftmostLongest 2101 } 2102 is_leftmost_first(&self) -> bool2103 pub(crate) fn is_leftmost_first(&self) -> bool { 2104 *self == MatchKind::LeftmostFirst 2105 } 2106 2107 /// Convert this match kind into a packed match kind. If this match kind 2108 /// corresponds to standard semantics, then this returns None, since 2109 /// packed searching does not support standard semantics. as_packed(&self) -> Option<packed::MatchKind>2110 pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> { 2111 match *self { 2112 MatchKind::Standard => None, 2113 MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst), 2114 MatchKind::LeftmostLongest => { 2115 Some(packed::MatchKind::LeftmostLongest) 2116 } 2117 MatchKind::__Nonexhaustive => unreachable!(), 2118 } 2119 } 2120 } 2121 2122 #[cfg(test)] 2123 mod tests { 2124 use super::*; 2125 2126 #[test] oibits()2127 fn oibits() { 2128 use std::panic::{RefUnwindSafe, UnwindSafe}; 2129 2130 fn assert_send<T: Send>() {} 2131 fn assert_sync<T: Sync>() {} 2132 fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {} 2133 2134 assert_send::<AhoCorasick>(); 2135 assert_sync::<AhoCorasick>(); 2136 assert_unwind_safe::<AhoCorasick>(); 2137 assert_send::<AhoCorasickBuilder>(); 2138 assert_sync::<AhoCorasickBuilder>(); 2139 assert_unwind_safe::<AhoCorasickBuilder>(); 2140 } 2141 } 2142