1 use crate::ahocorasick::MatchKind; 2 use crate::prefilter::{self, Candidate, Prefilter, PrefilterState}; 3 use crate::state_id::{dead_id, fail_id, StateID}; 4 use crate::Match; 5 6 // NOTE: This trait essentially started as a copy of the same trait from from 7 // regex-automata, with some wording changed since we use this trait for 8 // NFAs in addition to DFAs in this crate. Additionally, we do not export 9 // this trait. It's only used internally to reduce code duplication. The 10 // regex-automata crate needs to expose it because its Regex type is generic 11 // over implementations of this trait. In this crate, we encapsulate everything 12 // behind the AhoCorasick type. 13 // 14 // This trait is a bit of a mess, but it's not quite clear how to fix it. 15 // Basically, there are several competing concerns: 16 // 17 // * We need performance, so everything effectively needs to get monomorphized. 18 // * There are several variations on searching Aho-Corasick automatons: 19 // overlapping, standard and leftmost. Overlapping and standard are somewhat 20 // combined together below, but there is no real way to combine standard with 21 // leftmost. Namely, leftmost requires continuing a search even after a match 22 // is found, in order to correctly disambiguate a match. 23 // * On top of that, *sometimes* callers want to know which state the automaton 24 // is in after searching. This is principally useful for overlapping and 25 // stream searches. However, when callers don't care about this, we really 26 // do not want to be forced to compute it, since it sometimes requires extra 27 // work. Thus, there are effectively two copies of leftmost searching: one 28 // for tracking the state ID and one that doesn't. We should ideally do the 29 // same for standard searching, but my sanity stopped me. 30 31 // SAFETY RATIONALE: Previously, the code below went to some length to remove 32 // all bounds checks. This generally produced tighter assembly and lead to 33 // 20-50% improvements in micro-benchmarks on corpora made up of random 34 // characters. This somewhat makes sense, since the branch predictor is going 35 // to be at its worse on random text. 36 // 37 // However, using the aho-corasick-debug tool and manually benchmarking 38 // different inputs, the code *with* bounds checks actually wound up being 39 // slightly faster: 40 // 41 // $ cat input 42 // Sherlock Holmes 43 // John Watson 44 // Professor Moriarty 45 // Irene Adler 46 // Mary Watson 47 // 48 // $ aho-corasick-debug-safe \ 49 // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa 50 // pattern read time: 32.824µs 51 // automaton build time: 444.687µs 52 // automaton heap usage: 72392 bytes 53 // match count: 639 54 // count time: 1.809961702s 55 // 56 // $ aho-corasick-debug-master \ 57 // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa 58 // pattern read time: 31.425µs 59 // automaton build time: 317.434µs 60 // automaton heap usage: 72392 bytes 61 // match count: 639 62 // count time: 2.059157705s 63 // 64 // I was able to reproduce this result on two different machines (an i5 and 65 // an i7). Therefore, we go the route of safe code for now. 66 67 /// A trait describing the interface of an Aho-Corasick finite state machine. 68 /// 69 /// Every automaton has exactly one fail state, one dead state and exactly one 70 /// start state. Generally, these correspond to the first, second and third 71 /// states, respectively. The dead state is always treated as a sentinel. That 72 /// is, no correct Aho-Corasick automaton will ever transition into the fail 73 /// state. The dead state, however, can be transitioned into, but only when 74 /// leftmost-first or leftmost-longest match semantics are enabled and only 75 /// when at least one match has been observed. 76 /// 77 /// Every automaton also has one or more match states, such that 78 /// `Automaton::is_match_state(id)` returns `true` if and only if `id` 79 /// corresponds to a match state. 80 pub trait Automaton { 81 /// The representation used for state identifiers in this automaton. 82 /// 83 /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. 84 type ID: StateID; 85 86 /// The type of matching that should be done. match_kind(&self) -> &MatchKind87 fn match_kind(&self) -> &MatchKind; 88 89 /// Returns true if and only if this automaton uses anchored searches. anchored(&self) -> bool90 fn anchored(&self) -> bool; 91 92 /// An optional prefilter for quickly skipping to the next candidate match. 93 /// A prefilter must report at least every match, although it may report 94 /// positions that do not correspond to a match. That is, it must not allow 95 /// false negatives, but can allow false positives. 96 /// 97 /// Currently, a prefilter only runs when the automaton is in the start 98 /// state. That is, the position reported by a prefilter should always 99 /// correspond to the start of a potential match. prefilter(&self) -> Option<&dyn Prefilter>100 fn prefilter(&self) -> Option<&dyn Prefilter>; 101 102 /// Return the identifier of this automaton's start state. start_state(&self) -> Self::ID103 fn start_state(&self) -> Self::ID; 104 105 /// Returns true if and only if the given state identifier refers to a 106 /// valid state. is_valid(&self, id: Self::ID) -> bool107 fn is_valid(&self, id: Self::ID) -> bool; 108 109 /// Returns true if and only if the given identifier corresponds to a match 110 /// state. 111 /// 112 /// The state ID given must be valid, or else implementors may panic. is_match_state(&self, id: Self::ID) -> bool113 fn is_match_state(&self, id: Self::ID) -> bool; 114 115 /// Returns true if and only if the given identifier corresponds to a state 116 /// that is either the dead state or a match state. 117 /// 118 /// Depending on the implementation of the automaton, this routine can 119 /// be used to save a branch in the core matching loop. Nevertheless, 120 /// `is_match_state(id) || id == dead_id()` is always a valid 121 /// implementation. Indeed, this is the default implementation. 122 /// 123 /// The state ID given must be valid, or else implementors may panic. is_match_or_dead_state(&self, id: Self::ID) -> bool124 fn is_match_or_dead_state(&self, id: Self::ID) -> bool { 125 id == dead_id() || self.is_match_state(id) 126 } 127 128 /// If the given state is a match state, return the match corresponding 129 /// to the given match index. `end` must be the ending position of the 130 /// detected match. If no match exists or if `match_index` exceeds the 131 /// number of matches in this state, then `None` is returned. 132 /// 133 /// The state ID given must be valid, or else implementors may panic. 134 /// 135 /// If the given state ID is correct and if the `match_index` is less than 136 /// the number of matches for that state, then this is guaranteed to return 137 /// a match. get_match( &self, id: Self::ID, match_index: usize, end: usize, ) -> Option<Match>138 fn get_match( 139 &self, 140 id: Self::ID, 141 match_index: usize, 142 end: usize, 143 ) -> Option<Match>; 144 145 /// Returns the number of matches for the given state. If the given state 146 /// is not a match state, then this returns 0. 147 /// 148 /// The state ID given must be valid, or else implementors must panic. match_count(&self, id: Self::ID) -> usize149 fn match_count(&self, id: Self::ID) -> usize; 150 151 /// Given the current state that this automaton is in and the next input 152 /// byte, this method returns the identifier of the next state. The 153 /// identifier returned must always be valid and may never correspond to 154 /// the fail state. The returned identifier may, however, point to the 155 /// dead state. 156 /// 157 /// This is not safe so that implementors may look up the next state 158 /// without memory safety checks such as bounds checks. As such, callers 159 /// must ensure that the given identifier corresponds to a valid automaton 160 /// state. Implementors must, in turn, ensure that this routine is safe for 161 /// all valid state identifiers and for all possible `u8` values. next_state(&self, current: Self::ID, input: u8) -> Self::ID162 fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; 163 164 /// Like next_state, but debug_asserts that the underlying 165 /// implementation never returns a `fail_id()` for the next state. next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID166 fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { 167 let next = self.next_state(current, input); 168 // We should never see a transition to the failure state. 169 debug_assert!( 170 next != fail_id(), 171 "automaton should never return fail_id for next state" 172 ); 173 next 174 } 175 176 /// Execute a search using standard match semantics. 177 /// 178 /// This can be used even when the automaton was constructed with leftmost 179 /// match semantics when you want to find the earliest possible match. This 180 /// can also be used as part of an overlapping search implementation. 181 /// 182 /// N.B. This does not report a match if `state_id` is given as a matching 183 /// state. As such, this should not be used directly. 184 #[inline(always)] standard_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>185 fn standard_find_at( 186 &self, 187 prestate: &mut PrefilterState, 188 haystack: &[u8], 189 at: usize, 190 state_id: &mut Self::ID, 191 ) -> Option<Match> { 192 if let Some(pre) = self.prefilter() { 193 self.standard_find_at_imp( 194 prestate, 195 Some(pre), 196 haystack, 197 at, 198 state_id, 199 ) 200 } else { 201 self.standard_find_at_imp(prestate, None, haystack, at, state_id) 202 } 203 } 204 205 // It's important for this to always be inlined. Namely, its only caller 206 // is standard_find_at, and the inlining should remove the case analysis 207 // for prefilter scanning when there is no prefilter available. 208 #[inline(always)] standard_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option<Match>209 fn standard_find_at_imp( 210 &self, 211 prestate: &mut PrefilterState, 212 prefilter: Option<&dyn Prefilter>, 213 haystack: &[u8], 214 mut at: usize, 215 state_id: &mut Self::ID, 216 ) -> Option<Match> { 217 while at < haystack.len() { 218 if let Some(pre) = prefilter { 219 if prestate.is_effective(at) && *state_id == self.start_state() 220 { 221 let c = prefilter::next(prestate, pre, haystack, at) 222 .into_option(); 223 match c { 224 None => return None, 225 Some(i) => { 226 at = i; 227 } 228 } 229 } 230 } 231 // CORRECTNESS: next_state is correct for all possible u8 values, 232 // so the only thing we're concerned about is the validity of 233 // `state_id`. `state_id` either comes from the caller (in which 234 // case, we assume it is correct), or it comes from the return 235 // value of next_state, which is guaranteed to be correct. 236 *state_id = self.next_state_no_fail(*state_id, haystack[at]); 237 at += 1; 238 // This routine always quits immediately after seeing a 239 // match, and since dead states can only come after seeing 240 // a match, seeing a dead state here is impossible. (Unless 241 // we have an anchored automaton, in which case, dead states 242 // are used to stop a search.) 243 debug_assert!( 244 *state_id != dead_id() || self.anchored(), 245 "standard find should never see a dead state" 246 ); 247 248 if self.is_match_or_dead_state(*state_id) { 249 return if *state_id == dead_id() { 250 None 251 } else { 252 self.get_match(*state_id, 0, at) 253 }; 254 } 255 } 256 None 257 } 258 259 /// Execute a search using leftmost (either first or longest) match 260 /// semantics. 261 /// 262 /// The principle difference between searching with standard semantics and 263 /// searching with leftmost semantics is that leftmost searching will 264 /// continue searching even after a match has been found. Once a match 265 /// is found, the search does not stop until either the haystack has been 266 /// exhausted or a dead state is observed in the automaton. (Dead states 267 /// only exist in automatons constructed with leftmost semantics.) That is, 268 /// we rely on the construction of the automaton to tell us when to quit. 269 #[inline(never)] leftmost_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>270 fn leftmost_find_at( 271 &self, 272 prestate: &mut PrefilterState, 273 haystack: &[u8], 274 at: usize, 275 state_id: &mut Self::ID, 276 ) -> Option<Match> { 277 if let Some(pre) = self.prefilter() { 278 self.leftmost_find_at_imp( 279 prestate, 280 Some(pre), 281 haystack, 282 at, 283 state_id, 284 ) 285 } else { 286 self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) 287 } 288 } 289 290 // It's important for this to always be inlined. Namely, its only caller 291 // is leftmost_find_at, and the inlining should remove the case analysis 292 // for prefilter scanning when there is no prefilter available. 293 #[inline(always)] leftmost_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option<Match>294 fn leftmost_find_at_imp( 295 &self, 296 prestate: &mut PrefilterState, 297 prefilter: Option<&dyn Prefilter>, 298 haystack: &[u8], 299 mut at: usize, 300 state_id: &mut Self::ID, 301 ) -> Option<Match> { 302 debug_assert!(self.match_kind().is_leftmost()); 303 if self.anchored() && at > 0 && *state_id == self.start_state() { 304 return None; 305 } 306 let mut last_match = self.get_match(*state_id, 0, at); 307 while at < haystack.len() { 308 if let Some(pre) = prefilter { 309 if prestate.is_effective(at) && *state_id == self.start_state() 310 { 311 let c = prefilter::next(prestate, pre, haystack, at) 312 .into_option(); 313 match c { 314 None => return None, 315 Some(i) => { 316 at = i; 317 } 318 } 319 } 320 } 321 // CORRECTNESS: next_state is correct for all possible u8 values, 322 // so the only thing we're concerned about is the validity of 323 // `state_id`. `state_id` either comes from the caller (in which 324 // case, we assume it is correct), or it comes from the return 325 // value of next_state, which is guaranteed to be correct. 326 *state_id = self.next_state_no_fail(*state_id, haystack[at]); 327 at += 1; 328 if self.is_match_or_dead_state(*state_id) { 329 if *state_id == dead_id() { 330 // The only way to enter into a dead state is if a match 331 // has been found, so we assert as much. This is different 332 // from normal automata, where you might enter a dead state 333 // if you know a subsequent match will never be found 334 // (regardless of whether a match has already been found). 335 // For Aho-Corasick, it is built so that we can match at 336 // any position, so the possibility of a match always 337 // exists. 338 // 339 // (Unless we have an anchored automaton, in which case, 340 // dead states are used to stop a search.) 341 debug_assert!( 342 last_match.is_some() || self.anchored(), 343 "dead state should only be seen after match" 344 ); 345 return last_match; 346 } 347 last_match = self.get_match(*state_id, 0, at); 348 } 349 } 350 last_match 351 } 352 353 /// This is like leftmost_find_at, but does not need to track a caller 354 /// provided state id. In other words, the only output of this routine is a 355 /// match, if one exists. 356 /// 357 /// It is regrettable that we need to effectively copy a chunk of 358 /// implementation twice, but when we don't need to track the state ID, we 359 /// can allow the prefilter to report matches immediately without having 360 /// to re-confirm them with the automaton. The re-confirmation step is 361 /// necessary in leftmost_find_at because tracing through the automaton is 362 /// the only way to correctly set the state ID. (Perhaps an alternative 363 /// would be to keep a map from pattern ID to matching state ID, but that 364 /// complicates the code and still doesn't permit us to defer to the 365 /// prefilter entirely when possible.) 366 /// 367 /// I did try a few things to avoid the code duplication here, but nothing 368 /// optimized as well as this approach. (In microbenchmarks, there was 369 /// about a 25% difference.) 370 #[inline(never)] leftmost_find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>371 fn leftmost_find_at_no_state( 372 &self, 373 prestate: &mut PrefilterState, 374 haystack: &[u8], 375 at: usize, 376 ) -> Option<Match> { 377 if let Some(pre) = self.prefilter() { 378 self.leftmost_find_at_no_state_imp( 379 prestate, 380 Some(pre), 381 haystack, 382 at, 383 ) 384 } else { 385 self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) 386 } 387 } 388 389 // It's important for this to always be inlined. Namely, its only caller 390 // is leftmost_find_at_no_state, and the inlining should remove the case 391 // analysis for prefilter scanning when there is no prefilter available. 392 #[inline(always)] leftmost_find_at_no_state_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, ) -> Option<Match>393 fn leftmost_find_at_no_state_imp( 394 &self, 395 prestate: &mut PrefilterState, 396 prefilter: Option<&dyn Prefilter>, 397 haystack: &[u8], 398 mut at: usize, 399 ) -> Option<Match> { 400 debug_assert!(self.match_kind().is_leftmost()); 401 if self.anchored() && at > 0 { 402 return None; 403 } 404 // If our prefilter handles confirmation of matches 100% of the 405 // time, and since we don't need to track state IDs, we can avoid 406 // Aho-Corasick completely. 407 if let Some(pre) = prefilter { 408 // We should never have a prefilter during an anchored search. 409 debug_assert!(!self.anchored()); 410 if !pre.reports_false_positives() { 411 return match pre.next_candidate(prestate, haystack, at) { 412 Candidate::None => None, 413 Candidate::Match(m) => Some(m), 414 Candidate::PossibleStartOfMatch(_) => unreachable!(), 415 }; 416 } 417 } 418 419 let mut state_id = self.start_state(); 420 let mut last_match = self.get_match(state_id, 0, at); 421 while at < haystack.len() { 422 if let Some(pre) = prefilter { 423 if prestate.is_effective(at) && state_id == self.start_state() 424 { 425 match prefilter::next(prestate, pre, haystack, at) { 426 Candidate::None => return None, 427 // Since we aren't tracking a state ID, we can 428 // quit early once we know we have a match. 429 Candidate::Match(m) => return Some(m), 430 Candidate::PossibleStartOfMatch(i) => { 431 at = i; 432 } 433 } 434 } 435 } 436 // CORRECTNESS: next_state is correct for all possible u8 values, 437 // so the only thing we're concerned about is the validity of 438 // `state_id`. `state_id` either comes from the caller (in which 439 // case, we assume it is correct), or it comes from the return 440 // value of next_state, which is guaranteed to be correct. 441 state_id = self.next_state_no_fail(state_id, haystack[at]); 442 at += 1; 443 if self.is_match_or_dead_state(state_id) { 444 if state_id == dead_id() { 445 // The only way to enter into a dead state is if a 446 // match has been found, so we assert as much. This 447 // is different from normal automata, where you might 448 // enter a dead state if you know a subsequent match 449 // will never be found (regardless of whether a match 450 // has already been found). For Aho-Corasick, it is 451 // built so that we can match at any position, so the 452 // possibility of a match always exists. 453 // 454 // (Unless we have an anchored automaton, in which 455 // case, dead states are used to stop a search.) 456 debug_assert!( 457 last_match.is_some() || self.anchored(), 458 "dead state should only be seen after match" 459 ); 460 return last_match; 461 } 462 last_match = self.get_match(state_id, 0, at); 463 } 464 } 465 last_match 466 } 467 468 /// Execute an overlapping search. 469 /// 470 /// When executing an overlapping match, the previous state ID in addition 471 /// to the previous match index should be given. If there are more matches 472 /// at the given state, then the match is reported and the given index is 473 /// incremented. 474 #[inline(always)] overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, match_index: &mut usize, ) -> Option<Match>475 fn overlapping_find_at( 476 &self, 477 prestate: &mut PrefilterState, 478 haystack: &[u8], 479 at: usize, 480 state_id: &mut Self::ID, 481 match_index: &mut usize, 482 ) -> Option<Match> { 483 if self.anchored() && at > 0 && *state_id == self.start_state() { 484 return None; 485 } 486 487 let match_count = self.match_count(*state_id); 488 if *match_index < match_count { 489 // This is guaranteed to return a match since 490 // match_index < match_count. 491 let result = self.get_match(*state_id, *match_index, at); 492 debug_assert!(result.is_some(), "must be a match"); 493 *match_index += 1; 494 return result; 495 } 496 497 *match_index = 0; 498 match self.standard_find_at(prestate, haystack, at, state_id) { 499 None => None, 500 Some(m) => { 501 *match_index = 1; 502 Some(m) 503 } 504 } 505 } 506 507 /// Return the earliest match found. This returns as soon as we know that 508 /// we have a match. As such, this does not necessarily correspond to the 509 /// leftmost starting match, but rather, the leftmost position at which a 510 /// match ends. 511 #[inline(always)] earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>512 fn earliest_find_at( 513 &self, 514 prestate: &mut PrefilterState, 515 haystack: &[u8], 516 at: usize, 517 state_id: &mut Self::ID, 518 ) -> Option<Match> { 519 if *state_id == self.start_state() { 520 if self.anchored() && at > 0 { 521 return None; 522 } 523 if let Some(m) = self.get_match(*state_id, 0, at) { 524 return Some(m); 525 } 526 } 527 self.standard_find_at(prestate, haystack, at, state_id) 528 } 529 530 /// A convenience function for finding the next match according to the 531 /// match semantics of this automaton. For standard match semantics, this 532 /// finds the earliest match. Otherwise, the leftmost match is found. 533 #[inline(always)] find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>534 fn find_at( 535 &self, 536 prestate: &mut PrefilterState, 537 haystack: &[u8], 538 at: usize, 539 state_id: &mut Self::ID, 540 ) -> Option<Match> { 541 match *self.match_kind() { 542 MatchKind::Standard => { 543 self.earliest_find_at(prestate, haystack, at, state_id) 544 } 545 MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { 546 self.leftmost_find_at(prestate, haystack, at, state_id) 547 } 548 MatchKind::__Nonexhaustive => unreachable!(), 549 } 550 } 551 552 /// Like find_at, but does not track state identifiers. This permits some 553 /// optimizations when a prefilter that confirms its own matches is 554 /// present. 555 #[inline(always)] find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>556 fn find_at_no_state( 557 &self, 558 prestate: &mut PrefilterState, 559 haystack: &[u8], 560 at: usize, 561 ) -> Option<Match> { 562 match *self.match_kind() { 563 MatchKind::Standard => { 564 let mut state = self.start_state(); 565 self.earliest_find_at(prestate, haystack, at, &mut state) 566 } 567 MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { 568 self.leftmost_find_at_no_state(prestate, haystack, at) 569 } 570 MatchKind::__Nonexhaustive => unreachable!(), 571 } 572 } 573 } 574