• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use crate::ahocorasick::MatchKind;
2 use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
3 use crate::state_id::{dead_id, fail_id, StateID};
4 use crate::Match;
5 
6 // NOTE: This trait essentially started as a copy of the same trait from from
7 // regex-automata, with some wording changed since we use this trait for
8 // NFAs in addition to DFAs in this crate. Additionally, we do not export
9 // this trait. It's only used internally to reduce code duplication. The
10 // regex-automata crate needs to expose it because its Regex type is generic
11 // over implementations of this trait. In this crate, we encapsulate everything
12 // behind the AhoCorasick type.
13 //
14 // This trait is a bit of a mess, but it's not quite clear how to fix it.
15 // Basically, there are several competing concerns:
16 //
17 // * We need performance, so everything effectively needs to get monomorphized.
18 // * There are several variations on searching Aho-Corasick automatons:
19 //   overlapping, standard and leftmost. Overlapping and standard are somewhat
20 //   combined together below, but there is no real way to combine standard with
21 //   leftmost. Namely, leftmost requires continuing a search even after a match
22 //   is found, in order to correctly disambiguate a match.
23 // * On top of that, *sometimes* callers want to know which state the automaton
24 //   is in after searching. This is principally useful for overlapping and
25 //   stream searches. However, when callers don't care about this, we really
26 //   do not want to be forced to compute it, since it sometimes requires extra
27 //   work. Thus, there are effectively two copies of leftmost searching: one
28 //   for tracking the state ID and one that doesn't. We should ideally do the
29 //   same for standard searching, but my sanity stopped me.
30 
31 // SAFETY RATIONALE: Previously, the code below went to some length to remove
32 // all bounds checks. This generally produced tighter assembly and lead to
33 // 20-50% improvements in micro-benchmarks on corpora made up of random
34 // characters. This somewhat makes sense, since the branch predictor is going
35 // to be at its worse on random text.
36 //
37 // However, using the aho-corasick-debug tool and manually benchmarking
38 // different inputs, the code *with* bounds checks actually wound up being
39 // slightly faster:
40 //
41 //     $ cat input
42 //     Sherlock Holmes
43 //     John Watson
44 //     Professor Moriarty
45 //     Irene Adler
46 //     Mary Watson
47 //
48 //     $ aho-corasick-debug-safe \
49 //         input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
50 //     pattern read time: 32.824µs
51 //     automaton build time: 444.687µs
52 //     automaton heap usage: 72392 bytes
53 //     match count: 639
54 //     count time: 1.809961702s
55 //
56 //     $ aho-corasick-debug-master \
57 //         input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
58 //     pattern read time: 31.425µs
59 //     automaton build time: 317.434µs
60 //     automaton heap usage: 72392 bytes
61 //     match count: 639
62 //     count time: 2.059157705s
63 //
64 // I was able to reproduce this result on two different machines (an i5 and
65 // an i7). Therefore, we go the route of safe code for now.
66 
67 /// A trait describing the interface of an Aho-Corasick finite state machine.
68 ///
69 /// Every automaton has exactly one fail state, one dead state and exactly one
70 /// start state. Generally, these correspond to the first, second and third
71 /// states, respectively. The dead state is always treated as a sentinel. That
72 /// is, no correct Aho-Corasick automaton will ever transition into the fail
73 /// state. The dead state, however, can be transitioned into, but only when
74 /// leftmost-first or leftmost-longest match semantics are enabled and only
75 /// when at least one match has been observed.
76 ///
77 /// Every automaton also has one or more match states, such that
78 /// `Automaton::is_match_state(id)` returns `true` if and only if `id`
79 /// corresponds to a match state.
80 pub trait Automaton {
81     /// The representation used for state identifiers in this automaton.
82     ///
83     /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
84     type ID: StateID;
85 
86     /// The type of matching that should be done.
match_kind(&self) -> &MatchKind87     fn match_kind(&self) -> &MatchKind;
88 
89     /// Returns true if and only if this automaton uses anchored searches.
anchored(&self) -> bool90     fn anchored(&self) -> bool;
91 
92     /// An optional prefilter for quickly skipping to the next candidate match.
93     /// A prefilter must report at least every match, although it may report
94     /// positions that do not correspond to a match. That is, it must not allow
95     /// false negatives, but can allow false positives.
96     ///
97     /// Currently, a prefilter only runs when the automaton is in the start
98     /// state. That is, the position reported by a prefilter should always
99     /// correspond to the start of a potential match.
prefilter(&self) -> Option<&dyn Prefilter>100     fn prefilter(&self) -> Option<&dyn Prefilter>;
101 
102     /// Return the identifier of this automaton's start state.
start_state(&self) -> Self::ID103     fn start_state(&self) -> Self::ID;
104 
105     /// Returns true if and only if the given state identifier refers to a
106     /// valid state.
is_valid(&self, id: Self::ID) -> bool107     fn is_valid(&self, id: Self::ID) -> bool;
108 
109     /// Returns true if and only if the given identifier corresponds to a match
110     /// state.
111     ///
112     /// The state ID given must be valid, or else implementors may panic.
is_match_state(&self, id: Self::ID) -> bool113     fn is_match_state(&self, id: Self::ID) -> bool;
114 
115     /// Returns true if and only if the given identifier corresponds to a state
116     /// that is either the dead state or a match state.
117     ///
118     /// Depending on the implementation of the automaton, this routine can
119     /// be used to save a branch in the core matching loop. Nevertheless,
120     /// `is_match_state(id) || id == dead_id()` is always a valid
121     /// implementation. Indeed, this is the default implementation.
122     ///
123     /// The state ID given must be valid, or else implementors may panic.
is_match_or_dead_state(&self, id: Self::ID) -> bool124     fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
125         id == dead_id() || self.is_match_state(id)
126     }
127 
128     /// If the given state is a match state, return the match corresponding
129     /// to the given match index. `end` must be the ending position of the
130     /// detected match. If no match exists or if `match_index` exceeds the
131     /// number of matches in this state, then `None` is returned.
132     ///
133     /// The state ID given must be valid, or else implementors may panic.
134     ///
135     /// If the given state ID is correct and if the `match_index` is less than
136     /// the number of matches for that state, then this is guaranteed to return
137     /// a match.
get_match( &self, id: Self::ID, match_index: usize, end: usize, ) -> Option<Match>138     fn get_match(
139         &self,
140         id: Self::ID,
141         match_index: usize,
142         end: usize,
143     ) -> Option<Match>;
144 
145     /// Returns the number of matches for the given state. If the given state
146     /// is not a match state, then this returns 0.
147     ///
148     /// The state ID given must be valid, or else implementors must panic.
match_count(&self, id: Self::ID) -> usize149     fn match_count(&self, id: Self::ID) -> usize;
150 
151     /// Given the current state that this automaton is in and the next input
152     /// byte, this method returns the identifier of the next state. The
153     /// identifier returned must always be valid and may never correspond to
154     /// the fail state. The returned identifier may, however, point to the
155     /// dead state.
156     ///
157     /// This is not safe so that implementors may look up the next state
158     /// without memory safety checks such as bounds checks. As such, callers
159     /// must ensure that the given identifier corresponds to a valid automaton
160     /// state. Implementors must, in turn, ensure that this routine is safe for
161     /// all valid state identifiers and for all possible `u8` values.
next_state(&self, current: Self::ID, input: u8) -> Self::ID162     fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
163 
164     /// Like next_state, but debug_asserts that the underlying
165     /// implementation never returns a `fail_id()` for the next state.
next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID166     fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
167         let next = self.next_state(current, input);
168         // We should never see a transition to the failure state.
169         debug_assert!(
170             next != fail_id(),
171             "automaton should never return fail_id for next state"
172         );
173         next
174     }
175 
176     /// Execute a search using standard match semantics.
177     ///
178     /// This can be used even when the automaton was constructed with leftmost
179     /// match semantics when you want to find the earliest possible match. This
180     /// can also be used as part of an overlapping search implementation.
181     ///
182     /// N.B. This does not report a match if `state_id` is given as a matching
183     /// state. As such, this should not be used directly.
184     #[inline(always)]
standard_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>185     fn standard_find_at(
186         &self,
187         prestate: &mut PrefilterState,
188         haystack: &[u8],
189         at: usize,
190         state_id: &mut Self::ID,
191     ) -> Option<Match> {
192         if let Some(pre) = self.prefilter() {
193             self.standard_find_at_imp(
194                 prestate,
195                 Some(pre),
196                 haystack,
197                 at,
198                 state_id,
199             )
200         } else {
201             self.standard_find_at_imp(prestate, None, haystack, at, state_id)
202         }
203     }
204 
205     // It's important for this to always be inlined. Namely, its only caller
206     // is standard_find_at, and the inlining should remove the case analysis
207     // for prefilter scanning when there is no prefilter available.
208     #[inline(always)]
standard_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option<Match>209     fn standard_find_at_imp(
210         &self,
211         prestate: &mut PrefilterState,
212         prefilter: Option<&dyn Prefilter>,
213         haystack: &[u8],
214         mut at: usize,
215         state_id: &mut Self::ID,
216     ) -> Option<Match> {
217         while at < haystack.len() {
218             if let Some(pre) = prefilter {
219                 if prestate.is_effective(at) && *state_id == self.start_state()
220                 {
221                     let c = prefilter::next(prestate, pre, haystack, at)
222                         .into_option();
223                     match c {
224                         None => return None,
225                         Some(i) => {
226                             at = i;
227                         }
228                     }
229                 }
230             }
231             // CORRECTNESS: next_state is correct for all possible u8 values,
232             // so the only thing we're concerned about is the validity of
233             // `state_id`. `state_id` either comes from the caller (in which
234             // case, we assume it is correct), or it comes from the return
235             // value of next_state, which is guaranteed to be correct.
236             *state_id = self.next_state_no_fail(*state_id, haystack[at]);
237             at += 1;
238             // This routine always quits immediately after seeing a
239             // match, and since dead states can only come after seeing
240             // a match, seeing a dead state here is impossible. (Unless
241             // we have an anchored automaton, in which case, dead states
242             // are used to stop a search.)
243             debug_assert!(
244                 *state_id != dead_id() || self.anchored(),
245                 "standard find should never see a dead state"
246             );
247 
248             if self.is_match_or_dead_state(*state_id) {
249                 return if *state_id == dead_id() {
250                     None
251                 } else {
252                     self.get_match(*state_id, 0, at)
253                 };
254             }
255         }
256         None
257     }
258 
259     /// Execute a search using leftmost (either first or longest) match
260     /// semantics.
261     ///
262     /// The principle difference between searching with standard semantics and
263     /// searching with leftmost semantics is that leftmost searching will
264     /// continue searching even after a match has been found. Once a match
265     /// is found, the search does not stop until either the haystack has been
266     /// exhausted or a dead state is observed in the automaton. (Dead states
267     /// only exist in automatons constructed with leftmost semantics.) That is,
268     /// we rely on the construction of the automaton to tell us when to quit.
269     #[inline(never)]
leftmost_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>270     fn leftmost_find_at(
271         &self,
272         prestate: &mut PrefilterState,
273         haystack: &[u8],
274         at: usize,
275         state_id: &mut Self::ID,
276     ) -> Option<Match> {
277         if let Some(pre) = self.prefilter() {
278             self.leftmost_find_at_imp(
279                 prestate,
280                 Some(pre),
281                 haystack,
282                 at,
283                 state_id,
284             )
285         } else {
286             self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
287         }
288     }
289 
290     // It's important for this to always be inlined. Namely, its only caller
291     // is leftmost_find_at, and the inlining should remove the case analysis
292     // for prefilter scanning when there is no prefilter available.
293     #[inline(always)]
leftmost_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option<Match>294     fn leftmost_find_at_imp(
295         &self,
296         prestate: &mut PrefilterState,
297         prefilter: Option<&dyn Prefilter>,
298         haystack: &[u8],
299         mut at: usize,
300         state_id: &mut Self::ID,
301     ) -> Option<Match> {
302         debug_assert!(self.match_kind().is_leftmost());
303         if self.anchored() && at > 0 && *state_id == self.start_state() {
304             return None;
305         }
306         let mut last_match = self.get_match(*state_id, 0, at);
307         while at < haystack.len() {
308             if let Some(pre) = prefilter {
309                 if prestate.is_effective(at) && *state_id == self.start_state()
310                 {
311                     let c = prefilter::next(prestate, pre, haystack, at)
312                         .into_option();
313                     match c {
314                         None => return None,
315                         Some(i) => {
316                             at = i;
317                         }
318                     }
319                 }
320             }
321             // CORRECTNESS: next_state is correct for all possible u8 values,
322             // so the only thing we're concerned about is the validity of
323             // `state_id`. `state_id` either comes from the caller (in which
324             // case, we assume it is correct), or it comes from the return
325             // value of next_state, which is guaranteed to be correct.
326             *state_id = self.next_state_no_fail(*state_id, haystack[at]);
327             at += 1;
328             if self.is_match_or_dead_state(*state_id) {
329                 if *state_id == dead_id() {
330                     // The only way to enter into a dead state is if a match
331                     // has been found, so we assert as much. This is different
332                     // from normal automata, where you might enter a dead state
333                     // if you know a subsequent match will never be found
334                     // (regardless of whether a match has already been found).
335                     // For Aho-Corasick, it is built so that we can match at
336                     // any position, so the possibility of a match always
337                     // exists.
338                     //
339                     // (Unless we have an anchored automaton, in which case,
340                     // dead states are used to stop a search.)
341                     debug_assert!(
342                         last_match.is_some() || self.anchored(),
343                         "dead state should only be seen after match"
344                     );
345                     return last_match;
346                 }
347                 last_match = self.get_match(*state_id, 0, at);
348             }
349         }
350         last_match
351     }
352 
353     /// This is like leftmost_find_at, but does not need to track a caller
354     /// provided state id. In other words, the only output of this routine is a
355     /// match, if one exists.
356     ///
357     /// It is regrettable that we need to effectively copy a chunk of
358     /// implementation twice, but when we don't need to track the state ID, we
359     /// can allow the prefilter to report matches immediately without having
360     /// to re-confirm them with the automaton. The re-confirmation step is
361     /// necessary in leftmost_find_at because tracing through the automaton is
362     /// the only way to correctly set the state ID. (Perhaps an alternative
363     /// would be to keep a map from pattern ID to matching state ID, but that
364     /// complicates the code and still doesn't permit us to defer to the
365     /// prefilter entirely when possible.)
366     ///
367     /// I did try a few things to avoid the code duplication here, but nothing
368     /// optimized as well as this approach. (In microbenchmarks, there was
369     /// about a 25% difference.)
370     #[inline(never)]
leftmost_find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>371     fn leftmost_find_at_no_state(
372         &self,
373         prestate: &mut PrefilterState,
374         haystack: &[u8],
375         at: usize,
376     ) -> Option<Match> {
377         if let Some(pre) = self.prefilter() {
378             self.leftmost_find_at_no_state_imp(
379                 prestate,
380                 Some(pre),
381                 haystack,
382                 at,
383             )
384         } else {
385             self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
386         }
387     }
388 
389     // It's important for this to always be inlined. Namely, its only caller
390     // is leftmost_find_at_no_state, and the inlining should remove the case
391     // analysis for prefilter scanning when there is no prefilter available.
392     #[inline(always)]
leftmost_find_at_no_state_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, ) -> Option<Match>393     fn leftmost_find_at_no_state_imp(
394         &self,
395         prestate: &mut PrefilterState,
396         prefilter: Option<&dyn Prefilter>,
397         haystack: &[u8],
398         mut at: usize,
399     ) -> Option<Match> {
400         debug_assert!(self.match_kind().is_leftmost());
401         if self.anchored() && at > 0 {
402             return None;
403         }
404         // If our prefilter handles confirmation of matches 100% of the
405         // time, and since we don't need to track state IDs, we can avoid
406         // Aho-Corasick completely.
407         if let Some(pre) = prefilter {
408             // We should never have a prefilter during an anchored search.
409             debug_assert!(!self.anchored());
410             if !pre.reports_false_positives() {
411                 return match pre.next_candidate(prestate, haystack, at) {
412                     Candidate::None => None,
413                     Candidate::Match(m) => Some(m),
414                     Candidate::PossibleStartOfMatch(_) => unreachable!(),
415                 };
416             }
417         }
418 
419         let mut state_id = self.start_state();
420         let mut last_match = self.get_match(state_id, 0, at);
421         while at < haystack.len() {
422             if let Some(pre) = prefilter {
423                 if prestate.is_effective(at) && state_id == self.start_state()
424                 {
425                     match prefilter::next(prestate, pre, haystack, at) {
426                         Candidate::None => return None,
427                         // Since we aren't tracking a state ID, we can
428                         // quit early once we know we have a match.
429                         Candidate::Match(m) => return Some(m),
430                         Candidate::PossibleStartOfMatch(i) => {
431                             at = i;
432                         }
433                     }
434                 }
435             }
436             // CORRECTNESS: next_state is correct for all possible u8 values,
437             // so the only thing we're concerned about is the validity of
438             // `state_id`. `state_id` either comes from the caller (in which
439             // case, we assume it is correct), or it comes from the return
440             // value of next_state, which is guaranteed to be correct.
441             state_id = self.next_state_no_fail(state_id, haystack[at]);
442             at += 1;
443             if self.is_match_or_dead_state(state_id) {
444                 if state_id == dead_id() {
445                     // The only way to enter into a dead state is if a
446                     // match has been found, so we assert as much. This
447                     // is different from normal automata, where you might
448                     // enter a dead state if you know a subsequent match
449                     // will never be found (regardless of whether a match
450                     // has already been found). For Aho-Corasick, it is
451                     // built so that we can match at any position, so the
452                     // possibility of a match always exists.
453                     //
454                     // (Unless we have an anchored automaton, in which
455                     // case, dead states are used to stop a search.)
456                     debug_assert!(
457                         last_match.is_some() || self.anchored(),
458                         "dead state should only be seen after match"
459                     );
460                     return last_match;
461                 }
462                 last_match = self.get_match(state_id, 0, at);
463             }
464         }
465         last_match
466     }
467 
468     /// Execute an overlapping search.
469     ///
470     /// When executing an overlapping match, the previous state ID in addition
471     /// to the previous match index should be given. If there are more matches
472     /// at the given state, then the match is reported and the given index is
473     /// incremented.
474     #[inline(always)]
overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, match_index: &mut usize, ) -> Option<Match>475     fn overlapping_find_at(
476         &self,
477         prestate: &mut PrefilterState,
478         haystack: &[u8],
479         at: usize,
480         state_id: &mut Self::ID,
481         match_index: &mut usize,
482     ) -> Option<Match> {
483         if self.anchored() && at > 0 && *state_id == self.start_state() {
484             return None;
485         }
486 
487         let match_count = self.match_count(*state_id);
488         if *match_index < match_count {
489             // This is guaranteed to return a match since
490             // match_index < match_count.
491             let result = self.get_match(*state_id, *match_index, at);
492             debug_assert!(result.is_some(), "must be a match");
493             *match_index += 1;
494             return result;
495         }
496 
497         *match_index = 0;
498         match self.standard_find_at(prestate, haystack, at, state_id) {
499             None => None,
500             Some(m) => {
501                 *match_index = 1;
502                 Some(m)
503             }
504         }
505     }
506 
507     /// Return the earliest match found. This returns as soon as we know that
508     /// we have a match. As such, this does not necessarily correspond to the
509     /// leftmost starting match, but rather, the leftmost position at which a
510     /// match ends.
511     #[inline(always)]
earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>512     fn earliest_find_at(
513         &self,
514         prestate: &mut PrefilterState,
515         haystack: &[u8],
516         at: usize,
517         state_id: &mut Self::ID,
518     ) -> Option<Match> {
519         if *state_id == self.start_state() {
520             if self.anchored() && at > 0 {
521                 return None;
522             }
523             if let Some(m) = self.get_match(*state_id, 0, at) {
524                 return Some(m);
525             }
526         }
527         self.standard_find_at(prestate, haystack, at, state_id)
528     }
529 
530     /// A convenience function for finding the next match according to the
531     /// match semantics of this automaton. For standard match semantics, this
532     /// finds the earliest match. Otherwise, the leftmost match is found.
533     #[inline(always)]
find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option<Match>534     fn find_at(
535         &self,
536         prestate: &mut PrefilterState,
537         haystack: &[u8],
538         at: usize,
539         state_id: &mut Self::ID,
540     ) -> Option<Match> {
541         match *self.match_kind() {
542             MatchKind::Standard => {
543                 self.earliest_find_at(prestate, haystack, at, state_id)
544             }
545             MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
546                 self.leftmost_find_at(prestate, haystack, at, state_id)
547             }
548             MatchKind::__Nonexhaustive => unreachable!(),
549         }
550     }
551 
552     /// Like find_at, but does not track state identifiers. This permits some
553     /// optimizations when a prefilter that confirms its own matches is
554     /// present.
555     #[inline(always)]
find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option<Match>556     fn find_at_no_state(
557         &self,
558         prestate: &mut PrefilterState,
559         haystack: &[u8],
560         at: usize,
561     ) -> Option<Match> {
562         match *self.match_kind() {
563             MatchKind::Standard => {
564                 let mut state = self.start_state();
565                 self.earliest_find_at(prestate, haystack, at, &mut state)
566             }
567             MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => {
568                 self.leftmost_find_at_no_state(prestate, haystack, at)
569             }
570             MatchKind::__Nonexhaustive => unreachable!(),
571         }
572     }
573 }
574