• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::cmp;
2 use std::collections::{BTreeSet, VecDeque};
3 use std::fmt;
4 use std::mem::size_of;
5 use std::ops::{Index, IndexMut};
6 
7 use crate::ahocorasick::MatchKind;
8 use crate::automaton::Automaton;
9 use crate::classes::{ByteClassBuilder, ByteClasses};
10 use crate::error::Result;
11 use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
12 use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
13 use crate::Match;
14 
/// The identifier for a pattern, which is simply the position of the pattern
/// in the sequence of patterns given by the caller.
pub type PatternID = usize;

/// The length of a pattern, in bytes.
pub type PatternLength = usize;
21 
/// An Aho-Corasick automaton, represented as an NFA.
///
/// This is the classical formulation of Aho-Corasick, which involves building
/// up a prefix trie of a given set of patterns, and then wiring up failure
/// transitions between states in order to guarantee linear time matching. The
/// standard formulation is, technically, an NFA because of these failure
/// transitions. That is, one can see them as enabling the automaton to be in
/// multiple states at once. Indeed, during search, it is possible to check
/// the transitions on multiple states for a single input byte.
///
/// This particular implementation not only supports the standard style of
/// matching, but also provides a mode for choosing leftmost-first or
/// leftmost-longest match semantics. When a leftmost mode is chosen, some
/// failure transitions that would otherwise be added are elided. See
/// the documentation of `MatchKind` for more details and examples on how the
/// match semantics may differ.
///
/// If one wants a DFA, then it is necessary to first build an NFA and convert
/// it into a DFA. Note, however, that because we've constrained ourselves to
/// matching literal patterns, this does not need to use subset construction
/// for determinization. Instead, the DFA has at most a number of states
/// equivalent to the number of NFA states. The only real difference between
/// them is that all failure transitions are followed and pre-computed. This
/// uses much more memory, but also executes searches more quickly.
#[derive(Clone)]
pub struct NFA<S> {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// The start state id as an index into `states`.
    start_id: S,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The total number of patterns added to this automaton, including
    /// patterns that may never be matched.
    pattern_count: usize,
    /// The number of bytes of heap used by this NFA's transition table.
    heap_bytes: usize,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<PrefilterObj>,
    /// Whether this automaton anchors all matches to the start of input.
    anchored: bool,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State<S>>,
}
89 
90 impl<S: StateID> NFA<S> {
91     /// Returns the equivalence classes of bytes found while constructing
92     /// this NFA.
93     ///
94     /// Note that the NFA doesn't actually make use of these equivalence
95     /// classes. Instead, these are useful for building the DFA when desired.
byte_classes(&self) -> &ByteClasses96     pub fn byte_classes(&self) -> &ByteClasses {
97         &self.byte_classes
98     }
99 
100     /// Returns a prefilter, if one exists.
prefilter_obj(&self) -> Option<&PrefilterObj>101     pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
102         self.prefilter.as_ref()
103     }
104 
105     /// Returns the total number of heap bytes used by this NFA's transition
106     /// table.
heap_bytes(&self) -> usize107     pub fn heap_bytes(&self) -> usize {
108         self.heap_bytes
109             + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
110     }
111 
112     /// Return the length of the longest pattern in this automaton.
max_pattern_len(&self) -> usize113     pub fn max_pattern_len(&self) -> usize {
114         self.max_pattern_len
115     }
116 
117     /// Return the total number of patterns added to this automaton.
pattern_count(&self) -> usize118     pub fn pattern_count(&self) -> usize {
119         self.pattern_count
120     }
121 
122     /// Returns the total number of states in this NFA.
state_len(&self) -> usize123     pub fn state_len(&self) -> usize {
124         self.states.len()
125     }
126 
127     /// Returns the matches for the given state.
matches(&self, id: S) -> &[(PatternID, PatternLength)]128     pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
129         &self.states[id.to_usize()].matches
130     }
131 
132     /// Returns an iterator over all transitions in the given state according
133     /// to the given equivalence classes, including transitions to `fail_id()`.
134     /// The number of transitions returned is always equivalent to the number
135     /// of equivalence classes.
iter_all_transitions<F: FnMut(u8, S)>( &self, byte_classes: &ByteClasses, id: S, f: F, )136     pub fn iter_all_transitions<F: FnMut(u8, S)>(
137         &self,
138         byte_classes: &ByteClasses,
139         id: S,
140         f: F,
141     ) {
142         self.states[id.to_usize()].trans.iter_all(byte_classes, f);
143     }
144 
145     /// Returns the failure transition for the given state.
failure_transition(&self, id: S) -> S146     pub fn failure_transition(&self, id: S) -> S {
147         self.states[id.to_usize()].fail
148     }
149 
150     /// Returns the next state for the given state and input byte.
151     ///
152     /// Note that this does not follow failure transitions. As such, the id
153     /// returned may be `fail_id`.
next_state(&self, current: S, input: u8) -> S154     pub fn next_state(&self, current: S, input: u8) -> S {
155         self.states[current.to_usize()].next_state(input)
156     }
157 
state(&self, id: S) -> &State<S>158     fn state(&self, id: S) -> &State<S> {
159         &self.states[id.to_usize()]
160     }
161 
state_mut(&mut self, id: S) -> &mut State<S>162     fn state_mut(&mut self, id: S) -> &mut State<S> {
163         &mut self.states[id.to_usize()]
164     }
165 
start(&self) -> &State<S>166     fn start(&self) -> &State<S> {
167         self.state(self.start_id)
168     }
169 
start_mut(&mut self) -> &mut State<S>170     fn start_mut(&mut self) -> &mut State<S> {
171         let id = self.start_id;
172         self.state_mut(id)
173     }
174 
iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S>175     fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
176         IterTransitionsMut::new(self, id)
177     }
178 
copy_matches(&mut self, src: S, dst: S)179     fn copy_matches(&mut self, src: S, dst: S) {
180         let (src, dst) =
181             get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
182         dst.matches.extend_from_slice(&src.matches);
183     }
184 
copy_empty_matches(&mut self, dst: S)185     fn copy_empty_matches(&mut self, dst: S) {
186         let start_id = self.start_id;
187         self.copy_matches(start_id, dst);
188     }
189 
add_dense_state(&mut self, depth: usize) -> Result<S>190     fn add_dense_state(&mut self, depth: usize) -> Result<S> {
191         let trans = Transitions::Dense(Dense::new());
192         let id = usize_to_state_id(self.states.len())?;
193         self.states.push(State {
194             trans,
195             // Anchored automatons do not have any failure transitions.
196             fail: if self.anchored { dead_id() } else { self.start_id },
197             depth,
198             matches: vec![],
199         });
200         Ok(id)
201     }
202 
add_sparse_state(&mut self, depth: usize) -> Result<S>203     fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
204         let trans = Transitions::Sparse(vec![]);
205         let id = usize_to_state_id(self.states.len())?;
206         self.states.push(State {
207             trans,
208             // Anchored automatons do not have any failure transitions.
209             fail: if self.anchored { dead_id() } else { self.start_id },
210             depth,
211             matches: vec![],
212         });
213         Ok(id)
214     }
215 }
216 
217 impl<S: StateID> Automaton for NFA<S> {
218     type ID = S;
219 
match_kind(&self) -> &MatchKind220     fn match_kind(&self) -> &MatchKind {
221         &self.match_kind
222     }
223 
anchored(&self) -> bool224     fn anchored(&self) -> bool {
225         self.anchored
226     }
227 
prefilter(&self) -> Option<&dyn Prefilter>228     fn prefilter(&self) -> Option<&dyn Prefilter> {
229         self.prefilter.as_ref().map(|p| p.as_ref())
230     }
231 
start_state(&self) -> S232     fn start_state(&self) -> S {
233         self.start_id
234     }
235 
is_valid(&self, id: S) -> bool236     fn is_valid(&self, id: S) -> bool {
237         id.to_usize() < self.states.len()
238     }
239 
is_match_state(&self, id: S) -> bool240     fn is_match_state(&self, id: S) -> bool {
241         self.states[id.to_usize()].is_match()
242     }
243 
get_match( &self, id: S, match_index: usize, end: usize, ) -> Option<Match>244     fn get_match(
245         &self,
246         id: S,
247         match_index: usize,
248         end: usize,
249     ) -> Option<Match> {
250         let state = match self.states.get(id.to_usize()) {
251             None => return None,
252             Some(state) => state,
253         };
254         state.matches.get(match_index).map(|&(id, len)| Match {
255             pattern: id,
256             len,
257             end,
258         })
259     }
260 
match_count(&self, id: S) -> usize261     fn match_count(&self, id: S) -> usize {
262         self.states[id.to_usize()].matches.len()
263     }
264 
next_state(&self, mut current: S, input: u8) -> S265     fn next_state(&self, mut current: S, input: u8) -> S {
266         // This terminates since:
267         //
268         // 1. `State.fail` never points to fail_id().
269         // 2. All `State.fail` values point to a state closer to `start`.
270         // 3. The start state has no transitions to fail_id().
271         loop {
272             let state = &self.states[current.to_usize()];
273             let next = state.next_state(input);
274             if next != fail_id() {
275                 return next;
276             }
277             current = state.fail;
278         }
279     }
280 }
281 
/// A representation of an NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte,
/// the matches implied by visiting this state (if any) and the depth of this
/// state. The depth of a state is simply the distance from it to the start
/// state in the automaton, where the depth of the start state is 0.
#[derive(Clone, Debug)]
pub struct State<S> {
    /// The transitions out of this state, in either a sparse or a dense
    /// representation.
    trans: Transitions<S>,
    /// The failure transition, followed when `trans` has no entry for the
    /// current input byte.
    fail: S,
    /// The patterns that end at this state, as pairs of
    /// (pattern index, pattern length).
    matches: Vec<(PatternID, PatternLength)>,
    // TODO: Strictly speaking, this isn't needed for searching. It's only
    // used when building an NFA that supports leftmost match semantics. We
    // could drop this from the state and dynamically build a map only when
    // computing failure transitions, but it's not clear which is better.
    // Benchmark this.
    depth: usize,
}
301 
impl<S: StateID> State<S> {
    /// Returns the number of heap bytes used by this state: its transitions
    /// plus its recorded matches.
    fn heap_bytes(&self) -> usize {
        self.trans.heap_bytes()
            + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
    }

    /// Records that pattern `i`, with length `len`, ends at this state.
    fn add_match(&mut self, i: PatternID, len: PatternLength) {
        self.matches.push((i, len));
    }

    /// Returns true if and only if at least one pattern ends at this state.
    fn is_match(&self) -> bool {
        !self.matches.is_empty()
    }

    /// Returns the next state for `input`, or `fail_id()` if there is no
    /// transition for `input`. Failure transitions are not followed.
    fn next_state(&self, input: u8) -> S {
        self.trans.next_state(input)
    }

    /// Sets the transition for `input` to `next`, overwriting any existing
    /// transition.
    fn set_next_state(&mut self, input: u8, next: S) {
        self.trans.set_next_state(input, next);
    }
}
324 
/// Represents the transitions for a single dense state.
///
/// The primary purpose here is to encapsulate index access. Namely, since a
/// dense representation always contains 256 elements, all values of `u8` are
/// valid indices.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>);
332 
impl<S> Dense<S>
where
    S: StateID,
{
    /// Creates a dense transition table with one `fail_id()` entry for every
    /// possible byte value.
    fn new() -> Self {
        Dense(vec![fail_id(); 256])
    }

    /// Returns the number of entries in this table (always 256).
    #[inline]
    fn len(&self) -> usize {
        self.0.len()
    }
}
346 
impl<S> Index<u8> for Dense<S> {
    type Output = S;

    #[inline]
    fn index(&self, i: u8) -> &S {
        // This indexing can never panic: all dense transition tables have
        // exactly 256 elements, so every u8 value is a valid index.
        &self.0[i as usize]
    }
}
357 
impl<S> IndexMut<u8> for Dense<S> {
    #[inline]
    fn index_mut(&mut self, i: u8) -> &mut S {
        // This indexing can never panic: all dense transition tables have
        // exactly 256 elements, so every u8 value is a valid index.
        &mut self.0[i as usize]
    }
}
366 
/// A representation of transitions in an NFA.
///
/// Transitions have either a sparse representation, which is slower for
/// lookups but uses less memory, or a dense representation, which is faster
/// for lookups but uses more memory. In the sparse representation, the absence
/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
/// are still explicitly represented.
///
/// For the NFA, by default, we use a dense representation for transitions for
/// states close to the start state because it's likely these are the states
/// that will be most frequently visited.
#[derive(Clone, Debug)]
enum Transitions<S> {
    /// Sorted list of (byte, next state) pairs; absent bytes mean a
    /// transition to `fail_id()`.
    Sparse(Vec<(u8, S)>),
    /// One entry per possible byte value.
    Dense(Dense<S>),
}
383 
impl<S: StateID> Transitions<S> {
    /// Returns the number of heap bytes used by this transition
    /// representation.
    fn heap_bytes(&self) -> usize {
        match *self {
            Transitions::Sparse(ref sparse) => {
                sparse.len() * size_of::<(u8, S)>()
            }
            Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
        }
    }

    /// Returns the next state for the given input byte, or `fail_id()` when
    /// no transition for `input` exists.
    fn next_state(&self, input: u8) -> S {
        match *self {
            Transitions::Sparse(ref sparse) => {
                for &(b, id) in sparse {
                    if b == input {
                        return id;
                    }
                }
                fail_id()
            }
            Transitions::Dense(ref dense) => dense[input],
        }
    }

    /// Sets the transition for `input` to `next`, overwriting any existing
    /// transition. In the sparse representation, entries are kept sorted
    /// by byte.
    fn set_next_state(&mut self, input: u8, next: S) {
        match *self {
            Transitions::Sparse(ref mut sparse) => {
                match sparse.binary_search_by_key(&input, |&(b, _)| b) {
                    Ok(i) => sparse[i] = (input, next),
                    Err(i) => sparse.insert(i, (input, next)),
                }
            }
            Transitions::Dense(ref mut dense) => {
                dense[input] = next;
            }
        }
    }

    /// Iterate over transitions in this state while skipping over transitions
    /// to `fail_id()`.
    fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
        match *self {
            Transitions::Sparse(ref sparse) => {
                // Sparse representations never store fail transitions, so
                // nothing needs to be skipped here.
                for &(b, id) in sparse {
                    f(b, id);
                }
            }
            Transitions::Dense(ref dense) => {
                for b in AllBytesIter::new() {
                    let id = dense[b];
                    if id != fail_id() {
                        f(b, id);
                    }
                }
            }
        }
    }

    /// Iterate over all transitions in this state according to the given
    /// equivalence classes, including transitions to `fail_id()`.
    fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
        if classes.is_singleton() {
            // Every byte is its own class, so yield every transition.
            match *self {
                Transitions::Sparse(ref sparse) => {
                    sparse_iter(sparse, f);
                }
                Transitions::Dense(ref dense) => {
                    for b in AllBytesIter::new() {
                        f(b, dense[b]);
                    }
                }
            }
        } else {
            // In this case, we only want to yield a single byte for each
            // equivalence class.
            match *self {
                Transitions::Sparse(ref sparse) => {
                    // Track the class of the previously yielded byte so that
                    // only the first byte of each class is reported.
                    let mut last_class = None;
                    sparse_iter(sparse, |b, next| {
                        let class = classes.get(b);
                        if last_class != Some(class) {
                            last_class = Some(class);
                            f(b, next);
                        }
                    })
                }
                Transitions::Dense(ref dense) => {
                    for b in classes.representatives() {
                        f(b, dense[b]);
                    }
                }
            }
        }
    }
}
479 
/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
///
/// This abstracts over the representation of NFA transitions, which may be
/// either in a sparse or dense representation.
///
/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID> {
    /// The NFA being iterated over (and possibly mutated by the caller).
    nfa: &'a mut NFA<S>,
    /// The state whose transitions are being iterated.
    state_id: S,
    /// The current position: an index into the sparse list, or a byte value
    /// for the dense representation.
    cur: usize,
}
494 
impl<'a, S: StateID> IterTransitionsMut<'a, S> {
    /// Creates a new transition iterator over the given state.
    fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
        IterTransitionsMut { nfa, state_id, cur: 0 }
    }

    /// Returns a mutable borrow of the underlying NFA, permitting mutation
    /// during iteration.
    fn nfa(&mut self) -> &mut NFA<S> {
        self.nfa
    }
}
504 
impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
    type Item = (u8, S);

    fn next(&mut self) -> Option<(u8, S)> {
        match self.nfa.states[self.state_id.to_usize()].trans {
            Transitions::Sparse(ref sparse) => {
                if self.cur >= sparse.len() {
                    return None;
                }
                let i = self.cur;
                self.cur += 1;
                Some(sparse[i])
            }
            Transitions::Dense(ref dense) => {
                while self.cur < dense.len() {
                    // There are always exactly 256 transitions in the dense
                    // representation, so `cur` is a valid byte value here.
                    debug_assert!(self.cur < 256);

                    let b = self.cur as u8;
                    let id = dense[b];
                    self.cur += 1;
                    // Skip explicit fail transitions in the dense table.
                    if id != fail_id() {
                        return Some((b, id));
                    }
                }
                None
            }
        }
    }
}
535 
/// A simple builder for configuring the NFA construction of Aho-Corasick.
#[derive(Clone, Debug)]
pub struct Builder {
    /// States at depth less than or equal to this use the dense transition
    /// representation.
    dense_depth: usize,
    /// The match semantics to use for the automaton.
    match_kind: MatchKind,
    /// Whether to build a prefilter from the patterns.
    prefilter: bool,
    /// Whether to anchor all matches to the start of input.
    anchored: bool,
    /// Whether to match ASCII letters case insensitively.
    ascii_case_insensitive: bool,
}
545 
impl Default for Builder {
    /// The default configuration: dense transitions for depths 0-2, default
    /// match semantics, prefilter enabled, unanchored, case sensitive.
    fn default() -> Builder {
        Builder {
            dense_depth: 2,
            match_kind: MatchKind::default(),
            prefilter: true,
            anchored: false,
            ascii_case_insensitive: false,
        }
    }
}
557 
impl Builder {
    /// Creates a new builder with the default configuration.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Compiles the given patterns into an Aho-Corasick NFA using this
    /// builder's configuration.
    pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Compiler::new(self)?.compile(patterns)
    }

    /// Sets the match semantics (e.g., standard, leftmost-first or
    /// leftmost-longest) for the automaton.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Sets the depth up to which states use the dense (faster, larger)
    /// transition representation.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enables or disables building a prefilter from the patterns.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }

    /// Enables or disables anchoring all matches to the start of input.
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
        self.anchored = yes;
        self
    }

    /// Enables or disables ASCII case insensitive matching.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }
}
596 
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks.
#[derive(Debug)]
struct Compiler<'a, S: StateID> {
    /// The configuration to compile with.
    builder: &'a Builder,
    /// Accumulates patterns for building a prefilter.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA<S>,
    /// Accumulates byte equivalence classes while visiting patterns.
    byte_classes: ByteClassBuilder,
}
607 
608 impl<'a, S: StateID> Compiler<'a, S> {
    /// Creates a new compiler from the given builder configuration.
    ///
    /// This can fail if the state identifier type `S` is too small to
    /// represent even the initial state identifiers.
    fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
        Ok(Compiler {
            builder,
            prefilter: prefilter::Builder::new(builder.match_kind)
                .ascii_case_insensitive(builder.ascii_case_insensitive),
            nfa: NFA {
                match_kind: builder.match_kind,
                // Index 2 is the start state; indices 0 and 1 are reserved
                // for the fail and dead states, respectively.
                start_id: usize_to_state_id(2)?,
                max_pattern_len: 0,
                pattern_count: 0,
                heap_bytes: 0,
                prefilter: None,
                anchored: builder.anchored,
                byte_classes: ByteClasses::singletons(),
                states: vec![],
            },
            byte_classes: ByteClassBuilder::new(),
        })
    }
628 
    /// Compiles the given patterns into the final NFA.
    ///
    /// NOTE(review): the phases below appear to be order-sensitive (e.g.,
    /// failure transitions are filled in before the start state loop is
    /// closed) — preserve this ordering when modifying.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        self.add_state(0)?; // the fail state, which is never entered
        self.add_state(0)?; // the dead state, only used for leftmost
        self.add_state(0)?; // the start state
        self.build_trie(patterns)?;
        self.add_start_state_loop();
        self.add_dead_state_loop();
        // Anchored automatons have no failure transitions at all.
        if !self.builder.anchored {
            self.fill_failure_transitions();
        }
        self.close_start_state_loop();
        self.nfa.byte_classes = self.byte_classes.build();
        // A prefilter is only built for unanchored searches.
        if !self.builder.anchored {
            self.nfa.prefilter = self.prefilter.build();
        }
        self.calculate_size();
        Ok(self.nfa)
    }
651 
    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    ///
    /// This can fail if a new state cannot be represented by the state
    /// identifier type `S`.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
            let pat = pat.as_ref();
            self.nfa.max_pattern_len =
                cmp::max(self.nfa.max_pattern_len, pat.len());
            self.nfa.pattern_count += 1;

            let mut prev = self.nfa.start_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.state(prev).is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. We don't use these
                // for NFA construction. These are instead used only if we're
                // building a DFA. They would technically be useful for the
                // NFA, but it would require a second pass over the patterns.
                self.byte_classes.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byte_classes.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.state(prev).next_state(b);
                if next != fail_id() {
                    prev = next;
                } else {
                    let next = self.add_state(depth + 1)?;
                    self.nfa.state_mut(prev).set_next_state(b, next);
                    if self.builder.ascii_case_insensitive {
                        // Wire up the opposite-case byte to the same state so
                        // both cases share one path through the trie.
                        let b = opposite_ascii_case(b);
                        self.nfa.state_mut(prev).set_next_state(b, next);
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.state_mut(prev).add_match(pati, pat.len());
            // ... and hand it to the prefilter builder, if applicable.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }
        }
        Ok(())
    }
725 
726     /// This routine creates failure transitions according to the standard
727     /// textbook formulation of the Aho-Corasick algorithm, with a couple small
728     /// tweaks to support "leftmost" semantics.
729     ///
730     /// Building failure transitions is the most interesting part of building
731     /// the Aho-Corasick automaton, because they are what allow searches to
732     /// be performed in linear time. Specifically, a failure transition is
733     /// a single transition associated with each state that points back to
734     /// the longest proper suffix of the pattern being searched. The failure
735     /// transition is followed whenever there exists no transition on the
736     /// current state for the current input byte. If there is no other proper
737     /// suffix, then the failure transition points back to the starting state.
738     ///
739     /// For example, let's say we built an Aho-Corasick automaton with the
740     /// following patterns: 'abcd' and 'cef'. The trie looks like this:
741     ///
742     /// ```ignore
743     ///          a - S1 - b - S2 - c - S3 - d - S4*
744     ///         /
745     ///     S0 - c - S5 - e - S6 - f - S7*
746     /// ```
747     ///
748     /// At this point, it should be fairly straight-forward to see how this
749     /// trie can be used in a simplistic way. At any given position in the
750     /// text we're searching (called the "subject" string), all we need to do
751     /// is follow the transitions in the trie by consuming one transition for
752     /// each byte in the subject string. If we reach a match state, then we can
753     /// report that location as a match.
754     ///
755     /// The trick comes when searching a subject string like 'abcef'. We'll
756     /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
758     /// S3 has no transition for 'e', so the search fails. We then would need
759     /// to restart the search at the next position in 'abcef', which
760     /// corresponds to 'b'. The match would fail, but the next search starting
761     /// at 'c' would finally succeed. The problem with this approach is that
762     /// we wind up searching the subject string potentially many times. In
763     /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
764     /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
765     /// like to achieve a `O(n + m)` worst case complexity.
766     ///
767     /// This is where failure transitions come in. Instead of dying at S3 in
768     /// the first search, the automaton can instruct the search to move to
769     /// another part of the automaton that corresponds to a suffix of what
770     /// we've seen so far. Recall that we've seen 'abc' in the subject string,
771     /// and the automaton does indeed have a non-empty suffix, 'c', that could
772     /// potentially lead to another match. Thus, the actual Aho-Corasick
773     /// automaton for our patterns in this case looks like this:
774     ///
775     /// ```ignore
776     ///          a - S1 - b - S2 - c - S3 - d - S4*
777     ///         /                      /
778     ///        /       ----------------
779     ///       /       /
780     ///     S0 - c - S5 - e - S6 - f - S7*
781     /// ```
782     ///
783     /// That is, we have a failure transition from S3 to S5, which is followed
784     /// exactly in cases when we are in state S3 but see any byte other than
785     /// 'd' (that is, we've "failed" to find a match in this portion of our
786     /// trie). We know we can transition back to S5 because we've already seen
787     /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
788     /// with the search starting at S5 and complete our match.
789     ///
790     /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
792     /// need to follow. For example, look at the trie for the patterns
793     /// 'abcd', 'b', 'bcd' and 'cd':
794     ///
795     /// ```ignore
796     ///          - a - S1 - b - S2* - c - S3 - d - S4*
797     ///         /               /         /
798     ///        /         -------   -------
799     ///       /         /         /
800     ///     S0 --- b - S5* - c - S6 - d - S7*
801     ///       \                  /
802     ///        \         --------
803     ///         \       /
804     ///          - c - S8 - d - S9*
805     /// ```
806     ///
807     /// The failure transitions for this trie are defined from S2 to S5,
808     /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
809     /// corresponds to a match, since its failure transition to S5 is itself
810     /// a match state.
811     ///
    /// Perhaps the simplest way to think about adding these failure transitions
813     /// is recursively. That is, if you know the failure transitions for every
814     /// possible previous state that could be visited (e.g., when computing the
815     /// failure transition for S3, you already know the failure transitions
816     /// for S0, S1 and S2), then you can simply follow the failure transition
817     /// of the previous state and check whether the incoming transition is
818     /// defined after following the failure transition.
819     ///
820     /// For example, when determining the failure state for S3, by our
821     /// assumptions, we already know that there is a failure transition from
822     /// S2 (the previous state) to S5. So we follow that transition and check
823     /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
824     /// as there is a transition from S5 to S6 for the byte 'c'. If no such
825     /// transition existed, we could keep following the failure transitions
826     /// until we reach the start state, which is the failure transition for
827     /// every state that has no corresponding proper suffix.
828     ///
829     /// We don't actually use recursion to implement this, but instead, use a
830     /// breadth first search of the automaton. Our base case is the start
831     /// state, whose failure transition is just a transition to itself.
832     ///
833     /// When building a leftmost automaton, we proceed as above, but only
834     /// include a subset of failure transitions. Namely, we omit any failure
835     /// transitions that appear after a match state in the trie. This is
836     /// because failure transitions always point back to a proper suffix of
837     /// what has been seen so far. Thus, following a failure transition after
838     /// a match implies looking for a match that starts after the one that has
839     /// already been seen, which is of course therefore not the leftmost match.
840     ///
841     /// N.B. I came up with this algorithm on my own, and after scouring all of
842     /// the other AC implementations I know of (Perl, Snort, many on GitHub).
843     /// I couldn't find any that implement leftmost semantics like this.
844     /// Perl of course needs leftmost-first semantics, but they implement it
845     /// with a seeming hack at *search* time instead of encoding it into the
846     /// automaton. There are also a couple Java libraries that support leftmost
847     /// longest semantics, but they do it by building a queue of matches at
848     /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) {
        let kind = self.match_kind();
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id);
        while let Some((_, next)) = it.next() {
            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if next == it.nfa().start_id || seen.contains(next) {
                continue;
            }
            queue.push_back(next);
            seen.insert(next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if kind.is_leftmost() && it.nfa().state(next).is_match() {
                it.nfa().state_mut(next).fail = dead_id();
            }
        }
        // Process states in breadth first order. This guarantees that by the
        // time we compute the failure transition for `next` below, the
        // failure transition of every shallower state (in particular, of the
        // predecessor `id`) has already been filled in.
        while let Some(id) = queue.pop_front() {
            let mut it = self.nfa.iter_transitions_mut(id);
            while let Some((b, next)) = it.next() {
                if seen.contains(next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if kind.is_leftmost() && it.nfa().state(next).is_match() {
                    it.nfa().state_mut(next).fail = dead_id();
                    continue;
                }
                // Standard failure computation: walk the failure chain of the
                // predecessor state until we reach a state with a transition
                // defined on byte `b`. This loop terminates because the start
                // state has a transition defined for every byte (see
                // `add_start_state_loop`), so `next_state` cannot return
                // `fail_id` once we arrive there.
                let mut fail = it.nfa().state(id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);
                it.nfa().state_mut(next).fail = fail;
                // The failure state corresponds to a proper suffix of the
                // path leading to `next`, so any match reported by the
                // failure state must also be reported by `next`.
                it.nfa().copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !kind.is_leftmost() {
                it.nfa().copy_empty_matches(id);
            }
        }
    }
940 
941     /// Returns a set that tracked queued states.
942     ///
943     /// This is only necessary when ASCII case insensitivity is enabled, since
944     /// it is the only way to visit the same state twice. Otherwise, this
945     /// returns an inert set that nevers adds anything and always reports
946     /// `false` for every member test.
queued_set(&self) -> QueuedSet<S>947     fn queued_set(&self) -> QueuedSet<S> {
948         if self.builder.ascii_case_insensitive {
949             QueuedSet::active()
950         } else {
951             QueuedSet::inert()
952         }
953     }
954 
955     /// Set the failure transitions on the start state to loop back to the
956     /// start state. This effectively permits the Aho-Corasick automaton to
957     /// match at any position. This is also required for finding the next
958     /// state to terminate, namely, finding the next state should never return
959     /// a fail_id.
960     ///
961     /// This must be done after building the initial trie, since trie
962     /// construction depends on transitions to `fail_id` to determine whether a
963     /// state already exists or not.
add_start_state_loop(&mut self)964     fn add_start_state_loop(&mut self) {
965         let start_id = self.nfa.start_id;
966         let start = self.nfa.start_mut();
967         for b in AllBytesIter::new() {
968             if start.next_state(b) == fail_id() {
969                 start.set_next_state(b, start_id);
970             }
971         }
972     }
973 
974     /// Remove the start state loop by rewriting any transitions on the start
975     /// state back to the start state with transitions to the dead state.
976     ///
977     /// The loop is only closed when two conditions are met: the start state
978     /// is a match state and the match kind is leftmost-first or
979     /// leftmost-longest. (Alternatively, if this is an anchored automaton,
980     /// then the start state is always closed, regardless of aforementioned
981     /// conditions.)
982     ///
983     /// The reason for this is that under leftmost semantics, a start state
984     /// that is also a match implies that we should never restart the search
985     /// process. We allow normal transitions out of the start state, but if
986     /// none exist, we transition to the dead state, which signals that
987     /// searching should stop.
close_start_state_loop(&mut self)988     fn close_start_state_loop(&mut self) {
989         if self.builder.anchored
990             || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
991         {
992             let start_id = self.nfa.start_id;
993             let start = self.nfa.start_mut();
994             for b in AllBytesIter::new() {
995                 if start.next_state(b) == start_id {
996                     start.set_next_state(b, dead_id());
997                 }
998             }
999         }
1000     }
1001 
1002     /// Sets all transitions on the dead state to point back to the dead state.
1003     /// Normally, missing transitions map back to the failure state, but the
1004     /// point of the dead state is to act as a sink that can never be escaped.
add_dead_state_loop(&mut self)1005     fn add_dead_state_loop(&mut self) {
1006         let dead = self.nfa.state_mut(dead_id());
1007         for b in AllBytesIter::new() {
1008             dead.set_next_state(b, dead_id());
1009         }
1010     }
1011 
1012     /// Computes the total amount of heap used by this NFA in bytes.
calculate_size(&mut self)1013     fn calculate_size(&mut self) {
1014         let mut size = 0;
1015         for state in &self.nfa.states {
1016             size += size_of::<State<S>>() + state.heap_bytes();
1017         }
1018         self.nfa.heap_bytes = size;
1019     }
1020 
1021     /// Add a new state to the underlying NFA with the given depth. The depth
1022     /// is used to determine how to represent the transitions.
1023     ///
1024     /// If adding the new state would overflow the chosen state ID
1025     /// representation, then this returns an error.
add_state(&mut self, depth: usize) -> Result<S>1026     fn add_state(&mut self, depth: usize) -> Result<S> {
1027         if depth < self.builder.dense_depth {
1028             self.nfa.add_dense_state(depth)
1029         } else {
1030             self.nfa.add_sparse_state(depth)
1031         }
1032     }
1033 
1034     /// Returns the match kind configured on the underlying builder.
match_kind(&self) -> MatchKind1035     fn match_kind(&self) -> MatchKind {
1036         self.builder.match_kind
1037     }
1038 }
1039 
1040 /// A set of state identifiers used to avoid revisiting the same state multiple
1041 /// times when filling in failure transitions.
1042 ///
1043 /// This set has an "inert" and an "active" mode. When inert, the set never
1044 /// stores anything and always returns `false` for every member test. This is
1045 /// useful to avoid the performance and memory overhead of maintaining this
1046 /// set when it is not needed.
1047 #[derive(Debug)]
1048 struct QueuedSet<S> {
1049     set: Option<BTreeSet<S>>,
1050 }
1051 
1052 impl<S: StateID> QueuedSet<S> {
1053     /// Return an inert set that returns `false` for every state ID membership
1054     /// test.
inert() -> QueuedSet<S>1055     fn inert() -> QueuedSet<S> {
1056         QueuedSet { set: None }
1057     }
1058 
1059     /// Return an active set that tracks state ID membership.
active() -> QueuedSet<S>1060     fn active() -> QueuedSet<S> {
1061         QueuedSet { set: Some(BTreeSet::new()) }
1062     }
1063 
1064     /// Inserts the given state ID into this set. (If the set is inert, then
1065     /// this is a no-op.)
insert(&mut self, state_id: S)1066     fn insert(&mut self, state_id: S) {
1067         if let Some(ref mut set) = self.set {
1068             set.insert(state_id);
1069         }
1070     }
1071 
1072     /// Returns true if and only if the given state ID is in this set. If the
1073     /// set is inert, this always returns false.
contains(&self, state_id: S) -> bool1074     fn contains(&self, state_id: S) -> bool {
1075         match self.set {
1076             None => false,
1077             Some(ref set) => set.contains(&state_id),
1078         }
1079     }
1080 }
1081 
/// An iterator that yields every byte value, 0 through 255, in order.
///
/// This exists instead of `(0..256).map(|b| b as u8)` because it optimizes
/// better in debug builds, and instead of `0..=255` because inclusive range
/// syntax stabilized in Rust 1.26 while this crate targets Rust 1.24. Once
/// the MSRV reaches 1.26 or newer, this type can be retired.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // The counter is a u16 so it can represent 256 and signal exhaustion.
        if self.0 > 255 {
            return None;
        }
        let byte = self.0 as u8;
        self.0 += 1;
        Some(byte)
    }
}
1112 
impl<S: StateID> fmt::Debug for NFA<S> {
    // Render the automaton in a human-readable form: a header with the match
    // kind and prefilter, then one entry per state listing its (escaped byte
    // => target) transitions, pattern IDs of its matches, its failure state
    // and its depth.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "NFA(")?;
        writeln!(f, "match_kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter)?;
        writeln!(f, "{}", "-".repeat(79))?;
        for (id, s) in self.states.iter().enumerate() {
            let mut trans = vec![];
            s.trans.iter(|byte, next| {
                // The start state has a bunch of uninteresting transitions
                // back into itself. It's questionable to hide them since they
                // are critical to understanding the automaton, but they are
                // very noisy without better formatting for contiguous ranges
                // to the same state.
                if id == self.start_id.to_usize() && next == self.start_id {
                    return;
                }
                // Similarly, the dead state has a bunch of uninteresting
                // transitions too.
                if id == dead_id() {
                    return;
                }
                trans.push(format!("{} => {}", escape(byte), next.to_usize()));
            });
            writeln!(f, "{:04}: {}", id, trans.join(", "))?;

            // Only the pattern IDs are shown; the lengths stored alongside
            // them are omitted for brevity.
            let matches: Vec<String> = s
                .matches
                .iter()
                .map(|&(pattern_id, _)| pattern_id.to_string())
                .collect();
            writeln!(f, "  matches: {}", matches.join(", "))?;
            writeln!(f, "     fail: {}", s.fail.to_usize())?;
            writeln!(f, "    depth: {}", s.depth)?;
        }
        writeln!(f, "{}", "-".repeat(79))?;
        writeln!(f, ")")?;
        Ok(())
    }
}
1153 
1154 /// Iterate over all possible byte transitions given a sparse set.
sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F)1155 fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
1156     let mut byte = 0u16;
1157     for &(b, id) in trans {
1158         while byte < (b as u16) {
1159             f(byte as u8, fail_id());
1160             byte += 1;
1161         }
1162         f(b, id);
1163         byte += 1;
1164     }
1165     for b in byte..256 {
1166         f(b as u8, fail_id());
1167     }
1168 }
1169 
/// Safely return mutable borrows to positions `i` and `j` of the given
/// slice, in that order.
///
/// This works by splitting the slice at the larger index, so the two borrows
/// come from disjoint subslices and never alias.
///
/// # Panics
///
/// Panics if `i == j`, since two mutable borrows of the same element are not
/// allowed.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    let (lo, hi) = (cmp::min(i, j), cmp::max(i, j));
    let (head, tail) = xs.split_at_mut(hi);
    let (lo_ref, hi_ref) = (&mut head[lo], &mut tail[0]);
    // Restore the caller's requested order.
    if i < j {
        (lo_ref, hi_ref)
    } else {
        (hi_ref, lo_ref)
    }
}
1184 
/// Render the given byte in its escaped string form (e.g., a newline becomes
/// `\n` and non-printable bytes become `\xNN`).
fn escape(b: u8) -> String {
    use std::ascii;

    // escape_default only ever yields printable ASCII bytes, so converting
    // each one straight to a char is lossless.
    ascii::escape_default(b).map(char::from).collect()
}
1191 
#[cfg(test)]
mod tests {
    use super::*;

    // A manual "scratch" harness used during development: it builds an NFA
    // for a hand-picked set of patterns and prints its Debug representation
    // for visual inspection. The commented-out lines are alternate
    // configurations and pattern sets that were useful to inspect while
    // working on leftmost match semantics; they are kept for easy toggling.
    #[test]
    fn scratch() {
        let nfa: NFA<usize> = Builder::new()
            .dense_depth(0)
            // .match_kind(MatchKind::LeftmostShortest)
            // .match_kind(MatchKind::LeftmostLongest)
            .match_kind(MatchKind::LeftmostFirst)
            // .build(&["abcd", "ce", "b"])
            // .build(&["ab", "bc"])
            // .build(&["b", "bcd", "ce"])
            // .build(&["abc", "bx"])
            // .build(&["abc", "bd", "ab"])
            // .build(&["abcdefghi", "hz", "abcdefgh"])
            // .build(&["abcd", "bce", "b"])
            .build(&["abcdefg", "bcde", "bcdef"])
            .unwrap();
        println!("{:?}", nfa);
    }
}
1215