• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::cmp;
2 use std::collections::{BTreeSet, VecDeque};
3 use std::fmt;
4 use std::mem::size_of;
5 use std::ops::{Index, IndexMut};
6 
7 use crate::ahocorasick::MatchKind;
8 use crate::automaton::Automaton;
9 use crate::classes::{ByteClassBuilder, ByteClasses};
10 use crate::error::Result;
11 use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
12 use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
13 use crate::Match;
14 
/// An identifier for a single pattern. This is simply the index of the
/// pattern within the sequence of patterns provided by the caller.
pub type PatternID = usize;

/// The length, in bytes, of a single pattern.
pub type PatternLength = usize;
21 
22 /// An Aho-Corasick automaton, represented as an NFA.
23 ///
24 /// This is the classical formulation of Aho-Corasick, which involves building
25 /// up a prefix trie of a given set of patterns, and then wiring up failure
26 /// transitions between states in order to guarantee linear time matching. The
27 /// standard formulation is, technically, an NFA because of these failure
28 /// transitions. That is, one can see them as enabling the automaton to be in
29 /// multiple states at once. Indeed, during search, it is possible to check
30 /// the transitions on multiple states for a single input byte.
31 ///
32 /// This particular implementation not only supports the standard style of
33 /// matching, but also provides a mode for choosing leftmost-first or
34 /// leftmost-longest match semantics. When a leftmost mode is chosen, some
35 /// failure transitions that would otherwise be added are elided. See
36 /// the documentation of `MatchKind` for more details and examples on how the
37 /// match semantics may differ.
38 ///
39 /// If one wants a DFA, then it is necessary to first build an NFA and convert
40 /// it into a DFA. Note, however, that because we've constrained ourselves to
41 /// matching literal patterns, this does not need to use subset construction
42 /// for determinization. Instead, the DFA has at most a number of states
43 /// equivalent to the number of NFA states. The only real difference between
44 /// them is that all failure transitions are followed and pre-computed. This
45 /// uses much more memory, but also executes searches more quickly.
46 #[derive(Clone)]
47 pub struct NFA<S> {
48     /// The match semantics built into this NFA.
49     match_kind: MatchKind,
50     /// The start state id as an index into `states`.
51     start_id: S,
52     /// The length, in bytes, of the longest pattern in this automaton. This
53     /// information is useful for keeping correct buffer sizes when searching
54     /// on streams.
55     max_pattern_len: usize,
56     /// The total number of patterns added to this automaton, including
57     /// patterns that may never be matched.
58     pattern_count: usize,
59     /// The number of bytes of heap used by this NFA's transition table.
60     heap_bytes: usize,
61     /// A prefilter for quickly skipping to candidate matches, if pertinent.
62     prefilter: Option<PrefilterObj>,
63     /// Whether this automaton anchors all matches to the start of input.
64     anchored: bool,
65     /// A set of equivalence classes in terms of bytes. We compute this while
66     /// building the NFA, but don't use it in the NFA's states. Instead, we
67     /// use this for building the DFA. We store it on the NFA since it's easy
68     /// to compute while visiting the the patterns.
69     byte_classes: ByteClasses,
70     /// A set of states. Each state defines its own transitions, a fail
71     /// transition and a set of indices corresponding to matches.
72     ///
73     /// The first state is always the fail state, which is used only as a
74     /// sentinel. Namely, in the final NFA, no transition into the fail state
75     /// exists. (Well, they do, but they aren't followed. Instead, the state's
76     /// failure transition is followed.)
77     ///
78     /// The second state (index 1) is always the dead state. Dead states are
79     /// in every automaton, but only used when leftmost-{first,longest} match
80     /// semantics are enabled. Specifically, they instruct search to stop
81     /// at specific points in order to report the correct match location. In
82     /// the standard Aho-Corasick construction, there are no transitions to
83     /// the dead state.
84     ///
85     /// The third state (index 2) is generally intended to be the starting or
86     /// "root" state.
87     states: Vec<State<S>>,
88 }
89 
90 impl<S: StateID> NFA<S> {
91     /// Returns the equivalence classes of bytes found while constructing
92     /// this NFA.
93     ///
94     /// Note that the NFA doesn't actually make use of these equivalence
95     /// classes. Instead, these are useful for building the DFA when desired.
byte_classes(&self) -> &ByteClasses96     pub fn byte_classes(&self) -> &ByteClasses {
97         &self.byte_classes
98     }
99 
100     /// Returns a prefilter, if one exists.
prefilter_obj(&self) -> Option<&PrefilterObj>101     pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
102         self.prefilter.as_ref()
103     }
104 
105     /// Returns the total number of heap bytes used by this NFA's transition
106     /// table.
heap_bytes(&self) -> usize107     pub fn heap_bytes(&self) -> usize {
108         self.heap_bytes
109             + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
110     }
111 
112     /// Return the length of the longest pattern in this automaton.
max_pattern_len(&self) -> usize113     pub fn max_pattern_len(&self) -> usize {
114         self.max_pattern_len
115     }
116 
117     /// Return the total number of patterns added to this automaton.
pattern_count(&self) -> usize118     pub fn pattern_count(&self) -> usize {
119         self.pattern_count
120     }
121 
122     /// Returns the total number of states in this NFA.
state_len(&self) -> usize123     pub fn state_len(&self) -> usize {
124         self.states.len()
125     }
126 
127     /// Returns the matches for the given state.
matches(&self, id: S) -> &[(PatternID, PatternLength)]128     pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
129         &self.states[id.to_usize()].matches
130     }
131 
132     /// Returns an iterator over all transitions in the given state according
133     /// to the given equivalence classes, including transitions to `fail_id()`.
134     /// The number of transitions returned is always equivalent to the number
135     /// of equivalence classes.
iter_all_transitions<F: FnMut(u8, S)>( &self, byte_classes: &ByteClasses, id: S, f: F, )136     pub fn iter_all_transitions<F: FnMut(u8, S)>(
137         &self,
138         byte_classes: &ByteClasses,
139         id: S,
140         f: F,
141     ) {
142         self.states[id.to_usize()].trans.iter_all(byte_classes, f);
143     }
144 
145     /// Returns the failure transition for the given state.
failure_transition(&self, id: S) -> S146     pub fn failure_transition(&self, id: S) -> S {
147         self.states[id.to_usize()].fail
148     }
149 
150     /// Returns the next state for the given state and input byte.
151     ///
152     /// Note that this does not follow failure transitions. As such, the id
153     /// returned may be `fail_id`.
next_state(&self, current: S, input: u8) -> S154     pub fn next_state(&self, current: S, input: u8) -> S {
155         self.states[current.to_usize()].next_state(input)
156     }
157 
state(&self, id: S) -> &State<S>158     fn state(&self, id: S) -> &State<S> {
159         &self.states[id.to_usize()]
160     }
161 
state_mut(&mut self, id: S) -> &mut State<S>162     fn state_mut(&mut self, id: S) -> &mut State<S> {
163         &mut self.states[id.to_usize()]
164     }
165 
start(&self) -> &State<S>166     fn start(&self) -> &State<S> {
167         self.state(self.start_id)
168     }
169 
start_mut(&mut self) -> &mut State<S>170     fn start_mut(&mut self) -> &mut State<S> {
171         let id = self.start_id;
172         self.state_mut(id)
173     }
174 
iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S>175     fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
176         IterTransitionsMut::new(self, id)
177     }
178 
copy_matches(&mut self, src: S, dst: S)179     fn copy_matches(&mut self, src: S, dst: S) {
180         let (src, dst) =
181             get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
182         dst.matches.extend_from_slice(&src.matches);
183     }
184 
copy_empty_matches(&mut self, dst: S)185     fn copy_empty_matches(&mut self, dst: S) {
186         let start_id = self.start_id;
187         self.copy_matches(start_id, dst);
188     }
189 
add_dense_state(&mut self, depth: usize) -> Result<S>190     fn add_dense_state(&mut self, depth: usize) -> Result<S> {
191         let trans = Transitions::Dense(Dense::new());
192         let id = usize_to_state_id(self.states.len())?;
193         self.states.push(State {
194             trans,
195             // Anchored automatons do not have any failure transitions.
196             fail: if self.anchored { dead_id() } else { self.start_id },
197             depth,
198             matches: vec![],
199         });
200         Ok(id)
201     }
202 
add_sparse_state(&mut self, depth: usize) -> Result<S>203     fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
204         let trans = Transitions::Sparse(vec![]);
205         let id = usize_to_state_id(self.states.len())?;
206         self.states.push(State {
207             trans,
208             // Anchored automatons do not have any failure transitions.
209             fail: if self.anchored { dead_id() } else { self.start_id },
210             depth,
211             matches: vec![],
212         });
213         Ok(id)
214     }
215 }
216 
217 impl<S: StateID> Automaton for NFA<S> {
218     type ID = S;
219 
match_kind(&self) -> &MatchKind220     fn match_kind(&self) -> &MatchKind {
221         &self.match_kind
222     }
223 
anchored(&self) -> bool224     fn anchored(&self) -> bool {
225         self.anchored
226     }
227 
prefilter(&self) -> Option<&dyn Prefilter>228     fn prefilter(&self) -> Option<&dyn Prefilter> {
229         self.prefilter.as_ref().map(|p| p.as_ref())
230     }
231 
start_state(&self) -> S232     fn start_state(&self) -> S {
233         self.start_id
234     }
235 
is_valid(&self, id: S) -> bool236     fn is_valid(&self, id: S) -> bool {
237         id.to_usize() < self.states.len()
238     }
239 
is_match_state(&self, id: S) -> bool240     fn is_match_state(&self, id: S) -> bool {
241         self.states[id.to_usize()].is_match()
242     }
243 
get_match( &self, id: S, match_index: usize, end: usize, ) -> Option<Match>244     fn get_match(
245         &self,
246         id: S,
247         match_index: usize,
248         end: usize,
249     ) -> Option<Match> {
250         let state = match self.states.get(id.to_usize()) {
251             None => return None,
252             Some(state) => state,
253         };
254         state.matches.get(match_index).map(|&(id, len)| Match {
255             pattern: id,
256             len,
257             end,
258         })
259     }
260 
match_count(&self, id: S) -> usize261     fn match_count(&self, id: S) -> usize {
262         self.states[id.to_usize()].matches.len()
263     }
264 
next_state(&self, mut current: S, input: u8) -> S265     fn next_state(&self, mut current: S, input: u8) -> S {
266         // This terminates since:
267         //
268         // 1. `State.fail` never points to fail_id().
269         // 2. All `State.fail` values point to a state closer to `start`.
270         // 3. The start state has no transitions to fail_id().
271         loop {
272             let state = &self.states[current.to_usize()];
273             let next = state.next_state(input);
274             if next != fail_id() {
275                 return next;
276             }
277             current = state.fail;
278         }
279     }
280 }
281 
282 /// A representation of an NFA state for an Aho-Corasick automaton.
283 ///
284 /// It contains the transitions to the next state, a failure transition for
285 /// cases where there exists no other transition for the current input byte,
286 /// the matches implied by visiting this state (if any) and the depth of this
287 /// state. The depth of a state is simply the distance from it to the start
288 /// state in the automaton, where the depth of the start state is 0.
289 #[derive(Clone, Debug)]
290 pub struct State<S> {
291     trans: Transitions<S>,
292     fail: S,
293     matches: Vec<(PatternID, PatternLength)>,
294     // TODO: Strictly speaking, this isn't needed for searching. It's only
295     // used when building an NFA that supports leftmost match semantics. We
296     // could drop this from the state and dynamically build a map only when
297     // computing failure transitions, but it's not clear which is better.
298     // Benchmark this.
299     depth: usize,
300 }
301 
302 impl<S: StateID> State<S> {
heap_bytes(&self) -> usize303     fn heap_bytes(&self) -> usize {
304         self.trans.heap_bytes()
305             + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
306     }
307 
add_match(&mut self, i: PatternID, len: PatternLength)308     fn add_match(&mut self, i: PatternID, len: PatternLength) {
309         self.matches.push((i, len));
310     }
311 
is_match(&self) -> bool312     fn is_match(&self) -> bool {
313         !self.matches.is_empty()
314     }
315 
get_longest_match_len(&self) -> Option<usize>316     fn get_longest_match_len(&self) -> Option<usize> {
317         // Why is this true? Because the first match in any matching state
318         // will always correspond to the match added to it during trie
319         // construction (since when we copy matches due to failure transitions,
320         // we always append them). Therefore, it follows that the first match
321         // must always be longest since any subsequent match must be from a
322         // failure transition, and a failure transition by construction points
323         // to a proper suffix. A proper suffix is, by definition, smaller.
324         self.matches.get(0).map(|&(_, len)| len)
325     }
326 
next_state(&self, input: u8) -> S327     fn next_state(&self, input: u8) -> S {
328         self.trans.next_state(input)
329     }
330 
set_next_state(&mut self, input: u8, next: S)331     fn set_next_state(&mut self, input: u8, next: S) {
332         self.trans.set_next_state(input, next);
333     }
334 }
335 
/// The transitions of a single dense state.
///
/// The main purpose of this wrapper is to encapsulate index access: a dense
/// representation always holds exactly 256 entries, so every `u8` value is
/// a valid index.
#[derive(Clone, Debug)]
struct Dense<S>(Vec<S>);
343 
344 impl<S> Dense<S>
345 where
346     S: StateID,
347 {
new() -> Self348     fn new() -> Self {
349         Dense(vec![fail_id(); 256])
350     }
351 
352     #[inline]
len(&self) -> usize353     fn len(&self) -> usize {
354         self.0.len()
355     }
356 }
357 
358 impl<S> Index<u8> for Dense<S> {
359     type Output = S;
360 
361     #[inline]
index(&self, i: u8) -> &S362     fn index(&self, i: u8) -> &S {
363         // SAFETY: This is safe because all dense transitions have
364         // exactly 256 elements, so all u8 values are valid indices.
365         &self.0[i as usize]
366     }
367 }
368 
369 impl<S> IndexMut<u8> for Dense<S> {
370     #[inline]
index_mut(&mut self, i: u8) -> &mut S371     fn index_mut(&mut self, i: u8) -> &mut S {
372         // SAFETY: This is safe because all dense transitions have
373         // exactly 256 elements, so all u8 values are valid indices.
374         &mut self.0[i as usize]
375     }
376 }
377 
378 /// A representation of transitions in an NFA.
379 ///
380 /// Transitions have either a sparse representation, which is slower for
381 /// lookups but uses less memory, or a dense representation, which is faster
382 /// for lookups but uses more memory. In the sparse representation, the absence
383 /// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
384 /// are still explicitly represented.
385 ///
386 /// For the NFA, by default, we use a dense representation for transitions for
387 /// states close to the start state because it's likely these are the states
388 /// that will be most frequently visited.
389 #[derive(Clone, Debug)]
390 enum Transitions<S> {
391     Sparse(Vec<(u8, S)>),
392     Dense(Dense<S>),
393 }
394 
395 impl<S: StateID> Transitions<S> {
heap_bytes(&self) -> usize396     fn heap_bytes(&self) -> usize {
397         match *self {
398             Transitions::Sparse(ref sparse) => {
399                 sparse.len() * size_of::<(u8, S)>()
400             }
401             Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
402         }
403     }
404 
next_state(&self, input: u8) -> S405     fn next_state(&self, input: u8) -> S {
406         match *self {
407             Transitions::Sparse(ref sparse) => {
408                 for &(b, id) in sparse {
409                     if b == input {
410                         return id;
411                     }
412                 }
413                 fail_id()
414             }
415             Transitions::Dense(ref dense) => dense[input],
416         }
417     }
418 
set_next_state(&mut self, input: u8, next: S)419     fn set_next_state(&mut self, input: u8, next: S) {
420         match *self {
421             Transitions::Sparse(ref mut sparse) => {
422                 match sparse.binary_search_by_key(&input, |&(b, _)| b) {
423                     Ok(i) => sparse[i] = (input, next),
424                     Err(i) => sparse.insert(i, (input, next)),
425                 }
426             }
427             Transitions::Dense(ref mut dense) => {
428                 dense[input] = next;
429             }
430         }
431     }
432 
433     /// Iterate over transitions in this state while skipping over transitions
434     /// to `fail_id()`.
iter<F: FnMut(u8, S)>(&self, mut f: F)435     fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
436         match *self {
437             Transitions::Sparse(ref sparse) => {
438                 for &(b, id) in sparse {
439                     f(b, id);
440                 }
441             }
442             Transitions::Dense(ref dense) => {
443                 for b in AllBytesIter::new() {
444                     let id = dense[b];
445                     if id != fail_id() {
446                         f(b, id);
447                     }
448                 }
449             }
450         }
451     }
452 
453     /// Iterate over all transitions in this state according to the given
454     /// equivalence classes, including transitions to `fail_id()`.
iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F)455     fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
456         if classes.is_singleton() {
457             match *self {
458                 Transitions::Sparse(ref sparse) => {
459                     sparse_iter(sparse, f);
460                 }
461                 Transitions::Dense(ref dense) => {
462                     for b in AllBytesIter::new() {
463                         f(b, dense[b]);
464                     }
465                 }
466             }
467         } else {
468             // In this case, we only want to yield a single byte for each
469             // equivalence class.
470             match *self {
471                 Transitions::Sparse(ref sparse) => {
472                     let mut last_class = None;
473                     sparse_iter(sparse, |b, next| {
474                         let class = classes.get(b);
475                         if last_class != Some(class) {
476                             last_class = Some(class);
477                             f(b, next);
478                         }
479                     })
480                 }
481                 Transitions::Dense(ref dense) => {
482                     for b in classes.representatives() {
483                         f(b, dense[b]);
484                     }
485                 }
486             }
487         }
488     }
489 }
490 
491 /// Iterator over transitions in a state, skipping transitions to `fail_id()`.
492 ///
493 /// This abstracts over the representation of NFA transitions, which may be
494 /// either in a sparse or dense representation.
495 ///
496 /// This somewhat idiosyncratically borrows the NFA mutably, so that when one
497 /// is iterating over transitions, the caller can still mutate the NFA. This
498 /// is useful when creating failure transitions.
499 #[derive(Debug)]
500 struct IterTransitionsMut<'a, S: StateID> {
501     nfa: &'a mut NFA<S>,
502     state_id: S,
503     cur: usize,
504 }
505 
506 impl<'a, S: StateID> IterTransitionsMut<'a, S> {
new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S>507     fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
508         IterTransitionsMut { nfa, state_id, cur: 0 }
509     }
510 
nfa(&mut self) -> &mut NFA<S>511     fn nfa(&mut self) -> &mut NFA<S> {
512         self.nfa
513     }
514 }
515 
516 impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
517     type Item = (u8, S);
518 
next(&mut self) -> Option<(u8, S)>519     fn next(&mut self) -> Option<(u8, S)> {
520         match self.nfa.states[self.state_id.to_usize()].trans {
521             Transitions::Sparse(ref sparse) => {
522                 if self.cur >= sparse.len() {
523                     return None;
524                 }
525                 let i = self.cur;
526                 self.cur += 1;
527                 Some(sparse[i])
528             }
529             Transitions::Dense(ref dense) => {
530                 while self.cur < dense.len() {
531                     // There are always exactly 255 transitions in dense repr.
532                     debug_assert!(self.cur < 256);
533 
534                     let b = self.cur as u8;
535                     let id = dense[b];
536                     self.cur += 1;
537                     if id != fail_id() {
538                         return Some((b, id));
539                     }
540                 }
541                 None
542             }
543         }
544     }
545 }
546 
547 /// A simple builder for configuring the NFA construction of Aho-Corasick.
548 #[derive(Clone, Debug)]
549 pub struct Builder {
550     dense_depth: usize,
551     match_kind: MatchKind,
552     prefilter: bool,
553     anchored: bool,
554     ascii_case_insensitive: bool,
555 }
556 
557 impl Default for Builder {
default() -> Builder558     fn default() -> Builder {
559         Builder {
560             dense_depth: 2,
561             match_kind: MatchKind::default(),
562             prefilter: true,
563             anchored: false,
564             ascii_case_insensitive: false,
565         }
566     }
567 }
568 
569 impl Builder {
new() -> Builder570     pub fn new() -> Builder {
571         Builder::default()
572     }
573 
build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,574     pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
575     where
576         I: IntoIterator<Item = P>,
577         P: AsRef<[u8]>,
578     {
579         Compiler::new(self)?.compile(patterns)
580     }
581 
match_kind(&mut self, kind: MatchKind) -> &mut Builder582     pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
583         self.match_kind = kind;
584         self
585     }
586 
dense_depth(&mut self, depth: usize) -> &mut Builder587     pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
588         self.dense_depth = depth;
589         self
590     }
591 
prefilter(&mut self, yes: bool) -> &mut Builder592     pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
593         self.prefilter = yes;
594         self
595     }
596 
anchored(&mut self, yes: bool) -> &mut Builder597     pub fn anchored(&mut self, yes: bool) -> &mut Builder {
598         self.anchored = yes;
599         self
600     }
601 
ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder602     pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
603         self.ascii_case_insensitive = yes;
604         self
605     }
606 }
607 
608 /// A compiler uses a builder configuration and builds up the NFA formulation
609 /// of an Aho-Corasick automaton. This roughly corresponds to the standard
610 /// formulation described in textbooks.
611 #[derive(Debug)]
612 struct Compiler<'a, S: StateID> {
613     builder: &'a Builder,
614     prefilter: prefilter::Builder,
615     nfa: NFA<S>,
616     byte_classes: ByteClassBuilder,
617 }
618 
619 impl<'a, S: StateID> Compiler<'a, S> {
new(builder: &'a Builder) -> Result<Compiler<'a, S>>620     fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
621         Ok(Compiler {
622             builder,
623             prefilter: prefilter::Builder::new(builder.match_kind)
624                 .ascii_case_insensitive(builder.ascii_case_insensitive),
625             nfa: NFA {
626                 match_kind: builder.match_kind,
627                 start_id: usize_to_state_id(2)?,
628                 max_pattern_len: 0,
629                 pattern_count: 0,
630                 heap_bytes: 0,
631                 prefilter: None,
632                 anchored: builder.anchored,
633                 byte_classes: ByteClasses::singletons(),
634                 states: vec![],
635             },
636             byte_classes: ByteClassBuilder::new(),
637         })
638     }
639 
compile<I, P>(mut self, patterns: I) -> Result<NFA<S>> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,640     fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
641     where
642         I: IntoIterator<Item = P>,
643         P: AsRef<[u8]>,
644     {
645         self.add_state(0)?; // the fail state, which is never entered
646         self.add_state(0)?; // the dead state, only used for leftmost
647         self.add_state(0)?; // the start state
648         self.build_trie(patterns)?;
649         self.add_start_state_loop();
650         self.add_dead_state_loop();
651         if !self.builder.anchored {
652             if self.match_kind().is_leftmost() {
653                 self.fill_failure_transitions_leftmost();
654             } else {
655                 self.fill_failure_transitions_standard();
656             }
657         }
658         self.close_start_state_loop();
659         self.nfa.byte_classes = self.byte_classes.build();
660         if !self.builder.anchored {
661             self.nfa.prefilter = self.prefilter.build();
662         }
663         self.calculate_size();
664         Ok(self.nfa)
665     }
666 
667     /// This sets up the initial prefix trie that makes up the Aho-Corasick
668     /// automaton. Effectively, it creates the basic structure of the
669     /// automaton, where every pattern given has a path from the start state to
670     /// the end of the pattern.
build_trie<I, P>(&mut self, patterns: I) -> Result<()> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,671     fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
672     where
673         I: IntoIterator<Item = P>,
674         P: AsRef<[u8]>,
675     {
676         'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
677             let pat = pat.as_ref();
678             self.nfa.max_pattern_len =
679                 cmp::max(self.nfa.max_pattern_len, pat.len());
680             self.nfa.pattern_count += 1;
681 
682             let mut prev = self.nfa.start_id;
683             let mut saw_match = false;
684             for (depth, &b) in pat.iter().enumerate() {
685                 // When leftmost-first match semantics are requested, we
686                 // specifically stop adding patterns when a previously added
687                 // pattern is a prefix of it. We avoid adding it because
688                 // leftmost-first semantics imply that the pattern can never
689                 // match. This is not just an optimization to save space! It
690                 // is necessary for correctness. In fact, this is the only
691                 // difference in the automaton between the implementations for
692                 // leftmost-first and leftmost-longest.
693                 saw_match = saw_match || self.nfa.state(prev).is_match();
694                 if self.builder.match_kind.is_leftmost_first() && saw_match {
695                     // Skip to the next pattern immediately. This avoids
696                     // incorrectly adding a match after this loop terminates.
697                     continue 'PATTERNS;
698                 }
699 
700                 // Add this byte to our equivalence classes. We don't use these
701                 // for NFA construction. These are instead used only if we're
702                 // building a DFA. They would technically be useful for the
703                 // NFA, but it would require a second pass over the patterns.
704                 self.byte_classes.set_range(b, b);
705                 if self.builder.ascii_case_insensitive {
706                     let b = opposite_ascii_case(b);
707                     self.byte_classes.set_range(b, b);
708                 }
709 
710                 // If the transition from prev using the current byte already
711                 // exists, then just move through it. Otherwise, add a new
712                 // state. We track the depth here so that we can determine
713                 // how to represent transitions. States near the start state
714                 // use a dense representation that uses more memory but is
715                 // faster. Other states use a sparse representation that uses
716                 // less memory but is slower.
717                 let next = self.nfa.state(prev).next_state(b);
718                 if next != fail_id() {
719                     prev = next;
720                 } else {
721                     let next = self.add_state(depth + 1)?;
722                     self.nfa.state_mut(prev).set_next_state(b, next);
723                     if self.builder.ascii_case_insensitive {
724                         let b = opposite_ascii_case(b);
725                         self.nfa.state_mut(prev).set_next_state(b, next);
726                     }
727                     prev = next;
728                 }
729             }
730             // Once the pattern has been added, log the match in the final
731             // state that it reached.
732             self.nfa.state_mut(prev).add_match(pati, pat.len());
733             // ... and hand it to the prefilter builder, if applicable.
734             if self.builder.prefilter {
735                 self.prefilter.add(pat);
736             }
737         }
738         Ok(())
739     }
740 
741     /// This routine creates failure transitions according to the standard
742     /// textbook formulation of the Aho-Corasick algorithm.
743     ///
744     /// Building failure transitions is the most interesting part of building
745     /// the Aho-Corasick automaton, because they are what allow searches to
746     /// be performed in linear time. Specifically, a failure transition is
747     /// a single transition associated with each state that points back to
748     /// the longest proper suffix of the pattern being searched. The failure
749     /// transition is followed whenever there exists no transition on the
750     /// current state for the current input byte. If there is no other proper
751     /// suffix, then the failure transition points back to the starting state.
752     ///
753     /// For example, let's say we built an Aho-Corasick automaton with the
754     /// following patterns: 'abcd' and 'cef'. The trie looks like this:
755     ///
756     /// ```ignore
757     ///          a - S1 - b - S2 - c - S3 - d - S4*
758     ///         /
759     ///     S0 - c - S5 - e - S6 - f - S7*
760     /// ```
761     ///
762     /// At this point, it should be fairly straight-forward to see how this
763     /// trie can be used in a simplistic way. At any given position in the
764     /// text we're searching (called the "subject" string), all we need to do
765     /// is follow the transitions in the trie by consuming one transition for
766     /// each byte in the subject string. If we reach a match state, then we can
767     /// report that location as a match.
768     ///
769     /// The trick comes when searching a subject string like 'abcef'. We'll
770     /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
772     /// S3 has no transition for 'e', so the search fails. We then would need
773     /// to restart the search at the next position in 'abcef', which
774     /// corresponds to 'b'. The match would fail, but the next search starting
775     /// at 'c' would finally succeed. The problem with this approach is that
776     /// we wind up searching the subject string potentially many times. In
777     /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
778     /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
779     /// like to achieve a `O(n + m)` worst case complexity.
780     ///
781     /// This is where failure transitions come in. Instead of dying at S3 in
782     /// the first search, the automaton can instruct the search to move to
783     /// another part of the automaton that corresponds to a suffix of what
784     /// we've seen so far. Recall that we've seen 'abc' in the subject string,
785     /// and the automaton does indeed have a non-empty suffix, 'c', that could
786     /// potentially lead to another match. Thus, the actual Aho-Corasick
787     /// automaton for our patterns in this case looks like this:
788     ///
789     /// ```ignore
790     ///          a - S1 - b - S2 - c - S3 - d - S4*
791     ///         /                      /
792     ///        /       ----------------
793     ///       /       /
794     ///     S0 - c - S5 - e - S6 - f - S7*
795     /// ```
796     ///
797     /// That is, we have a failure transition from S3 to S5, which is followed
798     /// exactly in cases when we are in state S3 but see any byte other than
799     /// 'd' (that is, we've "failed" to find a match in this portion of our
800     /// trie). We know we can transition back to S5 because we've already seen
801     /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
802     /// with the search starting at S5 and complete our match.
803     ///
804     /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
806     /// need to follow. For example, look at the trie for the patterns
807     /// 'abcd', 'b', 'bcd' and 'cd':
808     ///
809     /// ```ignore
810     ///        - a - S1 - b - S2 - c - S3 - d - S4*
811     ///       /
812     ///     S0 - b - S5* - c - S6 - d - S7*
813     ///       \
814     ///        - c - S8 - d - S9*
815     /// ```
816     ///
817     /// The failure transitions for this trie are defined from S2 to S5,
818     /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
819     /// corresponds to a match, since its failure transition to S5 is itself
820     /// a match state.
821     ///
    /// Perhaps the simplest way to think about adding these failure transitions
823     /// is recursively. That is, if you know the failure transitions for every
824     /// possible previous state that could be visited (e.g., when computing the
825     /// failure transition for S3, you already know the failure transitions
826     /// for S0, S1 and S2), then you can simply follow the failure transition
827     /// of the previous state and check whether the incoming transition is
828     /// defined after following the failure transition.
829     ///
830     /// For example, when determining the failure state for S3, by our
831     /// assumptions, we already know that there is a failure transition from
832     /// S2 (the previous state) to S5. So we follow that transition and check
833     /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
834     /// as there is a transition from S5 to S6 for the byte 'c'. If no such
835     /// transition existed, we could keep following the failure transitions
836     /// until we reach the start state, which is the failure transition for
837     /// every state that has no corresponding proper suffix.
838     ///
839     /// We don't actually use recursion to implement this, but instead, use a
840     /// breadth first search of the automaton. Our base case is the start
841     /// state, whose failure transition is just a transition to itself.
    fn fill_failure_transitions_standard(&mut self) {
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        for b in AllBytesIter::new() {
            let next = self.nfa.start().next_state(b);
            if next != self.nfa.start_id {
                if !seen.contains(next) {
                    queue.push_back(next);
                    seen.insert(next);
                }
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut it = self.nfa.iter_transitions_mut(id);
            while let Some((b, next)) = it.next() {
                if seen.contains(next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next);

                // Compute `next`'s failure state: follow the parent's failure
                // transitions until we land on a state that has a transition
                // on `b`, then take that transition. The start state's self
                // loop (installed by `add_start_state_loop`) guarantees this
                // walk terminates.
                let mut fail = it.nfa().state(id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);
                it.nfa().state_mut(next).fail = fail;
                // The failure state corresponds to a proper suffix of the
                // path to `next`, so `next` must also report everything its
                // failure state matches.
                it.nfa().copy_matches(fail, next);
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            it.nfa().copy_empty_matches(id);
        }
    }
893 
894     /// This routine is just like fill_failure_transitions_standard, except
895     /// it adds failure transitions in a way that preserves leftmost match
896     /// semantics (for both leftmost-first and leftmost-longest).
897     ///
898     /// The algorithms are so similar that it would be possible to write it
899     /// generically. But doing so without overhead would require a bit of
900     /// ceremony, so we just copy it and add in the extra leftmost logic.
901     /// Moreover, the standard algorithm above is so simple that it feels like
902     /// a crime to disturb it.
903     ///
904     /// In effect, this proceeds just like the standard approach, but we
905     /// specifically add only a subset of all failure transitions. Namely, we
906     /// only add failure transitions that either do not occur after a match
907     /// or failure transitions that do occur after a match but preserve the
908     /// match. The comments in the implementation below should help.
909     ///
910     /// N.B. The only differences in the automaton between leftmost-first and
911     /// leftmost-longest are in trie construction. Otherwise, both have exactly
912     /// the same set of failure transitions. leftmost-longest adds everything
913     /// to the trie, where as leftmost-first skips any patterns for which there
914     /// exists a prefix of it that was added earlier.
915     ///
916     /// N.B. I came up with this algorithm on my own, and after scouring all of
917     /// the other AC implementations I know of (Perl, Snort, many on GitHub).
918     /// I couldn't find any that implement leftmost semantics like this.
919     /// Perl of course needs leftmost-first semantics, but they implement it
920     /// with a seeming hack at *search* time instead of encoding it into the
921     /// automaton. There are also a couple Java libraries that support leftmost
922     /// longest semantics, but they do it by building a queue of matches at
923     /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions_leftmost(&mut self) {
        /// Represents an item in our queue of states to process.
        ///
        /// Fundamentally, this queue serves the same purpose as the queue
        /// for filling failure transitions using the standard formulation.
        /// In the leftmost case, though, we need to track a bit more
        /// information. See comments below.
        #[derive(Clone, Copy, Debug)]
        struct QueuedState<S> {
            /// The id of the state to visit.
            id: S,
            /// The depth at which the first match was observed in the path
            /// to this state. Note that this corresponds to the depth at
            /// which the beginning of the match was detected. If no match
            /// has been seen, then this is None.
            match_at_depth: Option<usize>,
        }

        impl<S: StateID> QueuedState<S> {
            /// Create a queued state corresponding to the given NFA's start
            /// state.
            fn start(nfa: &NFA<S>) -> QueuedState<S> {
                let match_at_depth =
                    if nfa.start().is_match() { Some(0) } else { None };
                QueuedState { id: nfa.start_id, match_at_depth }
            }

            /// Return the next state to queue up. The given id must be a state
            /// corresponding to a single transition from this queued state.
            fn next_queued_state(
                &self,
                nfa: &NFA<S>,
                id: S,
            ) -> QueuedState<S> {
                let match_at_depth = self.next_match_at_depth(nfa, id);
                QueuedState { id, match_at_depth }
            }

            /// Return the earliest depth at which a match has occurred for
            /// the given state. The given state must correspond to a single
            /// transition from this queued state.
            fn next_match_at_depth(
                &self,
                nfa: &NFA<S>,
                next: S,
            ) -> Option<usize> {
                // This is a little tricky. If the previous state has already
                // seen a match or if `next` isn't a match state, then nothing
                // needs to change since a later state cannot find an earlier
                // match.
                match self.match_at_depth {
                    Some(x) => return Some(x),
                    None if nfa.state(next).is_match() => {}
                    None => return None,
                }
                // `next` is the first match state on this path, so the match
                // began `longest_match_len` bytes back from `next`'s depth
                // (inclusive, hence the +1).
                let depth = nfa.state(next).depth
                    - nfa.state(next).get_longest_match_len().unwrap()
                    + 1;
                Some(depth)
            }
        }

        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue: VecDeque<QueuedState<S>> = VecDeque::new();
        let mut seen = self.queued_set();
        let start = QueuedState::start(&self.nfa);
        for b in AllBytesIter::new() {
            let next_id = self.nfa.start().next_state(b);
            if next_id != start.id {
                let next = start.next_queued_state(&self.nfa, next_id);
                if !seen.contains(next.id) {
                    queue.push_back(next);
                    seen.insert(next.id);
                }
                // If a state immediately following the start state is a match
                // state, then we never want to follow its failure transition
                // since the failure transition necessarily leads back to the
                // start state, which we never want to do for leftmost matching
                // after a match has been found.
                //
                // N.B. This is a special case of the more general handling
                // found below.
                if self.nfa.state(next_id).is_match() {
                    self.nfa.state_mut(next_id).fail = dead_id();
                }
            }
        }
        while let Some(item) = queue.pop_front() {
            let mut any_trans = false;
            let mut it = self.nfa.iter_transitions_mut(item.id);
            while let Some((b, next_id)) = it.next() {
                any_trans = true;

                // Queue up the next state.
                let next = item.next_queued_state(it.nfa(), next_id);
                if seen.contains(next.id) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(next);
                seen.insert(next.id);

                // Find the failure state for next. Same as standard.
                let mut fail = it.nfa().state(item.id).fail;
                while it.nfa().state(fail).next_state(b) == fail_id() {
                    fail = it.nfa().state(fail).fail;
                }
                fail = it.nfa().state(fail).next_state(b);

                // This is the key difference from the standard formulation.
                // Namely, if we've seen a match, then we only want a failure
                // transition if the failure transition preserves the match
                // we've seen. In general, this is not true of all failure
                // transitions since they can point back to any suffix of what
                // we've seen so far. Instead, we only want to point back to
                // suffixes that contain any match we've seen.
                //
                // We achieve this by comparing the depth of the failure
                // transition with the number of states between this state
                // and the beginning of the earliest match detected. If the
                // depth of the failure state is smaller than this difference,
                // then it cannot contain the match. If it's bigger or equal
                // to the difference, then it necessarily includes the match
                // we've seen since all failure transitions correspond to a
                // suffix.
                //
                // If we've determined that we don't want the failure
                // transition, then we set this state's failure transition to
                // the dead state. In other words, when a search hits this
                // state, it will not continue and correctly stop. (N.B. A
                // dead state is different than a fail state. A dead state
                // MUST be preceded by a match and acts as a sentinel to search
                // routines to terminate.)
                //
                // Understanding this is tricky, and it took me several days
                // to think through this and get it right. If you want to grok
                // it, then I'd recommend: 1) switch the implementation to
                // always use the standard algorithm for filling in failure
                // transitions, 2) run the test suite and 3) examine the test
                // failures. Write out the automatons for them and try to work
                // backwards by figuring out which failure transitions should
                // be removed. You should arrive at the same rule used below.
                if let Some(match_depth) = next.match_at_depth {
                    let fail_depth = it.nfa().state(fail).depth;
                    let next_depth = it.nfa().state(next.id).depth;
                    if next_depth - match_depth + 1 > fail_depth {
                        it.nfa().state_mut(next.id).fail = dead_id();
                        continue;
                    }
                    assert_ne!(
                        start.id,
                        it.nfa().state(next.id).fail,
                        "states that are match states or follow match \
                         states should never have a failure transition \
                         back to the start state in leftmost searching",
                    );
                }
                it.nfa().state_mut(next.id).fail = fail;
                it.nfa().copy_matches(fail, next.id);
            }
            // If there are no transitions for this state and if it's a match
            // state, then we must set its failure transition to the dead
            // state since we never want it to restart the search.
            if !any_trans && it.nfa().state(item.id).is_match() {
                it.nfa().state_mut(item.id).fail = dead_id();
            }
            // We don't need to copy empty matches from the start state here
            // because that's only necessary for overlapping matches and
            // leftmost match kinds don't support overlapping matches.
        }
    }
1103 
1104     /// Returns a set that tracked queued states.
1105     ///
1106     /// This is only necessary when ASCII case insensitivity is enabled, since
1107     /// it is the only way to visit the same state twice. Otherwise, this
    /// returns an inert set that never adds anything and always reports
1109     /// `false` for every member test.
queued_set(&self) -> QueuedSet<S>1110     fn queued_set(&self) -> QueuedSet<S> {
1111         if self.builder.ascii_case_insensitive {
1112             QueuedSet::active()
1113         } else {
1114             QueuedSet::inert()
1115         }
1116     }
1117 
1118     /// Set the failure transitions on the start state to loop back to the
1119     /// start state. This effectively permits the Aho-Corasick automaton to
1120     /// match at any position. This is also required for finding the next
1121     /// state to terminate, namely, finding the next state should never return
1122     /// a fail_id.
1123     ///
1124     /// This must be done after building the initial trie, since trie
1125     /// construction depends on transitions to `fail_id` to determine whether a
1126     /// state already exists or not.
add_start_state_loop(&mut self)1127     fn add_start_state_loop(&mut self) {
1128         let start_id = self.nfa.start_id;
1129         let start = self.nfa.start_mut();
1130         for b in AllBytesIter::new() {
1131             if start.next_state(b) == fail_id() {
1132                 start.set_next_state(b, start_id);
1133             }
1134         }
1135     }
1136 
1137     /// Remove the start state loop by rewriting any transitions on the start
1138     /// state back to the start state with transitions to the dead state.
1139     ///
1140     /// The loop is only closed when two conditions are met: the start state
1141     /// is a match state and the match kind is leftmost-first or
1142     /// leftmost-longest. (Alternatively, if this is an anchored automaton,
1143     /// then the start state is always closed, regardless of aforementioned
1144     /// conditions.)
1145     ///
1146     /// The reason for this is that under leftmost semantics, a start state
1147     /// that is also a match implies that we should never restart the search
1148     /// process. We allow normal transitions out of the start state, but if
1149     /// none exist, we transition to the dead state, which signals that
1150     /// searching should stop.
close_start_state_loop(&mut self)1151     fn close_start_state_loop(&mut self) {
1152         if self.builder.anchored
1153             || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
1154         {
1155             let start_id = self.nfa.start_id;
1156             let start = self.nfa.start_mut();
1157             for b in AllBytesIter::new() {
1158                 if start.next_state(b) == start_id {
1159                     start.set_next_state(b, dead_id());
1160                 }
1161             }
1162         }
1163     }
1164 
1165     /// Sets all transitions on the dead state to point back to the dead state.
1166     /// Normally, missing transitions map back to the failure state, but the
1167     /// point of the dead state is to act as a sink that can never be escaped.
add_dead_state_loop(&mut self)1168     fn add_dead_state_loop(&mut self) {
1169         let dead = self.nfa.state_mut(dead_id());
1170         for b in AllBytesIter::new() {
1171             dead.set_next_state(b, dead_id());
1172         }
1173     }
1174 
1175     /// Computes the total amount of heap used by this NFA in bytes.
calculate_size(&mut self)1176     fn calculate_size(&mut self) {
1177         let mut size = 0;
1178         for state in &self.nfa.states {
1179             size += state.heap_bytes();
1180         }
1181         self.nfa.heap_bytes = size;
1182     }
1183 
1184     /// Add a new state to the underlying NFA with the given depth. The depth
1185     /// is used to determine how to represent the transitions.
1186     ///
1187     /// If adding the new state would overflow the chosen state ID
1188     /// representation, then this returns an error.
add_state(&mut self, depth: usize) -> Result<S>1189     fn add_state(&mut self, depth: usize) -> Result<S> {
1190         if depth < self.builder.dense_depth {
1191             self.nfa.add_dense_state(depth)
1192         } else {
1193             self.nfa.add_sparse_state(depth)
1194         }
1195     }
1196 
    /// Returns the match kind configured on the underlying builder.
    ///
    /// This is a convenience accessor used during construction, e.g., when
    /// deciding whether to close the start state loop.
    fn match_kind(&self) -> MatchKind {
        self.builder.match_kind
    }
1201 }
1202 
1203 /// A set of state identifiers used to avoid revisiting the same state multiple
1204 /// times when filling in failure transitions.
1205 ///
1206 /// This set has an "inert" and an "active" mode. When inert, the set never
1207 /// stores anything and always returns `false` for every member test. This is
1208 /// useful to avoid the performance and memory overhead of maintaining this
1209 /// set when it is not needed.
1210 #[derive(Debug)]
1211 struct QueuedSet<S> {
1212     set: Option<BTreeSet<S>>,
1213 }
1214 
1215 impl<S: StateID> QueuedSet<S> {
1216     /// Return an inert set that returns `false` for every state ID membership
1217     /// test.
inert() -> QueuedSet<S>1218     fn inert() -> QueuedSet<S> {
1219         QueuedSet { set: None }
1220     }
1221 
1222     /// Return an active set that tracks state ID membership.
active() -> QueuedSet<S>1223     fn active() -> QueuedSet<S> {
1224         QueuedSet { set: Some(BTreeSet::new()) }
1225     }
1226 
1227     /// Inserts the given state ID into this set. (If the set is inert, then
1228     /// this is a no-op.)
insert(&mut self, state_id: S)1229     fn insert(&mut self, state_id: S) {
1230         if let Some(ref mut set) = self.set {
1231             set.insert(state_id);
1232         }
1233     }
1234 
1235     /// Returns true if and only if the given state ID is in this set. If the
1236     /// set is inert, this always returns false.
contains(&self, state_id: S) -> bool1237     fn contains(&self, state_id: S) -> bool {
1238         match self.set {
1239             None => false,
1240             Some(ref set) => set.contains(&state_id),
1241         }
1242     }
1243 }
1244 
/// An iterator over every byte value.
///
/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
/// better in debug builds.
///
/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
/// once our MSRV is Rust 1.26 or newer.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // The counter is a u16 so that it can hold 256, which is one past
        // the final byte value and marks exhaustion.
        if self.0 < 256 {
            let byte = self.0 as u8;
            self.0 += 1;
            Some(byte)
        } else {
            None
        }
    }
}
1275 
impl<S: StateID> fmt::Debug for NFA<S> {
    // Renders the automaton one state per line: its outgoing transitions,
    // the pattern IDs it matches, its failure state and its depth.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "NFA(")?;
        writeln!(f, "match_kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter)?;
        writeln!(f, "{}", "-".repeat(79))?;
        for (id, s) in self.states.iter().enumerate() {
            let mut trans = vec![];
            s.trans.iter(|byte, next| {
                // The start state has a bunch of uninteresting transitions
                // back into itself. It's questionable to hide them since they
                // are critical to understanding the automaton, but they are
                // very noisy without better formatting for contiguous ranges
                // to the same state.
                if id == self.start_id.to_usize() && next == self.start_id {
                    return;
                }
                // Similarly, the dead state has a bunch of uninteresting
                // transitions too.
                if id == dead_id() {
                    return;
                }
                trans.push(format!("{} => {}", escape(byte), next.to_usize()));
            });
            writeln!(f, "{:04}: {}", id, trans.join(", "))?;

            let matches: Vec<String> = s
                .matches
                .iter()
                .map(|&(pattern_id, _)| pattern_id.to_string())
                .collect();
            writeln!(f, "  matches: {}", matches.join(", "))?;
            writeln!(f, "     fail: {}", s.fail.to_usize())?;
            writeln!(f, "    depth: {}", s.depth)?;
        }
        writeln!(f, "{}", "-".repeat(79))?;
        writeln!(f, ")")?;
        Ok(())
    }
}
1316 
1317 /// Iterate over all possible byte transitions given a sparse set.
sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F)1318 fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
1319     let mut byte = 0u16;
1320     for &(b, id) in trans {
1321         while byte < (b as u16) {
1322             f(byte as u8, fail_id());
1323             byte += 1;
1324         }
1325         f(b, id);
1326         byte += 1;
1327     }
1328     for b in byte..256 {
1329         f(b as u8, fail_id());
1330     }
1331 }
1332 
/// Safely return two mutable borrows to two different locations in the given
/// slice.
///
/// This panics if i == j.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    // Split at the larger index so that the two positions land in disjoint
    // halves, then hand the borrows back in (i, j) order.
    let (lo, hi) = (cmp::min(i, j), cmp::max(i, j));
    let (head, tail) = xs.split_at_mut(hi);
    if i < j {
        (&mut head[lo], &mut tail[0])
    } else {
        (&mut tail[0], &mut head[lo])
    }
}
1347 
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;

    // `escape_default` only ever yields ASCII bytes, so each one converts
    // to a `char` directly.
    ascii::escape_default(b).map(|b| b as char).collect()
}
1354 
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): this is a developer scratch pad rather than a real
    // assertion-based test — it only prints the built automaton for manual
    // inspection and asserts nothing beyond `build` succeeding. The
    // commented-out lines are alternate match kinds and pattern sets kept
    // for quick experimentation; consider moving real coverage into the
    // crate's proper test suite.
    #[test]
    fn scratch() {
        let nfa: NFA<usize> = Builder::new()
            .dense_depth(0)
            // .match_kind(MatchKind::LeftmostShortest)
            // .match_kind(MatchKind::LeftmostLongest)
            .match_kind(MatchKind::LeftmostFirst)
            // .build(&["abcd", "ce", "b"])
            // .build(&["ab", "bc"])
            // .build(&["b", "bcd", "ce"])
            // .build(&["abc", "bx"])
            // .build(&["abc", "bd", "ab"])
            // .build(&["abcdefghi", "hz", "abcdefgh"])
            // .build(&["abcd", "bce", "b"])
            .build(&["abcdefg", "bcde", "bcdef"])
            .unwrap();
        println!("{:?}", nfa);
    }
}
1378