1 use std::cmp;
2 use std::collections::{BTreeSet, VecDeque};
3 use std::fmt;
4 use std::mem::size_of;
5 use std::ops::{Index, IndexMut};
6
7 use crate::ahocorasick::MatchKind;
8 use crate::automaton::Automaton;
9 use crate::classes::{ByteClassBuilder, ByteClasses};
10 use crate::error::Result;
11 use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj};
12 use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID};
13 use crate::Match;
14
15 /// The identifier for a pattern, which is simply the position of the pattern
16 /// in the sequence of patterns given by the caller.
17 pub type PatternID = usize;
18
19 /// The length of a pattern, in bytes.
20 pub type PatternLength = usize;
21
/// An Aho-Corasick automaton, represented as an NFA.
///
/// This is the classical formulation of Aho-Corasick, which involves building
/// up a prefix trie of a given set of patterns, and then wiring up failure
/// transitions between states in order to guarantee linear time matching. The
/// standard formulation is, technically, an NFA because of these failure
/// transitions. That is, one can see them as enabling the automaton to be in
/// multiple states at once. Indeed, during search, it is possible to check
/// the transitions on multiple states for a single input byte.
///
/// This particular implementation not only supports the standard style of
/// matching, but also provides a mode for choosing leftmost-first or
/// leftmost-longest match semantics. When a leftmost mode is chosen, some
/// failure transitions that would otherwise be added are elided. See
/// the documentation of `MatchKind` for more details and examples on how the
/// match semantics may differ.
///
/// If one wants a DFA, then it is necessary to first build an NFA and convert
/// it into a DFA. Note, however, that because we've constrained ourselves to
/// matching literal patterns, this does not need to use subset construction
/// for determinization. Instead, the DFA has at most a number of states
/// equivalent to the number of NFA states. The only real difference between
/// them is that all failure transitions are followed and pre-computed. This
/// uses much more memory, but also executes searches more quickly.
///
/// The type parameter `S` is the state identifier representation (see
/// `StateID`), which determines how many states the automaton may contain.
#[derive(Clone)]
pub struct NFA<S> {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// The start state id as an index into `states`.
    start_id: S,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when searching
    /// on streams.
    max_pattern_len: usize,
    /// The total number of patterns added to this automaton, including
    /// patterns that may never be matched.
    pattern_count: usize,
    /// The number of bytes of heap used by this NFA's transition table.
    heap_bytes: usize,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<PrefilterObj>,
    /// Whether this automaton anchors all matches to the start of input.
    anchored: bool,
    /// A set of equivalence classes in terms of bytes. We compute this while
    /// building the NFA, but don't use it in the NFA's states. Instead, we
    /// use this for building the DFA. We store it on the NFA since it's easy
    /// to compute while visiting the patterns.
    byte_classes: ByteClasses,
    /// A set of states. Each state defines its own transitions, a fail
    /// transition and a set of indices corresponding to matches.
    ///
    /// The first state is always the fail state, which is used only as a
    /// sentinel. Namely, in the final NFA, no transition into the fail state
    /// exists. (Well, they do, but they aren't followed. Instead, the state's
    /// failure transition is followed.)
    ///
    /// The second state (index 1) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest} match
    /// semantics are enabled. Specifically, they instruct search to stop
    /// at specific points in order to report the correct match location. In
    /// the standard Aho-Corasick construction, there are no transitions to
    /// the dead state.
    ///
    /// The third state (index 2) is generally intended to be the starting or
    /// "root" state.
    states: Vec<State<S>>,
}
89
90 impl<S: StateID> NFA<S> {
91 /// Returns the equivalence classes of bytes found while constructing
92 /// this NFA.
93 ///
94 /// Note that the NFA doesn't actually make use of these equivalence
95 /// classes. Instead, these are useful for building the DFA when desired.
byte_classes(&self) -> &ByteClasses96 pub fn byte_classes(&self) -> &ByteClasses {
97 &self.byte_classes
98 }
99
100 /// Returns a prefilter, if one exists.
prefilter_obj(&self) -> Option<&PrefilterObj>101 pub fn prefilter_obj(&self) -> Option<&PrefilterObj> {
102 self.prefilter.as_ref()
103 }
104
105 /// Returns the total number of heap bytes used by this NFA's transition
106 /// table.
heap_bytes(&self) -> usize107 pub fn heap_bytes(&self) -> usize {
108 self.heap_bytes
109 + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes())
110 }
111
112 /// Return the length of the longest pattern in this automaton.
max_pattern_len(&self) -> usize113 pub fn max_pattern_len(&self) -> usize {
114 self.max_pattern_len
115 }
116
117 /// Return the total number of patterns added to this automaton.
pattern_count(&self) -> usize118 pub fn pattern_count(&self) -> usize {
119 self.pattern_count
120 }
121
122 /// Returns the total number of states in this NFA.
state_len(&self) -> usize123 pub fn state_len(&self) -> usize {
124 self.states.len()
125 }
126
127 /// Returns the matches for the given state.
matches(&self, id: S) -> &[(PatternID, PatternLength)]128 pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] {
129 &self.states[id.to_usize()].matches
130 }
131
132 /// Returns an iterator over all transitions in the given state according
133 /// to the given equivalence classes, including transitions to `fail_id()`.
134 /// The number of transitions returned is always equivalent to the number
135 /// of equivalence classes.
iter_all_transitions<F: FnMut(u8, S)>( &self, byte_classes: &ByteClasses, id: S, f: F, )136 pub fn iter_all_transitions<F: FnMut(u8, S)>(
137 &self,
138 byte_classes: &ByteClasses,
139 id: S,
140 f: F,
141 ) {
142 self.states[id.to_usize()].trans.iter_all(byte_classes, f);
143 }
144
145 /// Returns the failure transition for the given state.
failure_transition(&self, id: S) -> S146 pub fn failure_transition(&self, id: S) -> S {
147 self.states[id.to_usize()].fail
148 }
149
150 /// Returns the next state for the given state and input byte.
151 ///
152 /// Note that this does not follow failure transitions. As such, the id
153 /// returned may be `fail_id`.
next_state(&self, current: S, input: u8) -> S154 pub fn next_state(&self, current: S, input: u8) -> S {
155 self.states[current.to_usize()].next_state(input)
156 }
157
state(&self, id: S) -> &State<S>158 fn state(&self, id: S) -> &State<S> {
159 &self.states[id.to_usize()]
160 }
161
state_mut(&mut self, id: S) -> &mut State<S>162 fn state_mut(&mut self, id: S) -> &mut State<S> {
163 &mut self.states[id.to_usize()]
164 }
165
start(&self) -> &State<S>166 fn start(&self) -> &State<S> {
167 self.state(self.start_id)
168 }
169
start_mut(&mut self) -> &mut State<S>170 fn start_mut(&mut self) -> &mut State<S> {
171 let id = self.start_id;
172 self.state_mut(id)
173 }
174
iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S>175 fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> {
176 IterTransitionsMut::new(self, id)
177 }
178
copy_matches(&mut self, src: S, dst: S)179 fn copy_matches(&mut self, src: S, dst: S) {
180 let (src, dst) =
181 get_two_mut(&mut self.states, src.to_usize(), dst.to_usize());
182 dst.matches.extend_from_slice(&src.matches);
183 }
184
copy_empty_matches(&mut self, dst: S)185 fn copy_empty_matches(&mut self, dst: S) {
186 let start_id = self.start_id;
187 self.copy_matches(start_id, dst);
188 }
189
add_dense_state(&mut self, depth: usize) -> Result<S>190 fn add_dense_state(&mut self, depth: usize) -> Result<S> {
191 let trans = Transitions::Dense(Dense::new());
192 let id = usize_to_state_id(self.states.len())?;
193 self.states.push(State {
194 trans,
195 // Anchored automatons do not have any failure transitions.
196 fail: if self.anchored { dead_id() } else { self.start_id },
197 depth,
198 matches: vec![],
199 });
200 Ok(id)
201 }
202
add_sparse_state(&mut self, depth: usize) -> Result<S>203 fn add_sparse_state(&mut self, depth: usize) -> Result<S> {
204 let trans = Transitions::Sparse(vec![]);
205 let id = usize_to_state_id(self.states.len())?;
206 self.states.push(State {
207 trans,
208 // Anchored automatons do not have any failure transitions.
209 fail: if self.anchored { dead_id() } else { self.start_id },
210 depth,
211 matches: vec![],
212 });
213 Ok(id)
214 }
215 }
216
217 impl<S: StateID> Automaton for NFA<S> {
218 type ID = S;
219
match_kind(&self) -> &MatchKind220 fn match_kind(&self) -> &MatchKind {
221 &self.match_kind
222 }
223
anchored(&self) -> bool224 fn anchored(&self) -> bool {
225 self.anchored
226 }
227
prefilter(&self) -> Option<&dyn Prefilter>228 fn prefilter(&self) -> Option<&dyn Prefilter> {
229 self.prefilter.as_ref().map(|p| p.as_ref())
230 }
231
start_state(&self) -> S232 fn start_state(&self) -> S {
233 self.start_id
234 }
235
is_valid(&self, id: S) -> bool236 fn is_valid(&self, id: S) -> bool {
237 id.to_usize() < self.states.len()
238 }
239
is_match_state(&self, id: S) -> bool240 fn is_match_state(&self, id: S) -> bool {
241 self.states[id.to_usize()].is_match()
242 }
243
get_match( &self, id: S, match_index: usize, end: usize, ) -> Option<Match>244 fn get_match(
245 &self,
246 id: S,
247 match_index: usize,
248 end: usize,
249 ) -> Option<Match> {
250 let state = match self.states.get(id.to_usize()) {
251 None => return None,
252 Some(state) => state,
253 };
254 state.matches.get(match_index).map(|&(id, len)| Match {
255 pattern: id,
256 len,
257 end,
258 })
259 }
260
match_count(&self, id: S) -> usize261 fn match_count(&self, id: S) -> usize {
262 self.states[id.to_usize()].matches.len()
263 }
264
next_state(&self, mut current: S, input: u8) -> S265 fn next_state(&self, mut current: S, input: u8) -> S {
266 // This terminates since:
267 //
268 // 1. `State.fail` never points to fail_id().
269 // 2. All `State.fail` values point to a state closer to `start`.
270 // 3. The start state has no transitions to fail_id().
271 loop {
272 let state = &self.states[current.to_usize()];
273 let next = state.next_state(input);
274 if next != fail_id() {
275 return next;
276 }
277 current = state.fail;
278 }
279 }
280 }
281
/// A representation of an NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte,
/// the matches implied by visiting this state (if any) and the depth of this
/// state. The depth of a state is simply the distance from it to the start
/// state in the automaton, where the depth of the start state is 0.
#[derive(Clone, Debug)]
pub struct State<S> {
    /// The transitions out of this state.
    trans: Transitions<S>,
    /// The state to fall back to when no transition exists for the current
    /// input byte.
    fail: S,
    /// The (pattern id, pattern length) pairs for every pattern that ends at
    /// this state.
    matches: Vec<(PatternID, PatternLength)>,
    // TODO: Strictly speaking, this isn't needed for searching. It's only
    // used when building an NFA that supports leftmost match semantics. We
    // could drop this from the state and dynamically build a map only when
    // computing failure transitions, but it's not clear which is better.
    // Benchmark this.
    /// The distance, in transitions, from this state to the start state.
    depth: usize,
}
301
302 impl<S: StateID> State<S> {
heap_bytes(&self) -> usize303 fn heap_bytes(&self) -> usize {
304 self.trans.heap_bytes()
305 + (self.matches.len() * size_of::<(PatternID, PatternLength)>())
306 }
307
add_match(&mut self, i: PatternID, len: PatternLength)308 fn add_match(&mut self, i: PatternID, len: PatternLength) {
309 self.matches.push((i, len));
310 }
311
is_match(&self) -> bool312 fn is_match(&self) -> bool {
313 !self.matches.is_empty()
314 }
315
next_state(&self, input: u8) -> S316 fn next_state(&self, input: u8) -> S {
317 self.trans.next_state(input)
318 }
319
set_next_state(&mut self, input: u8, next: S)320 fn set_next_state(&mut self, input: u8, next: S) {
321 self.trans.set_next_state(input, next);
322 }
323 }
324
325 /// Represents the transitions for a single dense state.
326 ///
327 /// The primary purpose here is to encapsulate index access. Namely, since a
328 /// dense representation always contains 256 elements, all values of `u8` are
329 /// valid indices.
330 #[derive(Clone, Debug)]
331 struct Dense<S>(Vec<S>);
332
333 impl<S> Dense<S>
334 where
335 S: StateID,
336 {
new() -> Self337 fn new() -> Self {
338 Dense(vec![fail_id(); 256])
339 }
340
341 #[inline]
len(&self) -> usize342 fn len(&self) -> usize {
343 self.0.len()
344 }
345 }
346
impl<S> Index<u8> for Dense<S> {
    type Output = S;

    #[inline]
    fn index(&self, i: u8) -> &S {
        // All dense transition tables have exactly 256 elements (one per
        // possible byte value), so every u8 index is in bounds and this
        // bounds-checked access can never panic.
        &self.0[i as usize]
    }
}
357
impl<S> IndexMut<u8> for Dense<S> {
    #[inline]
    fn index_mut(&mut self, i: u8) -> &mut S {
        // All dense transition tables have exactly 256 elements (one per
        // possible byte value), so every u8 index is in bounds and this
        // bounds-checked access can never panic.
        &mut self.0[i as usize]
    }
}
366
/// A representation of transitions in an NFA.
///
/// Transitions have either a sparse representation, which is slower for
/// lookups but uses less memory, or a dense representation, which is faster
/// for lookups but uses more memory. In the sparse representation, the absence
/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()`
/// are still explicitly represented.
///
/// For the NFA, by default, we use a dense representation for transitions for
/// states close to the start state because it's likely these are the states
/// that will be most frequently visited.
#[derive(Clone, Debug)]
enum Transitions<S> {
    /// A list of (byte, next state id) pairs, kept sorted by byte.
    Sparse(Vec<(u8, S)>),
    /// A table of 256 next state ids, one per possible byte value.
    Dense(Dense<S>),
}
383
384 impl<S: StateID> Transitions<S> {
heap_bytes(&self) -> usize385 fn heap_bytes(&self) -> usize {
386 match *self {
387 Transitions::Sparse(ref sparse) => {
388 sparse.len() * size_of::<(u8, S)>()
389 }
390 Transitions::Dense(ref dense) => dense.len() * size_of::<S>(),
391 }
392 }
393
next_state(&self, input: u8) -> S394 fn next_state(&self, input: u8) -> S {
395 match *self {
396 Transitions::Sparse(ref sparse) => {
397 for &(b, id) in sparse {
398 if b == input {
399 return id;
400 }
401 }
402 fail_id()
403 }
404 Transitions::Dense(ref dense) => dense[input],
405 }
406 }
407
set_next_state(&mut self, input: u8, next: S)408 fn set_next_state(&mut self, input: u8, next: S) {
409 match *self {
410 Transitions::Sparse(ref mut sparse) => {
411 match sparse.binary_search_by_key(&input, |&(b, _)| b) {
412 Ok(i) => sparse[i] = (input, next),
413 Err(i) => sparse.insert(i, (input, next)),
414 }
415 }
416 Transitions::Dense(ref mut dense) => {
417 dense[input] = next;
418 }
419 }
420 }
421
422 /// Iterate over transitions in this state while skipping over transitions
423 /// to `fail_id()`.
iter<F: FnMut(u8, S)>(&self, mut f: F)424 fn iter<F: FnMut(u8, S)>(&self, mut f: F) {
425 match *self {
426 Transitions::Sparse(ref sparse) => {
427 for &(b, id) in sparse {
428 f(b, id);
429 }
430 }
431 Transitions::Dense(ref dense) => {
432 for b in AllBytesIter::new() {
433 let id = dense[b];
434 if id != fail_id() {
435 f(b, id);
436 }
437 }
438 }
439 }
440 }
441
442 /// Iterate over all transitions in this state according to the given
443 /// equivalence classes, including transitions to `fail_id()`.
iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F)444 fn iter_all<F: FnMut(u8, S)>(&self, classes: &ByteClasses, mut f: F) {
445 if classes.is_singleton() {
446 match *self {
447 Transitions::Sparse(ref sparse) => {
448 sparse_iter(sparse, f);
449 }
450 Transitions::Dense(ref dense) => {
451 for b in AllBytesIter::new() {
452 f(b, dense[b]);
453 }
454 }
455 }
456 } else {
457 // In this case, we only want to yield a single byte for each
458 // equivalence class.
459 match *self {
460 Transitions::Sparse(ref sparse) => {
461 let mut last_class = None;
462 sparse_iter(sparse, |b, next| {
463 let class = classes.get(b);
464 if last_class != Some(class) {
465 last_class = Some(class);
466 f(b, next);
467 }
468 })
469 }
470 Transitions::Dense(ref dense) => {
471 for b in classes.representatives() {
472 f(b, dense[b]);
473 }
474 }
475 }
476 }
477 }
478 }
479
/// Iterator over transitions in a state, skipping transitions to `fail_id()`.
///
/// This abstracts over the representation of NFA transitions, which may be
/// either in a sparse or dense representation.
///
/// This somewhat idiosyncratically borrows the NFA mutably, so that when one
/// is iterating over transitions, the caller can still mutate the NFA. This
/// is useful when creating failure transitions.
#[derive(Debug)]
struct IterTransitionsMut<'a, S: StateID> {
    /// The NFA that owns the state being iterated over.
    nfa: &'a mut NFA<S>,
    /// The id of the state whose transitions are yielded.
    state_id: S,
    /// The current position: an index into the sparse transition list, or
    /// the current byte value for a dense table.
    cur: usize,
}
494
impl<'a, S: StateID> IterTransitionsMut<'a, S> {
    /// Create a new transition iterator over the state `state_id` in `nfa`.
    fn new(nfa: &'a mut NFA<S>, state_id: S) -> IterTransitionsMut<'a, S> {
        IterTransitionsMut { nfa, state_id, cur: 0 }
    }

    /// Gives the caller mutable access to the underlying NFA while the
    /// iterator is still alive.
    fn nfa(&mut self) -> &mut NFA<S> {
        self.nfa
    }
}
504
impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> {
    type Item = (u8, S);

    fn next(&mut self) -> Option<(u8, S)> {
        match self.nfa.states[self.state_id.to_usize()].trans {
            Transitions::Sparse(ref sparse) => {
                if self.cur >= sparse.len() {
                    return None;
                }
                let i = self.cur;
                self.cur += 1;
                Some(sparse[i])
            }
            Transitions::Dense(ref dense) => {
                while self.cur < dense.len() {
                    // There are always exactly 256 transitions in the dense
                    // representation, so `cur` always fits in a u8 here.
                    debug_assert!(self.cur < 256);

                    let b = self.cur as u8;
                    let id = dense[b];
                    self.cur += 1;
                    if id != fail_id() {
                        return Some((b, id));
                    }
                }
                None
            }
        }
    }
}
535
/// A simple builder for configuring the NFA construction of Aho-Corasick.
#[derive(Clone, Debug)]
pub struct Builder {
    /// States at a depth up to this value use the dense (faster, larger)
    /// transition representation.
    dense_depth: usize,
    /// The match semantics to build into the automaton.
    match_kind: MatchKind,
    /// Whether to build a prefilter from the patterns.
    prefilter: bool,
    /// Whether all matches are anchored to the start of input.
    anchored: bool,
    /// Whether to match patterns without regard to ASCII case.
    ascii_case_insensitive: bool,
}
545
546 impl Default for Builder {
default() -> Builder547 fn default() -> Builder {
548 Builder {
549 dense_depth: 2,
550 match_kind: MatchKind::default(),
551 prefilter: true,
552 anchored: false,
553 ascii_case_insensitive: false,
554 }
555 }
556 }
557
impl Builder {
    /// Create a new builder with a default configuration.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an NFA from the given iterator of patterns.
    ///
    /// This returns an error if the automaton could not be constructed,
    /// e.g., if the state id type `S` cannot represent all required states.
    pub fn build<I, P, S: StateID>(&self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Compiler::new(self)?.compile(patterns)
    }

    /// Set the match semantics (see `MatchKind`) for the automaton.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Set the depth up to which states use the dense transition
    /// representation; deeper states use the sparse representation.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enable or disable building a prefilter from the patterns.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }

    /// Enable or disable anchored matching, which requires matches to start
    /// at the beginning of the input.
    pub fn anchored(&mut self, yes: bool) -> &mut Builder {
        self.anchored = yes;
        self
    }

    /// Enable or disable ASCII case insensitive matching.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }
}
596
/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks.
#[derive(Debug)]
struct Compiler<'a, S: StateID> {
    /// The configuration guiding compilation.
    builder: &'a Builder,
    /// Accumulates a prefilter from the patterns as they are added.
    prefilter: prefilter::Builder,
    /// The NFA under construction.
    nfa: NFA<S>,
    /// Accumulates byte equivalence classes while visiting the patterns.
    byte_classes: ByteClassBuilder,
}
607
608 impl<'a, S: StateID> Compiler<'a, S> {
    /// Create a new compiler from the given builder configuration.
    ///
    /// This fails only if the state id type `S` cannot represent the start
    /// state's index.
    fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> {
        Ok(Compiler {
            builder,
            prefilter: prefilter::Builder::new(builder.match_kind)
                .ascii_case_insensitive(builder.ascii_case_insensitive),
            nfa: NFA {
                match_kind: builder.match_kind,
                // The start state always lives at index 2, after the fail
                // (0) and dead (1) states.
                start_id: usize_to_state_id(2)?,
                max_pattern_len: 0,
                pattern_count: 0,
                heap_bytes: 0,
                prefilter: None,
                anchored: builder.anchored,
                byte_classes: ByteClasses::singletons(),
                states: vec![],
            },
            byte_classes: ByteClassBuilder::new(),
        })
    }
628
    /// Compile the given patterns into a finished NFA, consuming the
    /// compiler.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA<S>>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        self.add_state(0)?; // the fail state, which is never entered
        self.add_state(0)?; // the dead state, only used for leftmost
        self.add_state(0)?; // the start state
        self.build_trie(patterns)?;
        self.add_start_state_loop();
        self.add_dead_state_loop();
        if !self.builder.anchored {
            // Anchored automatons have no failure transitions, so there is
            // nothing to fill in for them.
            self.fill_failure_transitions();
        }
        self.close_start_state_loop();
        self.nfa.byte_classes = self.byte_classes.build();
        if !self.builder.anchored {
            self.nfa.prefilter = self.prefilter.build();
        }
        self.calculate_size();
        Ok(self.nfa)
    }
651
652 /// This sets up the initial prefix trie that makes up the Aho-Corasick
653 /// automaton. Effectively, it creates the basic structure of the
654 /// automaton, where every pattern given has a path from the start state to
655 /// the end of the pattern.
build_trie<I, P>(&mut self, patterns: I) -> Result<()> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,656 fn build_trie<I, P>(&mut self, patterns: I) -> Result<()>
657 where
658 I: IntoIterator<Item = P>,
659 P: AsRef<[u8]>,
660 {
661 'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() {
662 let pat = pat.as_ref();
663 self.nfa.max_pattern_len =
664 cmp::max(self.nfa.max_pattern_len, pat.len());
665 self.nfa.pattern_count += 1;
666
667 let mut prev = self.nfa.start_id;
668 let mut saw_match = false;
669 for (depth, &b) in pat.iter().enumerate() {
670 // When leftmost-first match semantics are requested, we
671 // specifically stop adding patterns when a previously added
672 // pattern is a prefix of it. We avoid adding it because
673 // leftmost-first semantics imply that the pattern can never
674 // match. This is not just an optimization to save space! It
675 // is necessary for correctness. In fact, this is the only
676 // difference in the automaton between the implementations for
677 // leftmost-first and leftmost-longest.
678 saw_match = saw_match || self.nfa.state(prev).is_match();
679 if self.builder.match_kind.is_leftmost_first() && saw_match {
680 // Skip to the next pattern immediately. This avoids
681 // incorrectly adding a match after this loop terminates.
682 continue 'PATTERNS;
683 }
684
685 // Add this byte to our equivalence classes. We don't use these
686 // for NFA construction. These are instead used only if we're
687 // building a DFA. They would technically be useful for the
688 // NFA, but it would require a second pass over the patterns.
689 self.byte_classes.set_range(b, b);
690 if self.builder.ascii_case_insensitive {
691 let b = opposite_ascii_case(b);
692 self.byte_classes.set_range(b, b);
693 }
694
695 // If the transition from prev using the current byte already
696 // exists, then just move through it. Otherwise, add a new
697 // state. We track the depth here so that we can determine
698 // how to represent transitions. States near the start state
699 // use a dense representation that uses more memory but is
700 // faster. Other states use a sparse representation that uses
701 // less memory but is slower.
702 let next = self.nfa.state(prev).next_state(b);
703 if next != fail_id() {
704 prev = next;
705 } else {
706 let next = self.add_state(depth + 1)?;
707 self.nfa.state_mut(prev).set_next_state(b, next);
708 if self.builder.ascii_case_insensitive {
709 let b = opposite_ascii_case(b);
710 self.nfa.state_mut(prev).set_next_state(b, next);
711 }
712 prev = next;
713 }
714 }
715 // Once the pattern has been added, log the match in the final
716 // state that it reached.
717 self.nfa.state_mut(prev).add_match(pati, pat.len());
718 // ... and hand it to the prefilter builder, if applicable.
719 if self.builder.prefilter {
720 self.prefilter.add(pat);
721 }
722 }
723 Ok(())
724 }
725
726 /// This routine creates failure transitions according to the standard
727 /// textbook formulation of the Aho-Corasick algorithm, with a couple small
728 /// tweaks to support "leftmost" semantics.
729 ///
730 /// Building failure transitions is the most interesting part of building
731 /// the Aho-Corasick automaton, because they are what allow searches to
732 /// be performed in linear time. Specifically, a failure transition is
733 /// a single transition associated with each state that points back to
734 /// the longest proper suffix of the pattern being searched. The failure
735 /// transition is followed whenever there exists no transition on the
736 /// current state for the current input byte. If there is no other proper
737 /// suffix, then the failure transition points back to the starting state.
738 ///
739 /// For example, let's say we built an Aho-Corasick automaton with the
740 /// following patterns: 'abcd' and 'cef'. The trie looks like this:
741 ///
742 /// ```ignore
743 /// a - S1 - b - S2 - c - S3 - d - S4*
744 /// /
745 /// S0 - c - S5 - e - S6 - f - S7*
746 /// ```
747 ///
748 /// At this point, it should be fairly straight-forward to see how this
749 /// trie can be used in a simplistic way. At any given position in the
750 /// text we're searching (called the "subject" string), all we need to do
751 /// is follow the transitions in the trie by consuming one transition for
752 /// each byte in the subject string. If we reach a match state, then we can
753 /// report that location as a match.
754 ///
755 /// The trick comes when searching a subject string like 'abcef'. We'll
756 /// initially follow the transition from S0 to S1 and wind up in S3 after
/// observing the 'c' byte. At this point, the next byte is 'e' but state
758 /// S3 has no transition for 'e', so the search fails. We then would need
759 /// to restart the search at the next position in 'abcef', which
760 /// corresponds to 'b'. The match would fail, but the next search starting
761 /// at 'c' would finally succeed. The problem with this approach is that
762 /// we wind up searching the subject string potentially many times. In
763 /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
764 /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
765 /// like to achieve a `O(n + m)` worst case complexity.
766 ///
767 /// This is where failure transitions come in. Instead of dying at S3 in
768 /// the first search, the automaton can instruct the search to move to
769 /// another part of the automaton that corresponds to a suffix of what
770 /// we've seen so far. Recall that we've seen 'abc' in the subject string,
771 /// and the automaton does indeed have a non-empty suffix, 'c', that could
772 /// potentially lead to another match. Thus, the actual Aho-Corasick
773 /// automaton for our patterns in this case looks like this:
774 ///
775 /// ```ignore
776 /// a - S1 - b - S2 - c - S3 - d - S4*
777 /// / /
778 /// / ----------------
779 /// / /
780 /// S0 - c - S5 - e - S6 - f - S7*
781 /// ```
782 ///
783 /// That is, we have a failure transition from S3 to S5, which is followed
784 /// exactly in cases when we are in state S3 but see any byte other than
785 /// 'd' (that is, we've "failed" to find a match in this portion of our
786 /// trie). We know we can transition back to S5 because we've already seen
787 /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
788 /// with the search starting at S5 and complete our match.
789 ///
790 /// Adding failure transitions to a trie is fairly simple, but subtle. The
/// key issue is that you might have multiple failure transitions that you
792 /// need to follow. For example, look at the trie for the patterns
793 /// 'abcd', 'b', 'bcd' and 'cd':
794 ///
795 /// ```ignore
796 /// - a - S1 - b - S2* - c - S3 - d - S4*
797 /// / / /
798 /// / ------- -------
799 /// / / /
800 /// S0 --- b - S5* - c - S6 - d - S7*
801 /// \ /
802 /// \ --------
803 /// \ /
804 /// - c - S8 - d - S9*
805 /// ```
806 ///
807 /// The failure transitions for this trie are defined from S2 to S5,
808 /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
809 /// corresponds to a match, since its failure transition to S5 is itself
810 /// a match state.
811 ///
/// Perhaps the simplest way to think about adding these failure transitions
813 /// is recursively. That is, if you know the failure transitions for every
814 /// possible previous state that could be visited (e.g., when computing the
815 /// failure transition for S3, you already know the failure transitions
816 /// for S0, S1 and S2), then you can simply follow the failure transition
817 /// of the previous state and check whether the incoming transition is
818 /// defined after following the failure transition.
819 ///
820 /// For example, when determining the failure state for S3, by our
821 /// assumptions, we already know that there is a failure transition from
822 /// S2 (the previous state) to S5. So we follow that transition and check
823 /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
824 /// as there is a transition from S5 to S6 for the byte 'c'. If no such
825 /// transition existed, we could keep following the failure transitions
826 /// until we reach the start state, which is the failure transition for
827 /// every state that has no corresponding proper suffix.
828 ///
829 /// We don't actually use recursion to implement this, but instead, use a
830 /// breadth first search of the automaton. Our base case is the start
831 /// state, whose failure transition is just a transition to itself.
832 ///
833 /// When building a leftmost automaton, we proceed as above, but only
834 /// include a subset of failure transitions. Namely, we omit any failure
835 /// transitions that appear after a match state in the trie. This is
836 /// because failure transitions always point back to a proper suffix of
837 /// what has been seen so far. Thus, following a failure transition after
838 /// a match implies looking for a match that starts after the one that has
839 /// already been seen, which is of course therefore not the leftmost match.
840 ///
841 /// N.B. I came up with this algorithm on my own, and after scouring all of
842 /// the other AC implementations I know of (Perl, Snort, many on GitHub).
843 /// I couldn't find any that implement leftmost semantics like this.
844 /// Perl of course needs leftmost-first semantics, but they implement it
845 /// with a seeming hack at *search* time instead of encoding it into the
846 /// automaton. There are also a couple Java libraries that support leftmost
847 /// longest semantics, but they do it by building a queue of matches at
848 /// search time, which is even worse than what Perl is doing. ---AG
fill_failure_transitions(&mut self)849 fn fill_failure_transitions(&mut self) {
850 let kind = self.match_kind();
851 // Initialize the queue for breadth first search with all transitions
852 // out of the start state. We handle the start state specially because
853 // we only want to follow non-self transitions. If we followed self
854 // transitions, then this would never terminate.
855 let mut queue = VecDeque::new();
856 let mut seen = self.queued_set();
857 let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id);
858 while let Some((_, next)) = it.next() {
859 // Skip anything we've seen before and any self-transitions on the
860 // start state.
861 if next == it.nfa().start_id || seen.contains(next) {
862 continue;
863 }
864 queue.push_back(next);
865 seen.insert(next);
866 // Under leftmost semantics, if a state immediately following
867 // the start state is a match state, then we never want to
868 // follow its failure transition since the failure transition
869 // necessarily leads back to the start state, which we never
870 // want to do for leftmost matching after a match has been
871 // found.
872 //
873 // We apply the same logic to non-start states below as well.
874 if kind.is_leftmost() && it.nfa().state(next).is_match() {
875 it.nfa().state_mut(next).fail = dead_id();
876 }
877 }
878 while let Some(id) = queue.pop_front() {
879 let mut it = self.nfa.iter_transitions_mut(id);
880 while let Some((b, next)) = it.next() {
881 if seen.contains(next) {
882 // The only way to visit a duplicate state in a transition
883 // list is when ASCII case insensitivity is enabled. In
884 // this case, we want to skip it since it's redundant work.
885 // But it would also end up duplicating matches, which
886 // results in reporting duplicate matches in some cases.
887 // See the 'acasei010' regression test.
888 continue;
889 }
890 queue.push_back(next);
891 seen.insert(next);
892
893 // As above for start states, under leftmost semantics, once
894 // we see a match all subsequent states should have no failure
895 // transitions because failure transitions always imply looking
896 // for a match that is a suffix of what has been seen so far
897 // (where "seen so far" corresponds to the string formed by
898 // following the transitions from the start state to the
899 // current state). Under leftmost semantics, we specifically do
900 // not want to allow this to happen because we always want to
901 // report the match found at the leftmost position.
902 //
903 // The difference between leftmost-first and leftmost-longest
904 // occurs previously while we build the trie. For
905 // leftmost-first, we simply omit any entries that would
906 // otherwise require passing through a match state.
907 //
908 // Note that for correctness, the failure transition has to be
909 // set to the dead state for ALL states following a match, not
910 // just the match state itself. However, by setting the failure
911 // transition to the dead state on all match states, the dead
912 // state will automatically propagate to all subsequent states
913 // via the failure state computation below.
914 if kind.is_leftmost() && it.nfa().state(next).is_match() {
915 it.nfa().state_mut(next).fail = dead_id();
916 continue;
917 }
918 let mut fail = it.nfa().state(id).fail;
919 while it.nfa().state(fail).next_state(b) == fail_id() {
920 fail = it.nfa().state(fail).fail;
921 }
922 fail = it.nfa().state(fail).next_state(b);
923 it.nfa().state_mut(next).fail = fail;
924 it.nfa().copy_matches(fail, next);
925 }
926 // If the start state is a match state, then this automaton can
927 // match the empty string. This implies all states are match states
928 // since every position matches the empty string, so copy the
929 // matches from the start state to every state. Strictly speaking,
930 // this is only necessary for overlapping matches since each
931 // non-empty non-start match state needs to report empty matches
932 // in addition to its own. For the non-overlapping case, such
933 // states only report the first match, which is never empty since
934 // it isn't a start state.
935 if !kind.is_leftmost() {
936 it.nfa().copy_empty_matches(id);
937 }
938 }
939 }
940
941 /// Returns a set that tracked queued states.
942 ///
943 /// This is only necessary when ASCII case insensitivity is enabled, since
944 /// it is the only way to visit the same state twice. Otherwise, this
945 /// returns an inert set that nevers adds anything and always reports
946 /// `false` for every member test.
queued_set(&self) -> QueuedSet<S>947 fn queued_set(&self) -> QueuedSet<S> {
948 if self.builder.ascii_case_insensitive {
949 QueuedSet::active()
950 } else {
951 QueuedSet::inert()
952 }
953 }
954
955 /// Set the failure transitions on the start state to loop back to the
956 /// start state. This effectively permits the Aho-Corasick automaton to
957 /// match at any position. This is also required for finding the next
958 /// state to terminate, namely, finding the next state should never return
959 /// a fail_id.
960 ///
961 /// This must be done after building the initial trie, since trie
962 /// construction depends on transitions to `fail_id` to determine whether a
963 /// state already exists or not.
add_start_state_loop(&mut self)964 fn add_start_state_loop(&mut self) {
965 let start_id = self.nfa.start_id;
966 let start = self.nfa.start_mut();
967 for b in AllBytesIter::new() {
968 if start.next_state(b) == fail_id() {
969 start.set_next_state(b, start_id);
970 }
971 }
972 }
973
974 /// Remove the start state loop by rewriting any transitions on the start
975 /// state back to the start state with transitions to the dead state.
976 ///
977 /// The loop is only closed when two conditions are met: the start state
978 /// is a match state and the match kind is leftmost-first or
979 /// leftmost-longest. (Alternatively, if this is an anchored automaton,
980 /// then the start state is always closed, regardless of aforementioned
981 /// conditions.)
982 ///
983 /// The reason for this is that under leftmost semantics, a start state
984 /// that is also a match implies that we should never restart the search
985 /// process. We allow normal transitions out of the start state, but if
986 /// none exist, we transition to the dead state, which signals that
987 /// searching should stop.
close_start_state_loop(&mut self)988 fn close_start_state_loop(&mut self) {
989 if self.builder.anchored
990 || (self.match_kind().is_leftmost() && self.nfa.start().is_match())
991 {
992 let start_id = self.nfa.start_id;
993 let start = self.nfa.start_mut();
994 for b in AllBytesIter::new() {
995 if start.next_state(b) == start_id {
996 start.set_next_state(b, dead_id());
997 }
998 }
999 }
1000 }
1001
1002 /// Sets all transitions on the dead state to point back to the dead state.
1003 /// Normally, missing transitions map back to the failure state, but the
1004 /// point of the dead state is to act as a sink that can never be escaped.
add_dead_state_loop(&mut self)1005 fn add_dead_state_loop(&mut self) {
1006 let dead = self.nfa.state_mut(dead_id());
1007 for b in AllBytesIter::new() {
1008 dead.set_next_state(b, dead_id());
1009 }
1010 }
1011
1012 /// Computes the total amount of heap used by this NFA in bytes.
calculate_size(&mut self)1013 fn calculate_size(&mut self) {
1014 let mut size = 0;
1015 for state in &self.nfa.states {
1016 size += size_of::<State<S>>() + state.heap_bytes();
1017 }
1018 self.nfa.heap_bytes = size;
1019 }
1020
1021 /// Add a new state to the underlying NFA with the given depth. The depth
1022 /// is used to determine how to represent the transitions.
1023 ///
1024 /// If adding the new state would overflow the chosen state ID
1025 /// representation, then this returns an error.
add_state(&mut self, depth: usize) -> Result<S>1026 fn add_state(&mut self, depth: usize) -> Result<S> {
1027 if depth < self.builder.dense_depth {
1028 self.nfa.add_dense_state(depth)
1029 } else {
1030 self.nfa.add_sparse_state(depth)
1031 }
1032 }
1033
    /// Returns the match kind configured on the underlying builder.
    ///
    /// Convenience accessor: the match kind determines whether standard or
    /// leftmost failure-transition semantics are used while building the
    /// automaton (see `fill_failure_transitions`).
    fn match_kind(&self) -> MatchKind {
        self.builder.match_kind
    }
1038 }
1039
/// A set of state identifiers used to avoid revisiting the same state multiple
/// times when filling in failure transitions.
///
/// This set has an "inert" and an "active" mode. When inert, the set never
/// stores anything and always returns `false` for every member test. This is
/// useful to avoid the performance and memory overhead of maintaining this
/// set when it is not needed.
#[derive(Debug)]
struct QueuedSet<S> {
    // `None` means the set is inert; `Some` means it is active and records
    // every inserted state ID.
    set: Option<BTreeSet<S>>,
}
1051
1052 impl<S: StateID> QueuedSet<S> {
1053 /// Return an inert set that returns `false` for every state ID membership
1054 /// test.
inert() -> QueuedSet<S>1055 fn inert() -> QueuedSet<S> {
1056 QueuedSet { set: None }
1057 }
1058
1059 /// Return an active set that tracks state ID membership.
active() -> QueuedSet<S>1060 fn active() -> QueuedSet<S> {
1061 QueuedSet { set: Some(BTreeSet::new()) }
1062 }
1063
1064 /// Inserts the given state ID into this set. (If the set is inert, then
1065 /// this is a no-op.)
insert(&mut self, state_id: S)1066 fn insert(&mut self, state_id: S) {
1067 if let Some(ref mut set) = self.set {
1068 set.insert(state_id);
1069 }
1070 }
1071
1072 /// Returns true if and only if the given state ID is in this set. If the
1073 /// set is inert, this always returns false.
contains(&self, state_id: S) -> bool1074 fn contains(&self, state_id: S) -> bool {
1075 match self.set {
1076 None => false,
1077 Some(ref set) => set.contains(&state_id),
1078 }
1079 }
1080 }
1081
/// An iterator over every byte value.
///
/// We use this instead of (0..256).map(|b| b as u8) because this optimizes
/// better in debug builds.
///
/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and
/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this
/// once our MSRV is Rust 1.26 or newer.
#[derive(Debug)]
struct AllBytesIter(u16);

impl AllBytesIter {
    /// Create an iterator positioned at byte value 0.
    fn new() -> AllBytesIter {
        AllBytesIter(0)
    }
}

impl Iterator for AllBytesIter {
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        // The counter is a u16 so it can represent 256, the "done" value
        // one past the last byte.
        if self.0 < 256 {
            let byte = self.0 as u8;
            self.0 += 1;
            Some(byte)
        } else {
            None
        }
    }
}
1112
1113 impl<S: StateID> fmt::Debug for NFA<S> {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1114 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1115 writeln!(f, "NFA(")?;
1116 writeln!(f, "match_kind: {:?}", self.match_kind)?;
1117 writeln!(f, "prefilter: {:?}", self.prefilter)?;
1118 writeln!(f, "{}", "-".repeat(79))?;
1119 for (id, s) in self.states.iter().enumerate() {
1120 let mut trans = vec![];
1121 s.trans.iter(|byte, next| {
1122 // The start state has a bunch of uninteresting transitions
1123 // back into itself. It's questionable to hide them since they
1124 // are critical to understanding the automaton, but they are
1125 // very noisy without better formatting for contiugous ranges
1126 // to the same state.
1127 if id == self.start_id.to_usize() && next == self.start_id {
1128 return;
1129 }
1130 // Similarly, the dead state has a bunch of uninteresting
1131 // transitions too.
1132 if id == dead_id() {
1133 return;
1134 }
1135 trans.push(format!("{} => {}", escape(byte), next.to_usize()));
1136 });
1137 writeln!(f, "{:04}: {}", id, trans.join(", "))?;
1138
1139 let matches: Vec<String> = s
1140 .matches
1141 .iter()
1142 .map(|&(pattern_id, _)| pattern_id.to_string())
1143 .collect();
1144 writeln!(f, " matches: {}", matches.join(", "))?;
1145 writeln!(f, " fail: {}", s.fail.to_usize())?;
1146 writeln!(f, " depth: {}", s.depth)?;
1147 }
1148 writeln!(f, "{}", "-".repeat(79))?;
1149 writeln!(f, ")")?;
1150 Ok(())
1151 }
1152 }
1153
1154 /// Iterate over all possible byte transitions given a sparse set.
sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F)1155 fn sparse_iter<S: StateID, F: FnMut(u8, S)>(trans: &[(u8, S)], mut f: F) {
1156 let mut byte = 0u16;
1157 for &(b, id) in trans {
1158 while byte < (b as u16) {
1159 f(byte as u8, fail_id());
1160 byte += 1;
1161 }
1162 f(b, id);
1163 byte += 1;
1164 }
1165 for b in byte..256 {
1166 f(b as u8, fail_id());
1167 }
1168 }
1169
/// Safely return two mutable borrows to two different locations in the given
/// slice.
///
/// This panics if i == j.
fn get_two_mut<T>(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) {
    assert!(i != j, "{} must not be equal to {}", i, j);
    // Normalize so we always split at the larger index, then restore the
    // caller's (i, j) ordering in the returned pair.
    let (lo, hi) = if i < j { (i, j) } else { (j, i) };
    let (head, tail) = xs.split_at_mut(hi);
    if i < j {
        (&mut head[lo], &mut tail[0])
    } else {
        (&mut tail[0], &mut head[lo])
    }
}
1184
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;

    // escape_default only ever yields printable ASCII bytes, so widening
    // each byte to a char and collecting into a String is lossless.
    ascii::escape_default(b).map(|b| b as char).collect()
}
1191
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): this is a development scratch harness, not a real test.
    // It only prints the constructed automaton via its Debug impl and
    // asserts nothing beyond "construction succeeds". The commented-out
    // builds below are alternative pattern sets used when debugging the
    // leftmost-first automaton construction by hand.
    #[test]
    fn scratch() {
        let nfa: NFA<usize> = Builder::new()
            .dense_depth(0)
            // .match_kind(MatchKind::LeftmostShortest)
            // .match_kind(MatchKind::LeftmostLongest)
            .match_kind(MatchKind::LeftmostFirst)
            // .build(&["abcd", "ce", "b"])
            // .build(&["ab", "bc"])
            // .build(&["b", "bcd", "ce"])
            // .build(&["abc", "bx"])
            // .build(&["abc", "bd", "ab"])
            // .build(&["abcdefghi", "hz", "abcdefgh"])
            // .build(&["abcd", "bce", "b"])
            .build(&["abcdefg", "bcde", "bcdef"])
            .unwrap();
        println!("{:?}", nfa);
    }
}
1215